In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


In [3]:
MAX_SEQUENCE_LENGTH_T = 20 # 20
MAX_SEQUENCE_LENGTH_D = 20 # 80
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

'''
    Configuration
'''
epochs = 100
best_loss = 1
best_epoch = 0
verbose = 0
loss = 1

In [4]:
# Domain to use
DOMAIN = 'eclipse'
'''
    propose_centroid_bert_
    propose_bert_triplet_
    propose_bert_
    baseline_dwen_
    baseline_
'''
METHOD = 'propose_bert_triplet_{}'.format(epochs)
EMBED_METHOD='bert'
ONLY_BUCKETS = False
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Path embeddings
EMBED_DIR='data/embed'
# Save model
SAVE_PATH = '{}_feature@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_feature_@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

In [5]:
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [6]:
from keras_bert import load_vocabulary

token_dict = load_vocabulary(vocab_path)

In [7]:
token_dict['[CLS]'], token_dict['[SEP]']

(101, 102)

In [8]:
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                    token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [9]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=322339), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39545), HTML(value='')))




In [10]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


361006

In [11]:
%%time

experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 31.8 s, sys: 3.7 s, total: 35.5 s
Wall time: 35.6 s


In [12]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=321536), HTML(value='')))




In [13]:
%%time

# path_train='train_chronological', path_test='test_chronological'
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

Reading train data
Reading bug ids
CPU times: user 3min 33s, sys: 33.2 ms, total: 3min 34s
Wall time: 3min 34s


In [14]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

### Visualizing batchs

In [15]:
%%time

batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = experiment.batch_iterator(None, 
                                                                                                      baseline.train_data, 
                                                                                                      baseline.dup_sets_train,
                                                                                                      bug_train_ids,
                                                                                                      batch_size_test, 1,
                                                                                                      issues_by_buckets)

CPU times: user 97.9 ms, sys: 3.97 ms, total: 102 ms
Wall time: 102 ms


#### Title

In [16]:
bug_set = baseline.bug_set


for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    print("########### Batch #############")
    print("Title of anchor {}: {}".format(anchor, bug_set[anchor]['title_bert']))
    print("Title of pos {}: {}".format(pos, bug_set[pos]['title_bert']))
    print("Title of neg {}: {}".format(neg, bug_set[neg]['title_bert']))
    print("Title of anchor {}: {}".format(anchor, bug_set[anchor]['title_word_bert']))
    print("Title of pos {}: {}".format(pos, bug_set[pos]['title_word_bert']))
    print("Title of neg {}: {}".format(neg, bug_set[neg]['title_word_bert']))

########### Batch #############
Title of anchor 35944: [CLS] eclipse crashes on startup ( eclipse / jd ##t ) [SEP]
Title of pos 40728: [CLS] does not install on red ##hat or gen ##to ##o [SEP]
Title of neg 25092: [CLS] detect / warn on possible user ty ##po ##s [SEP]
Title of anchor 35944: [101, 13232, 19119, 2006, 22752, 1006, 13232, 1013, 26219, 2102, 1007, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Title of pos 40728: [101, 2515, 2025, 16500, 2006, 2417, 12707, 2030, 8991, 3406, 2080, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Title of neg 25092: [101, 11487, 1013, 11582, 2006, 2825, 5310, 5939, 6873, 2015, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
########### Batch #############
Title of anchor 30432: [CLS] need a way to des ##act ##ivate ct ##rl + mouse

In [17]:
bug_set = baseline.bug_set

anchor_neg_equal_desc = 0
pos_neg_equal_desc = 0
batch_equal_desc = 0
anchor_pos_equal_desc = 0

for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    if np.array_equal(bug_set[anchor]['title_word_bert'], bug_set[neg]['title_word_bert']):
        anchor_neg_equal_desc +=1
    if np.array_equal(bug_set[pos]['title_word_bert'], bug_set[neg]['title_word_bert']):
        pos_neg_equal_desc +=1
    if np.array_equal(bug_set[anchor]['title_word_bert'], bug_set[pos]['title_word_bert']):
        anchor_pos_equal_desc +=1
    if np.array_equal(bug_set[anchor]['title_word_bert'], bug_set[pos]['title_word_bert']) and \
        np.array_equal(bug_set[anchor]['title_word_bert'], bug_set[neg]['title_word_bert']):
        batch_equal_desc +=1
        
print("batch_equal_title {}({:.2f}%)".format(batch_equal_desc, (batch_equal_desc / batch_size) * 100.0))
print("anchor_neg_equal_title {}({:.2f}%)".format(anchor_neg_equal_desc, (anchor_neg_equal_desc / batch_size) * 100.0))
print("pos_neg_equal_title {}({:.2f}%)".format(pos_neg_equal_desc, (pos_neg_equal_desc / batch_size) * 100.0))
print("anchor_pos_equal_tile {}({:.2f}%)".format(anchor_pos_equal_desc, (anchor_pos_equal_desc / batch_size) * 100.0))

batch_equal_title 0(0.00%)
anchor_neg_equal_title 0(0.00%)
pos_neg_equal_title 0(0.00%)
anchor_pos_equal_tile 3(4.69%)


#### Description

In [18]:
bug_set = baseline.bug_set

for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    print("########### Batch #############")
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description_bert']))
    print("Description of pos {}: {}".format(pos, bug_set[pos]['description_bert']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description_bert']))
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description_word_bert']))
    print("Description of pos {}: {}".format(pos, bug_set[pos]['description_word_bert']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description_word_bert']))

########### Batch #############
Description of anchor 35944: [CLS] os : su ##se linux 8 . 0 , k ##de 3 . 1 downloaded from : eclipse . cis . sin ##ica . ed ##u . t ##w eclipse installation : jd ##t eclipse platform ( run ##time ) r - 2 . 1 - 2003 ##0 ##32 ##7 ##21 ##30 eclipse - platform - 2 . 1 - linux - motif . zip eclipse - jd ##t - 2 . 1 . zip ! session apr 01 , 2003 22 : 58 : 50 . 450 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - java . version = 1 . 4 . 1 _ 02 java . vendor = sun micro ##systems inc . boot ##load ##er constant ##s : os = linux , arch = x ##86 , w ##s = motif , nl = de _ de command - line arguments : - os linux - w ##s motif - arch x ##86 - install file : / us ##r / share / eclipse / ! entry org . eclipse . core . run ##time 2 1 apr 01 , 2003 22 : 58 : 50 . 45 ##2 ! message problems encountered loading the plug - in registry . ! sub ##ent ##ry 1 org . eclipse . core . run ##time 2 1 apr 01 , 2003 22 : 58 : 50 . 45 ##2 ! 

In [19]:
# bugs = {}
# orig ={}
# for bug_id in bug_set.keys():
#     bug = bug_set[bug_id]
#     size = 220
#     orig[bug['description'][:size]]=bug['description']
#     if bug['description'][:size] not in bugs:
#         bugs[bug['description'][:size]]=1
#     else: 
#         bugs[bug['description'][:size]]+=1

In [20]:
# for i in bugs.keys():
#     if bugs[i] > 10:
#         print(i, orig[i])

In [21]:
bug_set = baseline.bug_set

anchor_neg_equal_desc = 0
pos_neg_equal_desc = 0
batch_equal_desc = []
anchor_pos_equal_desc = 0

for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    if np.array_equal(bug_set[anchor]['description_word_bert'], bug_set[neg]['description_word_bert']):
        anchor_neg_equal_desc +=1
    if np.array_equal(bug_set[pos]['description_word_bert'], bug_set[neg]['description_word_bert']):
        pos_neg_equal_desc +=1
    if np.array_equal(bug_set[anchor]['description_word_bert'], bug_set[pos]['description_word_bert']):
        anchor_pos_equal_desc +=1
    if np.array_equal(bug_set[anchor]['description_word_bert'], bug_set[pos]['description_word_bert']) and \
        np.array_equal(bug_set[anchor]['description_word_bert'], bug_set[neg]['description_word_bert']):
        batch_equal_desc.append([anchor, neg])
        
print("batch_equal_desc {}({:.2f}%)".format(len(batch_equal_desc), (len(batch_equal_desc) / batch_size) * 100.0))
print("anchor_neg_equal_desc {}({:.2f}%)".format(anchor_neg_equal_desc, (anchor_neg_equal_desc / batch_size) * 100.0))
print("pos_neg_equal_desc {}({:.2f}%)".format(pos_neg_equal_desc, (pos_neg_equal_desc / batch_size) * 100.0))
print("anchor_pos_equal_desc {}({:.2f}%)".format(anchor_pos_equal_desc, (anchor_pos_equal_desc / batch_size) * 100.0))

batch_equal_desc 0(0.00%)
anchor_neg_equal_desc 0(0.00%)
pos_neg_equal_desc 0(0.00%)
anchor_pos_equal_desc 4(6.25%)


### Bug description equals

In [22]:
bug_set = baseline.bug_set

for i, (anchor, neg) in enumerate(batch_equal_desc):
    print("########### Batch #############")
#     print("Description of anchor {}: {}".format(anchor, bug_set[anchor]))
#     print("Description of neg {}: {}".format(neg, bug_set[neg]))
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description_bert']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description_bert']))
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description_word_bert']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description_word_bert']))

### Model

In [32]:
from keras.models import load_model
from keras_bert import get_custom_objects

file_model = os.path.join("modelos", "model_{}.h5".format(SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))))

model = load_model(file_model, custom_objects=get_custom_objects())

KeyboardInterrupt: 

In [None]:
#model.summary()

In [None]:
bug_set = retrieval.baseline.get_bug_set()

In [None]:
len(bug_set)

In [None]:
'''
    Experiment
'''
epochs = 10
batch_size =  64
batch_bugs = []
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = experiment.batch_iterator_bert(model, baseline.train_data, baseline.dup_sets_train, bug_train_ids, 
                                       batch_size, 1, issues_by_buckets, TRIPLET_HARD=True)
    batch_bugs.append(batch_triplet_train)
    print("Epoch {} ##########################".format(epoch+1))
    print(batch_triplet_train)
    

In [None]:
len(batch_bugs)

In [None]:
train_ids = list(baseline.train_data)

train_ids

In [None]:
baseline.dup_sets_train

In [None]:
buckets = retrieval.buckets

In [None]:
import random

def get_neg_bug_semihard(batch_dups, anchor, dups):
    negs = list(set(batch_dups) - set(dups))
    print("Total of negs {}".format(len(negs)))
    negs_invalids = [n for n in negs if n in dups]
    if len(negs_invalids) > 0:
        print("Negs of {}, {}".format(anchor, pos))
        print("Dups of {}: {}".format(anchor, dups))
        print("Contains {} negs invalids: {}".format(len(negs_invalids), negs_invalids))
        print("Negs:")
        print(negs)
    return negs[0]

random.shuffle(train_ids)
n_train = len(train_ids)

INCLUDE_MASTER = False
USE_CENTROID = False

batch_data, batch_dups, batch_bugs_anchor, batch_bugs_pos, batch_bugs_neg = [], [], [], [], []

for offset in range(batch_size):
    anchor, pos = train_ids[offset][0], train_ids[offset][1]
    batch_bugs_anchor.append(anchor)
    batch_bugs_pos.append(pos)
    batch_data.append(anchor)
    batch_data.append(pos)
    batch_dups += baseline.dup_sets_train[anchor]

print("Batch_dups", batch_dups)

for anchor, pos in zip(batch_bugs_anchor, batch_bugs_pos):
    while True:
        neg = get_neg_bug_semihard(batch_dups, anchor, buckets[issues_by_buckets[anchor]])

        if neg not in baseline.bug_set \
            or ((INCLUDE_MASTER or USE_CENTROID) and issues_by_buckets[neg] not in baseline.bug_set):
            continue
            
        batch_data.append(neg)
        batch_bugs_neg.append(neg)
        break;

In [None]:
for batch in batch_bugs:
    n = len(batch)
    print("Total of bugs in batch {}".format(n))
    how_many_has_neg_equal_anchor = len([b for b in batch if b[0] == b[2]])
    how_many_has_neg_exist_in_dups = len([b for b in batch if b[2] in buckets[issues_by_buckets[b[0]]]])
    print("Neg exists in Dups , {} ({:.2f}%)".format(how_many_has_neg_exist_in_dups, ((how_many_has_neg_exist_in_dups / n) * 100.0) ))
    print("Neg = Anchor , {} ({:.2f}%)".format(how_many_has_neg_equal_anchor, ((how_many_has_neg_equal_anchor / n) * 100.0) ))