In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


In [3]:
MAX_SEQUENCE_LENGTH_T = 20 # 20
MAX_SEQUENCE_LENGTH_D = 20 # 80
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

'''
    Configuration
'''
epochs = 100
best_loss = 1
best_epoch = 0
verbose = 0
loss = 1

In [4]:
# Domain to use
DOMAIN = 'eclipse'
'''
    propose_centroid_bert_
    propose_bert_triplet_
    propose_bert_
    baseline_dwen_
    baseline_
'''
METHOD = 'baseline_{}'.format(epochs)
EMBED_METHOD='keras'
PREPROCESSING = 'baseline'
ONLY_BUCKETS = False
# Dataset paths
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Path embeddings
EMBED_DIR='data/embed'
# Save model
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

In [5]:
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [6]:
from keras_bert import load_vocabulary

token_dict = load_vocabulary(vocab_path)

In [7]:
token_dict['[CLS]'], token_dict['[SEP]']

(101, 102)

In [8]:
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                    token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [9]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [10]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


361006

In [11]:
%%time

experiment.load_bugs(EMBED_METHOD)
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 26.4 s, sys: 2.6 s, total: 29 s
Wall time: 28.9 s


In [12]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




In [13]:
%%time

# path_train='train_chronological', path_test='test_chronological'
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

CPU times: user 1.5 s, sys: 4.85 ms, total: 1.5 s
Wall time: 1.5 s


In [14]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

### Visualizing batchs

### Model

In [15]:
from keras.models import load_model
from keras_bert import get_custom_objects

file_model = os.path.join("modelos", "model_{}.h5".format(SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))))

model = load_model(file_model, custom_objects=get_custom_objects())



In [16]:
%%time

batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = experiment.batch_iterator(model, 
                                                                                                      baseline.train_data, 
                                                                                                      baseline.dup_sets_train,
                                                                                                      bug_train_ids,
                                                                                                      batch_size_test, 1,
                                                                                                      issues_by_buckets, 
                                                                                                      TRIPLET_HARD=True)

CPU times: user 43.3 s, sys: 52.7 s, total: 1min 36s
Wall time: 1min 20s


#### Title

In [17]:
bug_set = baseline.bug_set


for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    print("########### Batch #############")
    print("Title of anchor {}: {}".format(anchor, bug_set[anchor]['title']))
    print("Title of pos {}: {}".format(pos, bug_set[pos]['title']))
    print("Title of neg {}: {}".format(neg, bug_set[neg]['title']))
    print("Title of anchor {}: {}".format(anchor, bug_set[anchor]['title_token']))
    print("Title of pos {}: {}".format(pos, bug_set[pos]['title_token']))
    print("Title of neg {}: {}".format(neg, bug_set[neg]['title_token']))

########### Batch #############
Title of anchor 40229: product crashes do not start with product
Title of pos 40821: organization crashes after upgrading to product
Title of neg 19111: different out folder drive for class files
Title of anchor 40229: [  29 1436   86   19  131   32   29    0    0    0    0    0    0    0
    0    0    0    0    0    0]
Title of pos 40821: [   7 1436  217 3629    8   29    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
Title of neg 19111: [ 450  211  236 2807   22   38  181    0    0    0    0    0    0    0
    0    0    0    0    0    0]
########### Batch #############
Title of anchor 243073: organization crashes after saving compiling after changing a java file number to number times
Title of pos 218311: organization crashes on bit product
Title of neg 105228: error in component structural builder
Title of anchor 243073: [   7 1436  217 1635 2003  217 1096   11    3   39   10    8   10  854
    0    0    0    0    0    0]
Title o

In [18]:
bug_set = baseline.bug_set

anchor_neg_equal_desc = 0
pos_neg_equal_desc = 0
batch_equal_desc = 0
anchor_pos_equal_desc = 0

for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    if np.array_equal(bug_set[anchor]['title_token'], bug_set[neg]['title_token']):
        anchor_neg_equal_desc +=1
    if np.array_equal(bug_set[pos]['title_token'], bug_set[neg]['title_token']):
        pos_neg_equal_desc +=1
    if np.array_equal(bug_set[anchor]['title_token'], bug_set[pos]['title_token']):
        anchor_pos_equal_desc +=1
    if np.array_equal(bug_set[anchor]['title_token'], bug_set[pos]['title_token']) and \
        np.array_equal(bug_set[anchor]['title_token'], bug_set[neg]['title_token']):
        batch_equal_desc +=1
        
print("batch_equal_title {}({:.2f}%)".format(batch_equal_desc, (batch_equal_desc / batch_size) * 100.0))
print("anchor_neg_equal_title {}({:.2f}%)".format(anchor_neg_equal_desc, (anchor_neg_equal_desc / batch_size) * 100.0))
print("pos_neg_equal_title {}({:.2f}%)".format(pos_neg_equal_desc, (pos_neg_equal_desc / batch_size) * 100.0))
print("anchor_pos_equal_tile {}({:.2f}%)".format(anchor_pos_equal_desc, (anchor_pos_equal_desc / batch_size) * 100.0))

batch_equal_title 0(0.00%)
anchor_neg_equal_title 0(0.00%)
pos_neg_equal_title 0(0.00%)
anchor_pos_equal_tile 2(3.12%)


#### Description

In [19]:
bug_set = baseline.bug_set

for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    print("########### Batch #############")
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description']))
    print("Description of pos {}: {}".format(pos, bug_set[pos]['description']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description']))
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description_token']))
    print("Description of pos {}: {}".format(pos, bug_set[pos]['description_token']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description_token']))

########### Batch #############
Description of anchor 40229: an unexpected exception has been detected in native code outside the organization unexpected signal exception access violation xc occurred at pc x e a c function is char alpha a x d library organization thread at org eclipse swt internal win os draw text w native method at org eclipse swt internal win os draw text os java at org eclipse swt graphics gc draw text gc java at org eclipse swt graphics gc draw text gc java at org eclipse swt custom clabel on paint clabel java at org eclipse swt custom clabel access clabel java at org eclipse swt custom clabel paint control clabel java at org eclipse swt widgets typed listener handle event typed listener java at org eclipse swt widgets event table send event event table java at org eclipse swt widgets widget send event widget java at org eclipse swt widgets widget send event widget java at org eclipse swt widgets widget send event widget java at org eclipse swt widgets composite wm

In [20]:
# bugs = {}
# orig ={}
# for bug_id in bug_set.keys():
#     bug = bug_set[bug_id]
#     size = 220
#     orig[bug['description'][:size]]=bug['description']
#     if bug['description'][:size] not in bugs:
#         bugs[bug['description'][:size]]=1
#     else: 
#         bugs[bug['description'][:size]]+=1

In [21]:
# for i in bugs.keys():
#     if bugs[i] > 10:
#         print(i, orig[i])

In [22]:
bug_set = baseline.bug_set

anchor_neg_equal_desc = 0
pos_neg_equal_desc = 0
batch_equal_desc = []
anchor_pos_equal_desc = 0

for i, (anchor, pos, neg) in enumerate(batch_triplets_valid):
    if np.array_equal(bug_set[anchor]['description_token'], bug_set[neg]['description_token']):
        anchor_neg_equal_desc +=1
    if np.array_equal(bug_set[pos]['description_token'], bug_set[neg]['description_token']):
        pos_neg_equal_desc +=1
    if np.array_equal(bug_set[anchor]['description_token'], bug_set[pos]['description_token']):
        anchor_pos_equal_desc +=1
    if np.array_equal(bug_set[anchor]['description_token'], bug_set[pos]['description_token']) and \
        np.array_equal(bug_set[anchor]['description_token'], bug_set[neg]['description_token']):
        batch_equal_desc.append([anchor, neg])
        
print("batch_equal_desc {}({:.2f}%)".format(len(batch_equal_desc), (len(batch_equal_desc) / batch_size) * 100.0))
print("anchor_neg_equal_desc {}({:.2f}%)".format(anchor_neg_equal_desc, (anchor_neg_equal_desc / batch_size) * 100.0))
print("pos_neg_equal_desc {}({:.2f}%)".format(pos_neg_equal_desc, (pos_neg_equal_desc / batch_size) * 100.0))
print("anchor_pos_equal_desc {}({:.2f}%)".format(anchor_pos_equal_desc, (anchor_pos_equal_desc / batch_size) * 100.0))

batch_equal_desc 0(0.00%)
anchor_neg_equal_desc 0(0.00%)
pos_neg_equal_desc 0(0.00%)
anchor_pos_equal_desc 8(12.50%)


### Bug description equals

In [23]:
bug_set = baseline.bug_set

for i, (anchor, neg) in enumerate(batch_equal_desc):
    print("########### Batch #############")
#     print("Description of anchor {}: {}".format(anchor, bug_set[anchor]))
#     print("Description of neg {}: {}".format(neg, bug_set[neg]))
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description_token']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description_token']))
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description_token']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description_token']))

### Batch BERT

In [24]:
batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, train_master_input, train_master_neg, \
            train_sim = experiment.batch_iterator_bert(model, baseline.train_data, baseline.dup_sets_train, \
                                                       bug_train_ids, 
                                                       batch_size, 1, 
                                                       issues_by_buckets, 
                                                       TRIPLET_HARD=True, USE_CENTROID=True, method='keras')

#### Title

In [25]:
len(train_master_input['centroid_embed']), len(batch_triplet_train)

(64, 64)

In [26]:
bug_set = baseline.bug_set


for i, (anchor, pos, neg, centroid_embed_pos, centroid_embed_neg) in enumerate(batch_triplet_train):
    print("########### Batch #############")
    print("Title of anchor {}: {}".format(anchor, bug_set[anchor]['title']))
    print("Title of pos {}: {}".format(pos, bug_set[pos]['title']))
    print("Title of neg {}: {}".format(neg, bug_set[neg]['title']))
    print("Title of anchor {}: {}".format(anchor, train_input_sample['title']['token'][i]))
    print("Title of pos {}: {}".format(pos, train_input_pos['title']['token'][i]))
    print("Title of neg {}: {}".format(neg, train_input_neg['title']['token'][i]))
    print("Title of centroid_anchor_pos {}: {}".format(anchor, centroid_embed_pos['centroid_embed'][:20]))
    print("Title of centroid_neg {}: {}".format(neg, centroid_embed_neg['centroid_embed'][:20]))

########### Batch #############
Title of anchor 107520: organization crashes on product bit
Title of pos 90797: random crash with m linux gtk amd
Title of neg 111105: can not select java class in drag drop listener providers extension
Title of anchor 107520: [   7 1436   31   29  976    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
Title of pos 90797: [2089 1239   32  177  573  406 2266    0    0    0    0    0    0    0
    0    0    0    0    0    0]
Title of neg 111105: [  73   19  168    3   38   12  760  571  205 1362  255    0    0    0
    0    0    0    0    0    0]
Title of centroid_anchor_pos 107520: [0.021624187007546425, -0.03916782885789871, 0.015942726284265518, -0.029586311429739, -0.024304823949933052, 0.0123555613681674, -0.03847341239452362, 0.01624973490834236, 0.008031495846807957, -0.03261992707848549, 0.03816886246204376, 0.027485257014632225, -0.02860763855278492, 0.012449117377400398, -0.055715128779411316, -0.03628017380833626, -0.05

In [27]:
bug_set = baseline.bug_set

anchor_neg_equal_desc = 0
pos_neg_equal_desc = 0
batch_equal_desc = 0
anchor_pos_equal_desc = 0
centroids_equal = 0

for i, (anchor, pos, neg, centroid_embed_pos, centroid_embed_neg) in enumerate(batch_triplet_train):
    if np.array_equal(train_input_sample['title']['token'][i], train_input_neg['title']['token'][i]):
        anchor_neg_equal_desc +=1
    if np.array_equal(train_input_pos['title']['token'][i], train_input_neg['title']['token'][i]):
        pos_neg_equal_desc +=1
    if np.array_equal(train_input_sample['title']['token'][i], train_input_pos['title']['token'][i]):
        anchor_pos_equal_desc +=1
    if np.array_equal(train_input_sample['title']['token'][i], train_input_pos['title']['token'][i]) and \
        np.array_equal(train_input_sample['title']['token'][i], train_input_neg['title']['token'][i]):
        batch_equal_desc +=1
    if np.array_equal(centroid_embed_pos['centroid_embed'], centroid_embed_neg['centroid_embed']):
        centroids_equal +=1
        
print("batch_equal_title {}({:.2f}%)".format(batch_equal_desc, (batch_equal_desc / batch_size) * 100.0))
print("anchor_neg_equal_title {}({:.2f}%)".format(anchor_neg_equal_desc, (anchor_neg_equal_desc / batch_size) * 100.0))
print("pos_neg_equal_title {}({:.2f}%)".format(pos_neg_equal_desc, (pos_neg_equal_desc / batch_size) * 100.0))
print("anchor_pos_equal_title {}({:.2f}%)".format(anchor_pos_equal_desc, (anchor_pos_equal_desc / batch_size) * 100.0))
print("centroid_equal_title {}({:.2f}%)".format(centroids_equal, (centroids_equal / batch_size) * 100.0))

batch_equal_title 0(0.00%)
anchor_neg_equal_title 0(0.00%)
pos_neg_equal_title 0(0.00%)
anchor_pos_equal_title 1(1.56%)
centroid_equal_title 0(0.00%)


#### Eclipse
- batch_equal_title 0(0.00%)
- anchor_neg_equal_title 0(0.00%)
- pos_neg_equal_title 0(0.00%)
- anchor_pos_equal_title 3(4.69%)
- centroid_equal_title 0(0.00%)

#### Description

In [28]:
bug_set = baseline.bug_set


for i, (anchor, pos, neg, centroid_embed_pos, centroid_embed_neg) in enumerate(batch_triplet_train):
    print("########### Batch #############")
    print("Description of anchor {}: {}".format(anchor, bug_set[anchor]['description']))
    print("Description of pos {}: {}".format(pos, bug_set[pos]['description']))
    print("Description of neg {}: {}".format(neg, bug_set[neg]['description']))
    print("Description of anchor {}: {}".format(anchor, train_input_sample['description']['token'][i]))
    print("Description of pos {}: {}".format(pos, train_input_pos['description']['token'][i]))
    print("Description of neg {}: {}".format(neg, train_input_neg['description']['token'][i]))
    print("Description of centroid_anchor_pos {}: {}".format(anchor, centroid_embed_pos['centroid_embed'][:MAX_SEQUENCE_LENGTH_D]))
    print("Description of centroid_neg {}: {}".format(neg, centroid_embed_neg['centroid_embed'][:MAX_SEQUENCE_LENGTH_D]))

########### Batch #############
Description of anchor 107520: product crashes very often but unpredictably sometimes i can work time sometimes it crashes multiple times within x timeutes here are the logs http www nightlabs de o eclipse i hope they are helpful i posted the bug here jdt organization because in most of the cases the thread which causes the crash is org eclipse jdt internal ui text java reconciler usually it happens while i am changing some java source code in the java editor i am running su se os number bit as you can see in the logs the bug exists in both version number and number best regards o
Description of pos 90797: eclipse crashes but this does not appear to happen with some particular operation an unexpected error has been detected by organization number organization xb at pc x a a cb d person tid organization person bit server vm bx timeixed mode number problematic frame j org eclipse jdt internal compiler parser scanner get next token i attached is the complete

In [29]:
bug_set = baseline.bug_set

anchor_neg_equal_desc = 0
anchor_neg_equal_desc_ids = []
pos_neg_equal_desc = 0
batch_equal_desc = 0
anchor_pos_equal_desc = 0
centroids_equal = 0

for i, (anchor, pos, neg, centroid_embed_pos, centroid_embed_neg) in enumerate(batch_triplet_train):
    if np.array_equal(train_input_sample['description']['token'][i], train_input_neg['description']['token'][i]):
        anchor_neg_equal_desc +=1
        anchor_neg_equal_desc_ids.append([anchor, neg])
    if np.array_equal(train_input_pos['description']['token'][i], train_input_neg['description']['token'][i]):
        pos_neg_equal_desc +=1
    if np.array_equal(train_input_sample['description']['token'][i], train_input_pos['description']['token'][i]):
        anchor_pos_equal_desc +=1
    if np.array_equal(train_input_sample['description']['token'][i], train_input_pos['description']['token'][i]) and \
        np.array_equal(train_input_sample['description']['token'][i], train_input_neg['description']['token'][i]):
        batch_equal_desc +=1
    if np.array_equal(centroid_embed_pos['centroid_embed'], centroid_embed_neg['centroid_embed']):
        centroids_equal +=1
        
print("batch_equal_description {}({:.2f}%)".format(batch_equal_desc, (batch_equal_desc / batch_size) * 100.0))
print("anchor_neg_equal_description {}({:.2f}%)".format(anchor_neg_equal_desc, (anchor_neg_equal_desc / batch_size) * 100.0))
print("pos_neg_equal_description {}({:.2f}%)".format(pos_neg_equal_desc, (pos_neg_equal_desc / batch_size) * 100.0))
print("anchor_pos_equal_description {}({:.2f}%)".format(anchor_pos_equal_desc, (anchor_pos_equal_desc / batch_size) * 100.0))
print("centroid_equal_description {}({:.2f}%)".format(centroids_equal, (centroids_equal / batch_size) * 100.0))

batch_equal_description 0(0.00%)
anchor_neg_equal_description 0(0.00%)
pos_neg_equal_description 0(0.00%)
anchor_pos_equal_description 2(3.12%)
centroid_equal_description 0(0.00%)


In [30]:
anchor_neg_equal_desc_ids

[]

#### Eclipse
- batch_equal_description 0(0.00%)
- anchor_neg_equal_description 1(1.56%)
- pos_neg_equal_description 0(0.00%)
- anchor_pos_equal_description 7(10.94%)
- centroid_equal_description 0(0.00%)

Empty title are considered as equal when compare anchor and neg with this case

[[49686, 63093]]

### Duplicates on batch

In [31]:
buckets = retrieval.buckets

In [32]:
buckets_anchor = {}
buckets_pos = {}
buckets_neg = {}
dups_alone = []
pos_equal_neg = 0
anchor_equal_neg = 0
batch_size = len(batch_triplet_train)

def count_freq(anchor, pos, neg, counter):
    n_dups = len(buckets[issues_by_buckets[anchor]])
    # Anchor and Neg
    if n_dups in counter:
        counter[n_dups] += 1
    else:
        counter[n_dups] = 1
    if n_dups == 1:
        dups_alone.append((anchor, pos, neg))

for i, (anchor, pos, neg, centroid_embed_pos, centroid_embed_neg) in enumerate(batch_triplet_train):
    n_dups = len(buckets[issues_by_buckets[anchor]])
    # Anchor and Neg
    count_freq(anchor, pos, neg, buckets_anchor)
    # Anchor and Pos
    count_freq(anchor, pos, neg, buckets_pos)
    # Neg
    count_freq(anchor, pos, neg, buckets_neg)
    # Pos and neg equals
    if pos == neg:
        pos_equal_neg += 1
    if anchor == neg:
        anchor_equal_neg += 1
print("Pos and Neg equals {}({:.2f}%)".format(pos_equal_neg, (pos_equal_neg / batch_size) * 100.0))
print("Anchor and Neg equals {}({:.2f}%)".format(anchor_equal_neg, (anchor_equal_neg / batch_size) * 100.0))

Pos and Neg equals 0(0.00%)
Anchor and Neg equals 0(0.00%)


In [33]:
buckets_pos, buckets_neg

({3: 3,
  4: 3,
  5: 6,
  6: 2,
  7: 2,
  8: 4,
  9: 3,
  10: 1,
  11: 1,
  13: 2,
  15: 3,
  16: 1,
  17: 2,
  18: 1,
  19: 2,
  21: 2,
  23: 3,
  24: 1,
  26: 1,
  28: 1,
  30: 1,
  33: 1,
  35: 5,
  40: 1,
  42: 4,
  46: 4,
  51: 4},
 {3: 3,
  4: 3,
  5: 6,
  6: 2,
  7: 2,
  8: 4,
  9: 3,
  10: 1,
  11: 1,
  13: 2,
  15: 3,
  16: 1,
  17: 2,
  18: 1,
  19: 2,
  21: 2,
  23: 3,
  24: 1,
  26: 1,
  28: 1,
  30: 1,
  33: 1,
  35: 5,
  40: 1,
  42: 4,
  46: 4,
  51: 4})

In [34]:
dups_alone

[]

Bucket with wrong bucket selected issue=214829

[(219540, 219541, 214829), (214829, 203661, 203661)]

In [35]:
buckets[issues_by_buckets[214829]]

{203661, 214829, 226763}

In [36]:
issues_by_buckets[203661], buckets[issues_by_buckets[203661]]

(226763, {203661, 214829, 226763})

In [37]:
issues_by_buckets[214829], buckets[issues_by_buckets[214829]]

(226763, {203661, 214829, 226763})