In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
# Put the parent of the current working directory on the import path
# (presumably so the project-local `methods` package can be imported -- TODO confirm).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


In [3]:
# Sequence-length limits later handed to Baseline (T first, then D); T/D
# presumably stand for title/description -- TODO confirm.
MAX_SEQUENCE_LENGTH_T = 20  # author's note: 20
MAX_SEQUENCE_LENGTH_D = 20  # author's note: 80
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

# Configuration
epochs = 100
verbose = 0

# Loss bookkeeping: both the current and the best loss start at 1, best epoch at 0.
best_loss = loss = 1
best_epoch = 0

In [4]:
# Domain whose data this run uses
DOMAIN = 'firefox'

# Method-name prefixes listed by the author:
#     propose_centroid_bert_
#     propose_bert_triplet_
#     propose_bert_
#     baseline_dwen_
#     baseline_
# The configured epoch count is appended to the chosen prefix.
METHOD = 'propose_bert_triplet_{}'.format(epochs)
EMBED_METHOD = 'bert'
ONLY_BUCKETS = False

# Dataset locations (relative paths)
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# The dataset CSV sits inside the normalized directory and is named after the domain.
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# Embeddings location
EMBED_DIR = 'data/embed'

# Names used when saving the model. The '@number_of_epochs@' token is stored
# literally here; presumably it is substituted later -- TODO confirm.
SAVE_PATH = '{}_feature@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_feature_@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)

# Whether to extract the corpus
EXTRACT_CORPUS = False

In [5]:
# Build the objects used by the rest of the notebook.
# Baseline receives the domain, the processed-data directory, the dataset CSV
# path and the two sequence-length limits.
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
# NOTE(review): verbose is passed as the literal 0 here, not via the `verbose` variable.
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
# The experiment is constructed from the baseline and the evaluation object.
experiment = Experiment(baseline, evaluation)

In [6]:
# Hand the retrieval object, the baseline and the domain name to the experiment.
# In the recorded run this call printed "Creating the buckets..." and showed
# two progress bars.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=35814), HTML(value='')))




In [7]:
# Load the bug ids, then display how many ended up in baseline.bug_ids
# (the bare expression on the last line is the cell's output).
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


115814

In [8]:
%%time

# Load the bugs, then evaluate the size of baseline.sentence_dict as the
# cell's last expression.
experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=115814), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 9.51 s, sys: 1.06 s, total: 10.6 s
Wall time: 10.5 s


In [9]:
# Presumably maps each bug to its bucket -- TODO confirm against
# Experiment.get_buckets_for_bugs. The result is passed to prepare_dataset
# and batch_iterator in later cells.
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=89061), HTML(value='')))




In [10]:
%%time

# NOTE(review): the next line looks like alternative keyword arguments for
# prepare_dataset that are currently not passed -- confirm.
# path_train='train_chronological', path_test='test_chronological'
experiment.prepare_dataset(issues_by_buckets)
# Read and create the test queries duplicates
retrieval.create_queries()

Reading train data
Reading bug ids
CPU times: user 5min 44s, sys: 27.5 ms, total: 5min 44s
Wall time: 5min 44s


In [11]:
# Ids obtained from baseline.train_data; passed to experiment.batch_iterator
# in a later cell.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

### Visualizing batches

In [12]:
%%time

batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
(batch_triplets_valid,
 valid_input_sample,
 valid_input_pos,
 valid_input_neg,
 valid_sim) = experiment.batch_iterator(
    None,
    baseline.train_data,
    baseline.dup_sets_train,
    bug_train_ids,
    batch_size_test,
    1,
    issues_by_buckets,
)

# Inputs are grouped feature by feature (title, description, info); inside
# each group the order is sample, positive, negative. valid_sim is the target.
valid_inputs = (valid_input_sample, valid_input_pos, valid_input_neg)
test_gen = (
    [inputs[feature]
     for feature in ('title', 'description', 'info')
     for inputs in valid_inputs],
    valid_sim,
)

CPU times: user 173 ms, sys: 4 µs, total: 173 ms
Wall time: 172 ms


#### Title

In [13]:
bug_set = baseline.bug_set

# For every validation triplet print the stored 'title' of anchor/pos/neg,
# then the stored 'title_word' of the same three bugs (in the recorded output:
# the tokenized text followed by the padded token-id array).
title_templates = ("Title of anchor {}: {}",
                   "Title of pos {}: {}",
                   "Title of neg {}: {}")
for anchor, pos, neg in batch_triplets_valid:
    print("########### Batch #############")
    for field in ('title', 'title_word'):
        for template, bug_id in zip(title_templates, (anchor, pos, neg)):
            print(template.format(bug_id, bug_set[bug_id][field]))

########### Batch #############
Title of anchor 320485: [CLS] b ' right click options with more number of extensions installed ' [SEP]
Title of pos 292912: [CLS] b ' right click on a link produces a menu that goes up off the edge of the screen when you have plug ##ins installed ' [SEP]
Title of neg 306450: [CLS] b ' selection slide ##r from menu options does not work ' [SEP]
Title of anchor 320485: [  101  2157 11562  7047  2007  2062  2193  1997 14305  5361   102     0
     0     0     0     0     0     0     0     0]
Title of pos 292912: [  101  2157 11562  2006  1037  4957  7137  1037 12183  2008  3632  2039
  2125  1996  3341  1997  1996  3898  2043  2017]
Title of neg 306450: [  101  4989  7358  2099  2013 12183  7047  2515  2025  2147  1012   102
     0     0     0     0     0     0     0     0]
########### Batch #############
Title of anchor 434765: [CLS] b ' hiding status bar on mac causes odd scroll ##bar issue ' [SEP]
Title of pos 342751: [CLS] b ' i get no down arrow along t

#### Description

In [15]:
bug_set = baseline.bug_set

# For every validation triplet print the stored 'description' of
# anchor/pos/neg, then the stored 'description_word' of the same three bugs.
description_templates = ("Description of anchor {}: {}",
                         "Description of pos {}: {}",
                         "Description of neg {}: {}")
for anchor, pos, neg in batch_triplets_valid:
    print("########### Batch #############")
    for field in ('description', 'description_word'):
        for template, bug_id in zip(description_templates, (anchor, pos, neg)):
            print(template.format(bug_id, bug_set[bug_id][field]))

########### Batch #############
Description of anchor 320485: [CLS] b ' user agent mo ##zi ##lla product u os en us rv ge ##cko person build id ##ent ##ifier mo ##zi ##lla product u os en us rv ge ##cko person the full options for the right cl ##ik ##c button for more number of extensions installed is coming properly rep ##rod ##ucible always steps to reproduce just check out clicking with the right cl ##ik ##c at different positions in the screen step index step index actual results the number tab ##s where not visible expected results some kind of calculation which will show you the options based on the space left on top or bottom i want to send a screen shot ' [SEP]
Description of pos 292912: [CLS] b ' user agent mo ##zi ##lla product u os en us rv ge ##cko person build id ##ent ##ifier mo ##zi ##lla product u os en us rv ge ##cko person i have several right click activated plug ##ins installed when i right click on a link the menu is positioned as if i had no plug ##ins since the p