# Introduction

In this notebook we demonstrate how **Word Embeddings (Word2Vec)** can be used as an Information Retrieval technique to recover trace links between Test Cases and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together form a single query.
* The entire content of each test case is treated as a single document to be retrieved in response to a query.

# Import Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd

from modules.models_runner.tc_br_models_runner import TC_BR_Runner
from modules.models_runner.tc_br_models_runner import TC_BR_Models_Hyperp
from modules.utils import aux_functions
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok

from modules.models.wordvec import WordVec_BasedModel

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

# Load Datasets

In [2]:
# Select the oracle rows whose index value falls in 37..58.
# NOTE(review): presumably the oracle is indexed by test case number - confirm.
tcs = list(range(37, 59))
orc = fd.Tc_BR_Oracles.read_oracle_expert_df()
orc_subset = orc.loc[orc.index.isin(tcs)]
# Uncomment to inspect the subset:
#aux_functions.highlight_df(orc_subset)

OracleExpert.shape: (195, 91)


In [3]:
# Restrict the experiment to three test cases and a single bug report.
tcs = [13, 37, 60]
brs = [1267501]

# Load each dataset and filter it in one step, so each name is bound only
# once and always refers to the filtered frame.
testcases = fd.Datasets.read_testcases_df().loc[
    lambda df: df.TC_Number.isin(tcs)
]
bugreports = fd.Datasets.read_selected_bugreports_df().loc[
    lambda df: df.Bug_Number.isin(brs)
]

# Report how many rows/columns survived the filtering.
print('tc.shape: {}'.format(testcases.shape))
print('br.shape: {}'.format(bugreports.shape))

TestCases.shape: (195, 12)
SelectedBugReports.shape: (91, 18)
tc.shape: (3, 12)
br.shape: (1, 18)


# Running WordVector Model

In [4]:
# Documents to be retrieved (test cases) and the queries (bug reports),
# together with the names used to label them.
corpus = testcases['tc_desc']
query = bugreports['br_desc']
test_cases_names = testcases['tc_name']
bug_reports_names = bugreports['br_name']

# Build the word-vector model from the hyperparameters supplied by the
# project, name it, and run the trace link recovery.
wv_hyperp = TC_BR_Models_Hyperp.get_w2v_model_hyperp()
wv_model = WordVec_BasedModel(**wv_hyperp)
wv_model.set_name('WV_Model_TC_BR')
wv_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

In [5]:
# Shape of the similarity matrix; with the current selection it has one row
# per test case and one column per bug report.
wv_model.get_sim_matrix().shape

(3, 1)

In [6]:
# Fetch the similarity matrix (test cases as rows, bug reports as columns)
# and render it through the project's highlighting helper.
sim_matrix = wv_model.get_sim_matrix()
aux_functions.highlight_df(sim_matrix)

br_name,BR_1267501_SRC
tc_name,Unnamed: 1_level_1
TC_13_TRG,0.886369
TC_37_TRG,0.940672
TC_60_TRG,0.925198


In [59]:
# One row per test case document, holding that document's vector.
df_tcs = pd.DataFrame(data=[doc.vector for doc in wv_model.tc_docs])
df_tcs.index = test_cases_names
print(df_tcs.shape)

# Preview: at most the first five rows and the first four vector components.
df_tcs.iloc[:5, :4]

(3, 300)


Unnamed: 0_level_0,0,1,2,3
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TC_13_TRG,-0.022611,0.099271,-0.052506,-0.049523
TC_37_TRG,0.01722,0.163365,-0.1086,-0.033845
TC_60_TRG,0.11457,0.078011,-0.107812,-0.04774


In [58]:
# Single-row frame holding the vector of the only selected bug report.
df_brs = pd.DataFrame(data=[wv_model.br_docs[0].vector])
df_brs.index = bug_reports_names
print(df_brs.shape)

# Preview: the first four vector components.
df_brs.iloc[:5, :4]

(1, 300)


Unnamed: 0_level_0,0,1,2,3
br_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BR_1267501_SRC,0.066742,0.120455,-0.156679,0.029707


Word Vector Pseudo-Example

In [66]:
# Per-token embeddings of the bug report document.
# NOTE(review): assumes iterating the document yields the same tokens, in the
# same order, as indexing it from 0 to len-1 - confirm for the document type.
br_doc = wv_model.br_docs[0]
br_tokens = list(br_doc)

# One row per token, labelled by the token itself.
df_w_emb = pd.DataFrame([token.vector for token in br_doc])
df_w_emb.index = br_tokens

# Tokens 71..75, first four vector components.
df_w_emb.iloc[71:76, :4]

Unnamed: 0,0,1,2,3
Try,-0.001388,0.031293,-0.52479,0.017237
to,0.31924,0.06316,-0.27858,0.2612
scroll,0.22814,-0.3502,-0.005245,0.13763
around,0.12713,0.087776,-0.17466,-0.006293
horizontally,0.54529,-0.44698,-0.1677,0.22094


Bug Report

In [54]:
# Per-token embeddings of the bug report document.
# NOTE(review): assumes iterating the document yields the same tokens, in the
# same order, as indexing it from 0 to len-1 - confirm for the document type.
br_doc = wv_model.br_docs[0]
br_tokens = list(br_doc)

# Build the frame from the token list so rows and labels share one source.
df_w_emb = pd.DataFrame([token.vector for token in br_tokens])
df_w_emb.index = br_tokens

# Tokens 24..26, first four vector components.
df_w_emb.iloc[24:27, :4]

Unnamed: 0,0,1,2,3
window,0.68758,-0.34531,-0.18579,0.2387
sizes,0.1823,0.89011,-0.20703,-0.40545
Unspecified,0.091072,-0.006971,-0.19308,0.48667


Test Case 13

In [55]:
# Per-token embeddings of the first test case document (TC 13).
# NOTE(review): assumes iterating the document yields the same tokens, in the
# same order, as indexing it from 0 to len-1 - confirm for the document type.
tc_doc = wv_model.tc_docs[0]
tc_tokens = list(tc_doc)

# One row per token, labelled by the token itself.
df_w_emb = pd.DataFrame([token.vector for token in tc_doc])
df_w_emb.index = tc_tokens

# Tokens 30..32, first four vector components.
df_w_emb.iloc[30:33, :4]

Unnamed: 0,0,1,2,3
Firefox,0.24572,-0.30072,0.40298,-0.12406
launches,-0.14742,0.095634,0.15905,0.039806
without,0.009888,0.027157,-0.3287,-0.031992


Test Case 37

In [56]:
# Per-token embeddings of the second test case document (TC 37).
# NOTE(review): assumes iterating the document yields the same tokens, in the
# same order, as indexing it from 0 to len-1 - confirm for the document type.
tc_doc = wv_model.tc_docs[1]
tc_tokens = list(tc_doc)

# One row per token, labelled by the token itself.
df_w_emb = pd.DataFrame([token.vector for token in tc_doc])
df_w_emb.index = tc_tokens

# Tokens 35..37, first four vector components.
df_w_emb.iloc[35:38, :4]

Unnamed: 0,0,1,2,3
is,-0.084961,0.502,0.002382,-0.16755
true,0.096561,0.50832,-0.28025,-0.22873
in,0.089187,0.25792,0.26282,-0.029365


Test Case 60

In [57]:
# Per-token embeddings of the third test case document (TC 60).
# NOTE(review): assumes iterating the document yields the same tokens, in the
# same order, as indexing it from 0 to len-1 - confirm for the document type.
tc_doc = wv_model.tc_docs[2]
tc_tokens = list(tc_doc)

# One row per token, labelled by the token itself.
df_w_emb = pd.DataFrame([token.vector for token in tc_doc])
df_w_emb.index = tc_tokens

# Tokens 30..32, first four vector components.
df_w_emb.iloc[30:33, :4]

Unnamed: 0,0,1,2,3
each,-0.17246,0.054016,0.034461,0.19666
theme,0.46639,0.2074,0.18573,-0.40191
installation,0.49132,-0.048056,0.2007,-0.21108
