# Introduction

In this notebook we demonstrate the use of the **LDA (Latent Dirichlet Allocation)** generative statistical model as an Information Retrieval technique to recover trace links between Test Cases and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together compose a single query.
* The content of each test case is treated as an entire document that must be returned for a given query.

# Import Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd

from modules.models_runner.tc_br_models_runner import TC_BR_Runner
from modules.models_runner.tc_br_models_runner import TC_BR_Models_Hyperp
from modules.utils import aux_functions
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok

from modules.models.lda import LDA

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

# Load Datasets

In [2]:
# Numbers 37..58 (the range end is exclusive) used to slice the expert oracle.
tcs = list(range(37, 59))
orc = fd.Tc_BR_Oracles.read_oracle_expert_df()
# Keep only the oracle rows whose index value appears in `tcs`.
orc_subset = orc.loc[orc.index.isin(tcs)]
#aux_functions.highlight_df(orc_subset)

OracleExpert.shape: (195, 91)


In [3]:
# Identifiers of the test cases and of the bug report examined in this notebook.
# Note that `tcs` is rebound here and replaces the list from the previous cell.
tcs = [13, 37, 60]
brs = [1267501]

# Read each dataset and immediately narrow it to the selected identifiers.
# The test cases are read first, then the bug reports.
testcases = fd.Datasets.read_testcases_df().loc[
    lambda frame: frame['TC_Number'].isin(tcs)
]
bugreports = fd.Datasets.read_selected_bugreports_df().loc[
    lambda frame: frame['Bug_Number'].isin(brs)
]

print('tc.shape: {}'.format(testcases.shape))
print('br.shape: {}'.format(bugreports.shape))

TestCases.shape: (195, 12)
SelectedBugReports.shape: (91, 18)
tc.shape: (3, 12)
br.shape: (1, 18)


# Running LDA Model

In [4]:
# Test-case descriptions form the corpus; the bug-report description is the query.
corpus, query = testcases['tc_desc'], bugreports['br_desc']
# Names used to label both sides of the recovered links.
test_cases_names, bug_reports_names = testcases['tc_name'], bugreports['br_name']

# Take the hyperparameters supplied by TC_BR_Models_Hyperp and request 3 topics.
# NOTE(review): LDA fitting is usually stochastic -- confirm that these
# hyperparameters fix a random seed so the numbers below are reproducible.
lda_hyperp = TC_BR_Models_Hyperp.get_lda_model_hyperp()
lda_hyperp['lda__lda_model__n_components'] = 3
#print(lda_hyperp)

lda_model = LDA(**lda_hyperp)
lda_model.set_name('LDA_Model_TC_BR')
lda_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

 ..Total processing time: 0.92 seconds


In [5]:
# Dimensions of the similarity matrix returned by the model after recover_links()
# (3 test cases and 1 bug report were selected for this run).
lda_model.get_sim_matrix().shape

(3, 1)

In [6]:
# Keep the similarity matrix in a variable and render it with the project's
# highlight_df helper (test cases as rows, the bug report as the column).
sim_matrix = lda_model.get_sim_matrix()
aux_functions.highlight_df(sim_matrix)

br_name,BR_1267501_SRC
tc_name,Unnamed: 1_level_1
TC_13_TRG,0.250443
TC_37_TRG,0.995332
TC_60_TRG,0.261048


In [7]:
# Print one line of terms per topic (presumably the highest-weighted terms of
# each topic -- confirm against LDA.print_topics).
lda_model.print_topics()

Topic #0: theme instal awesom complet launch firefox bar browser nan new
Topic #1: scroll key config async make true sure apz page bar
Topic #2: bar page launch firefox issu ani use browser complet instal


In [8]:
# Dense token x test-case view of the vectorized corpus: the matrix is
# transposed up front so tokens are the rows and test cases are the columns.
# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, newer releases
# replace get_feature_names() with get_feature_names_out() -- confirm the
# installed version.
df = pd.DataFrame(
    lda_model._corpus_matrix.toarray().T,
    index=pd.Index(lda_model.vectorizer.get_feature_names(), name='token'),
    columns=test_cases_names,
)
print(df.shape)
aux_functions.highlight_df(df.head(15))

(61, 3)


tc_name,TC_13_TRG,TC_37_TRG,TC_60_TRG
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abl,0.0,0.0,0.0729981
activ,0.0,0.0,0.0729981
ani,0.125403,0.0628565,0.0431138
appear,0.0,0.0,0.145996
apz,0.0,0.212851,0.0
arrow,0.0,0.106425,0.0
async,0.0,0.212851,0.0
awesom,0.424652,0.0,0.0
awesomebar,0.212326,0.0,0.0
bar,0.322959,0.161878,0.0


In [9]:
# Dense token x bug-report view of the vectorized query: the vector is
# transposed up front so tokens are the rows and the bug report is the column.
# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, newer releases
# replace get_feature_names() with get_feature_names_out() -- confirm the
# installed version.
df_q = pd.DataFrame(
    lda_model._query_vector.toarray().T,
    index=pd.Index(lda_model.vectorizer.get_feature_names(), name='token'),
    columns=bug_reports_names,
)
print(df_q.shape)
# Show a slice from the middle of the vocabulary (rows 30..49).
aux_functions.highlight_df(df_q.iloc[30:50, :])

(61, 1)


br_name,BR_1267501_SRC
token,Unnamed: 1_level_1
lightweight,0.0
long,0.0
make,0.216908
manag,0.0
mous,0.0
,0.164964
new,0.329928
onc,0.0
open,0.216908
page,0.494892


In [10]:
# Token x topic table built from the fitted model's `components_` array
# (one row per topic before the transpose below).
components_df = pd.DataFrame(lda_model.lda_model.components_)
# Derive one label per topic row instead of hard-coding three labels, so this
# cell keeps working when the number of topics (`n_components`) is changed.
components_df.index = [
    'Topic #{}'.format(topic_idx) for topic_idx in range(components_df.shape[0])
]
components_df = components_df.T
# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, newer releases
# replace get_feature_names() with get_feature_names_out() -- confirm the
# installed version.
components_df.index = lda_model.vectorizer.get_feature_names()
components_df.index.name = 'token'
print(components_df.shape)
aux_functions.highlight_df(components_df.head(10))

(61, 3)


Unnamed: 0_level_0,Topic #0,Topic #1,Topic #2
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abl,0.404751,0.334034,0.334213
activ,0.404751,0.334034,0.334213
ani,0.500819,0.395629,0.334926
appear,0.477288,0.334238,0.334471
apz,0.334001,0.544653,0.334197
arrow,0.333905,0.438447,0.334073
async,0.334001,0.544653,0.334197
awesom,0.756127,0.334191,0.334335
awesomebar,0.543811,0.334186,0.334329
bar,0.656362,0.493309,0.335166


In [11]:
# Render the model's `out_1` attribute as a table.
# NOTE(review): presumably the per-test-case topic distribution (one row per
# test case, one column per topic) -- confirm against the LDA class.
aux_functions.highlight_df(pd.DataFrame(lda_model.out_1))

Unnamed: 0,0,1,2
0,0.859821,0.0704783,0.0697007
1,0.0691332,0.863085,0.0677819
2,0.844059,0.0775187,0.0784221


In [12]:
# Render the model's `out_2` attribute as a table.
# NOTE(review): presumably the topic distribution of the bug-report query
# (one row, one column per topic) -- confirm against the LDA class.
aux_functions.highlight_df(pd.DataFrame(lda_model.out_2))

Unnamed: 0,0,1,2
0,0.12753,0.772347,0.100123


In [13]:
# Per-document features kept by the model for every test case and bug report.
# NOTE(review): `mrw` presumably lists the most relevant words and `dl` the
# document length -- confirm against the LDA class.
lda_model.docs_feats_df

Unnamed: 0,mrw,dl
TC_13_TRG,"[awesom, launch, bar, firefox, set, display]",24
TC_37_TRG,"[scroll, key, sure, apz, make, async]",43
TC_60_TRG,"[theme, instal, complet, browser, appear, rest...",50
BR_1267501_SRC,"[scroll, page, new, true, open, make]",78
