# Introduction

In this notebook we demonstrate the use of the **BM25 (Best Matching 25)** information retrieval technique to recover trace links between test cases and bug reports.

We model our study as follows:

* Each bug report's title, summary and description together compose a single query.
* Each test case's content is treated as a single document to be retrieved in response to a query.


# Import Libraries

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
# Kept ahead of the `modules.*` imports in this cell.
# NOTE(review): presumably this call makes the project's `modules` package
# importable by extending the import search path — confirm in mod_finder_util.
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd

from modules.models_runner.tc_br_models_runner import TC_BR_Runner
from modules.models_runner.tc_br_models_runner import TC_BR_Models_Hyperp
from modules.utils import aux_functions
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok

from modules.models.bm25 import BM_25

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

# Load Datasets

In [3]:
# Keep only the expert-oracle rows whose index value lies in 37..58.
# NOTE(review): presumably the oracle index holds test case numbers — confirm.
tcs = list(range(37, 59))
orc = fd.Tc_BR_Oracles.read_oracle_expert_df()
orc_subset = orc.loc[orc.index.isin(tcs)]
#aux_functions.highlight_df(orc_subset)

OracleExpert.shape: (195, 91)


In [4]:
# Narrow the study to three test cases and a single bug report.
tcs = [13, 37, 60]
brs = [1267501]

# Read each dataset and immediately keep only the selected rows.
testcases = fd.Datasets.read_testcases_df().loc[
    lambda frame: frame['TC_Number'].isin(tcs)
]
bugreports = fd.Datasets.read_selected_bugreports_df().loc[
    lambda frame: frame['Bug_Number'].isin(brs)
]

print('tc.shape: {}'.format(testcases.shape))
print('br.shape: {}'.format(bugreports.shape))

TestCases.shape: (195, 12)
SelectedBugReports.shape: (91, 18)
tc.shape: (3, 12)
br.shape: (1, 18)


# Running BM25 Model

In [5]:
# Test case descriptions form the corpus; bug report descriptions are the queries.
corpus, test_cases_names = testcases['tc_desc'], testcases['tc_name']
query, bug_reports_names = bugreports['br_desc'], bugreports['br_name']

# Build the model from the project's BM25 hyperparameters and recover the links.
bm25_hyperp = TC_BR_Models_Hyperp.get_bm25_model_hyperp()
bm25_model = BM_25(**bm25_hyperp)
bm25_model.set_name('BM25_Model_TC_BR')
bm25_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

In [6]:
# Shape of the similarity matrix; with the 3 test cases and 1 bug report
# selected above, the recorded result is (3, 1).
bm25_model.get_sim_matrix().shape

(3, 1)

In [7]:
# Similarity matrix as exposed by the model's public accessor.
# NOTE(review): in the recorded run these values equal a min-max scaling of the
# raw scores held in `_sim_matrix_origin` (lowest score -> 0.0, highest -> 1.0);
# confirm that this is the normalization the model applies.
sim_matrix_normalized = bm25_model.get_sim_matrix()
aux_functions.highlight_df(sim_matrix_normalized)

br_name,BR_1267501_SRC
tc_name,Unnamed: 1_level_1
TC_13_TRG,0.201572
TC_37_TRG,1.0
TC_60_TRG,0.0


In [8]:
# Scores before normalization, for comparison with the normalized matrix.
# NOTE(review): this reads a private attribute (leading underscore) of the model;
# prefer a public accessor if the BM_25 class offers one.
sim_matrix_origin = bm25_model._sim_matrix_origin
aux_functions.highlight_df(sim_matrix_origin)

br_name,BR_1267501_SRC
tc_name,Unnamed: 1_level_1
TC_13_TRG,1.30565
TC_37_TRG,5.38892
TC_60_TRG,0.274786


In [9]:
# One-column view of the corpus, labelled by test case name with an empty index title.
df = corpus.to_frame(name='tc')
df.index = pd.Index(test_cases_names, name='')
#df = df.T
df.head(10)

Unnamed: 0,tc
,
TC_13_TRG,13 20160603 + 20160624 + 20161014 1 New Awesom...
TC_37_TRG,37 20160603 + 20160708 3 APZ - Async Scrolling...
TC_60_TRG,60 20160722 4 Browser Customization browser cu...


Query Vector

In [10]:
# Tokenize every query with the project's Porter-stemmer based tokenizer.
tokenizer = tok.PorterStemmerBased_Tokenizer()
# Call the tokenizer object directly instead of spelling out `__call__`.
query_vec = [tokenizer(doc) for doc in query]

# One row per bug report, one column per token position; transposed so that each
# bug report becomes a column and the first tokens can be read top to bottom.
df_q = pd.DataFrame(query_vec)
df_q.index = bug_reports_names
df_q.index.name = ''
df_q = df_q.T
df_q.head(10)

Unnamed: 0,BR_1267501_SRC
0,new
1,privat
2,brows
3,overflow
4,side
5,make
6,content
7,unscrol
8,small
9,window


Average Document Length

In [11]:
# Average document length held by the model's inner BM25 object.
# NOTE(review): the recorded 39.0 equals the mean of the three test case `dl`
# values (24, 43, 50) listed by `docs_feats_df`, so the bug report is presumably
# not counted as a corpus document — confirm.
bm25_model.bm25.avgdl

39.0

Number of documents

In [12]:
# Corpus size held by the inner BM25 object; 3 in the recorded run, matching the
# three selected test cases.
bm25_model.bm25.corpus_size

3

Document frequency of a term (number of documents containing it)

In [13]:
# `df` here is an attribute of the inner BM25 object (not a pandas DataFrame),
# looked up by the token 'apz'.
# NOTE(review): presumably this is the term's document frequency, i.e. how many
# corpus documents contain it, rather than a count inside one document — confirm
# against the BM25 implementation.
bm25_model.bm25.df['apz']

1

Most Relevant Words

In [14]:
# Most relevant words per test case; the recorded output is a list of
# (test case name, [words]) tuples.
bm25_model.mrw_tcs

[('TC_13_TRG', ['bar', 'awesom', 'launch', 'firefox', 'new', 'url']),
 ('TC_37_TRG', ['scroll', 'bar', 'key', 'use', 'make', 'page']),
 ('TC_60_TRG', ['theme', 'instal', 'abl', 'complet', 'browser', 'use'])]

In [15]:
# Per-document features for the test cases and the bug report: `mrw` holds the
# most relevant words; `dl` is presumably the document length in tokens — confirm.
bm25_model.docs_feats_df

Unnamed: 0,mrw,dl
TC_13_TRG,"[bar, awesom, launch, firefox, new, url]",24
TC_37_TRG,"[scroll, bar, key, use, make, page]",43
TC_60_TRG,"[theme, instal, abl, complet, browser, use]",50
BR_1267501_SRC,"[scroll, brows, page, use, new, firefox]",78
