# Introduction - Oracle Version 1

In this notebook we create a version of the oracle based on the automatic setting of links between bug reports and test cases through the test days information.

* a first version of oracle (trace matrix) is created using trace links created automatically, this approach involves the concepts of strong and weak links into the oracle.

The oracle is created based on the supposed relationship between the bug report and all the features tested in given testday. This approach is problematic.

**OBS: the main problem with the automatic approach is the fact that the majority of trace links created are weak links**, which means that a bug report is related to test cases that it should not be only because they belong to the same Firefox Version.

# Load Dataset and Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np
from sklearn.externals.joblib import Parallel, delayed
from tqdm import tqdm

from modules.utils import aux_functions
from modules.utils import firefox_dataset_p1 as fd

In [2]:
bugreports_final = fd.read_bugreports_df()
testcases = fd.read_testcases_df()

BugReports.shape: (35336, 18)
TestCases.shape: (207, 12)


# Oracle

## Amount of Links (Positives and Negatives) Expected

In [3]:
print('Expected instances amount: {} x {} = {}'.format(len(bugreports_final), len(testcases), len(bugreports_final) * len(testcases)))

print('Num BRs 48 Branch: {}'.format(len(bugreports_final[bugreports_final.Version == '48 Branch'])))
print('Num BRs 49 Branch: {}'.format(len(bugreports_final[bugreports_final.Version == '49 Branch'])))
print('Num BRs 50 Branch: {}'.format(len(bugreports_final[bugreports_final.Version == '50 Branch'])))
print('Num BRs 51 Branch: {}'.format(len(bugreports_final[bugreports_final.Version == '51 Branch'])))

print('Num TCs: {}'.format(len(testcases)))
print('Num BRs: {}'.format(len(bugreports_final)))

Expected instances amount: 35336 x 207 = 7314552
Num BRs 48 Branch: 414
Num BRs 49 Branch: 356
Num BRs 50 Branch: 518
Num BRs 51 Branch: 461
Num TCs: 207
Num BRs: 35336


## Estimating Oracle Memory Size

In [4]:
ex_df = pd.DataFrame(index=testcases.tc_name, columns=bugreports_final.br_name, data=0, dtype='int8')
print(ex_df.shape)
print(ex_df.info())
ex_df.head()

(207, 35336)
<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, TC_1_TRG to TC_207_TRG
Columns: 35336 entries, BR_506297_SRC to BR_1277257_SRC
dtypes: int8(35336)
memory usage: 7.0+ MB
None


br_name,BR_506297_SRC,BR_506338_SRC,BR_506507_SRC,BR_506550_SRC,BR_506575_SRC,BR_506729_SRC,BR_506768_SRC,BR_506795_SRC,BR_506820_SRC,BR_506831_SRC,...,BR_1276070_SRC,BR_1276152_SRC,BR_1276447_SRC,BR_1276656_SRC,BR_1276818_SRC,BR_1276884_SRC,BR_1276966_SRC,BR_1277114_SRC,BR_1277151_SRC,BR_1277257_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TC_1_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_2_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_3_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_4_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_5_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Oracle Functions and Auxiliary Variables

In [27]:
list_fversion_to_testday = [('48 Branch','20160603'),('48 Branch','20160624'),('48 Branch','20160708'),
                            ('49 Branch','20160722'),('49 Branch','20160812'),('49 Branch','20160826'),
                            ('50 Branch','20160909'),('50 Branch','20160930'),('50 Branch','20161014'),
                            ('51 Branch','20161028'),('51 Branch','20161125'),('51 Branch','20170106')]

NUMBER_SUBSETS = 8

def check_link_condition(br, tc):
    for tup in [(br['Version'],tday) for tday in tc['TestDay'].split(' + ')]:
        if tup in list_fversion_to_testday:
            return True
    return False


def create_links(idx, tc_df, br_df):
    oracle_df = pd.DataFrame(columns=br_df.br_name, index=tc_df.tc_name, data=np.zeros(shape=(len(tc_df),len(br_df))), dtype='int8')
    for idx_1,br in tqdm(br_df.iterrows()):
        for idx_2,tc in tc_df.iterrows():
            if check_link_condition(br, tc):
                oracle_df.at[tc.tc_name, br.br_name] = 1
            else:
                oracle_df.at[tc.tc_name, br.br_name] = 0
    
    oracle_df.to_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/part/trace_matrix_{}.csv'.format(idx))

def create_br_dfs_list():
    list_br_dfs = []
    for i in range(0, len(bugreports_final), 5045):   # len(bugreports_final) / 5045 <= NUMBER_SUBSETS
        list_br_dfs.append(bugreports_final.iloc[i:i+5045,:])
    return list_br_dfs

def create_tc_dfs_list():
    return [testcases.copy() for i in range(NUMBER_SUBSETS)]

## Create Small Size Oracle

In [28]:
br_aux = bugreports_final[(bugreports_final.Version == '50 Branch') | (bugreports_final.Version == '60 Branch')].sample(15, random_state=42)
tc_aux = testcases[(testcases.TestDay.str.contains('20161014')) | (testcases.TestDay.str.contains('20161028'))].sample(10, random_state=1000)

br_aux[br_aux.Version == '50 Branch'].loc[:, ['Bug_Number','Version']].head(100)

Unnamed: 0,Bug_Number,Version
14902,1319983,50 Branch
14763,1318407,50 Branch
12484,1287109,50 Branch
14981,1320548,50 Branch
15367,1325288,50 Branch
12059,1280856,50 Branch


In [29]:
tc_aux[tc_aux.TestDay.str.contains('20161014')].loc[:,['TC_Number','TestDay']].head(100)

Unnamed: 0,TC_Number,TestDay
18,19,20160603 + 20160624 + 20161014
15,16,20160603 + 20160624 + 20161014
14,15,20160603 + 20160624 + 20161014


In [30]:
create_links('small', tc_aux, br_aux)

small_orc = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/part/trace_matrix_small.csv')
aux_functions.highlight_df(small_orc)

15it [00:00, 748.09it/s]


Unnamed: 0,tc_name,BR_1441532_SRC,BR_1319983_SRC,BR_1443343_SRC,BR_1464815_SRC,BR_1318407_SRC,BR_1468122_SRC,BR_1445895_SRC,BR_1459431_SRC,BR_1287109_SRC,BR_1320548_SRC,BR_1469153_SRC,BR_1325288_SRC,BR_1463768_SRC,BR_1469753_SRC,BR_1280856_SRC
0,TC_165_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,TC_19_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
2,TC_152_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,TC_16_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
4,TC_160_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,TC_169_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,TC_145_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,TC_15_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
8,TC_149_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,TC_167_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Create Entire Oracle

In [31]:
tasks = [(idx,tc_df, br_df) for idx,(tc_df,br_df) in enumerate(zip(create_tc_dfs_list(),create_br_dfs_list()))]
results = Parallel(n_jobs=7, verbose=3)(delayed(create_links)(idx,tc_df,br_df) for idx,tc_df,br_df in tasks)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 out of   8 | elapsed:  3.8min remaining:  3.8min
[Parallel(n_jobs=7)]: Done   8 out of   8 | elapsed:  3.9min finished


## Analyze Oracle Parts Created

In [32]:
oo_df_2 = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/part/trace_matrix_2.csv')
oo_df_2.set_index('tc_name', inplace=True)

print(oo_df_2.loc['TC_15_TRG', 'BR_1319983_SRC'])
print(oo_df_2.loc['TC_16_TRG', 'BR_1319983_SRC'])
print(oo_df_2.loc['TC_19_TRG', 'BR_1319983_SRC'])

1
1
1


In [33]:
oo_dfs = []
for i in range(NUMBER_SUBSETS):
    df = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/part/trace_matrix_{}.csv'.format(i))
    oo_dfs.append(df)
    print(df.shape)

(207, 5046)
(207, 5046)
(207, 5046)
(207, 5046)
(207, 5046)
(207, 5046)
(207, 5046)
(207, 22)


## Join Oracle Parts

In [34]:
oo_df = pd.DataFrame(index=testcases.tc_name, dtype='int8')
for i in range(NUMBER_SUBSETS):
    aux_df = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/part/trace_matrix_{}.csv'.format(i))
    aux_df.set_index('tc_name', inplace=True)
    oo_df = oo_df.join(aux_df)

print(oo_df.shape)
print(oo_df.info())

(207, 35336)
<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, TC_1_TRG to TC_207_TRG
Columns: 35336 entries, BR_506297_SRC to BR_1277257_SRC
dtypes: int64(35336)
memory usage: 55.8+ MB
None


## Save Oracle

In [35]:
oo_df.to_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/trace_matrix_final.csv')

# Tests

## Checking Values [0]

Analyze Entire Oracle Created

In [36]:
oo_df_full = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/trace_matrix_final.csv')
oo_df_full.set_index('tc_name', inplace=True)

print(oo_df_full.loc['TC_15_TRG', 'BR_1319983_SRC'])
print(oo_df_full.loc['TC_16_TRG', 'BR_1319983_SRC'])
print(oo_df_full.loc['TC_19_TRG', 'BR_1319983_SRC'])

1
1
1


## Checking Values [1]

FVersion to TestDay

In [37]:
ck_df = pd.DataFrame(columns=['testday','f_version','features_released','testcases_list'])
ck_df.testday = ['20160603', '20160624', '20160708', 
                 '20160722', '20160812', '20160826', 
                 '20160909', '20160930', '20161014', 
                 '20161028', '20161125', '20170106']
ck_df.f_version = ['48 Branch', '48 Branch', '48 Branch', 
                  '49 Branch', '49 Branch', '49 Branch', 
                  '50 Branch', '50 Branch', '50 Branch', 
                  '51 Branch', '51 Branch', '51 Branch' ]
ck_df.features_released = [
    "Awesome Bar Search, Awesome Bar Icons - Left, Awesome Bar Icons - Right",
    "Awesome Bar Search, Awesome Bar Icons - Left, Awesome Bar Icons - Right",
    "apz, Scrolling using different devices (wired mouse, wireless mouse, trackpad/touchpad) - where available devices",
    'context menu - exploratory testing, context menu - full functional testing, pdf viewer, browser customization',
    'windows 10 compatibility, text to speech in reader mode, text to speech on desktop',
    'webgl compatibility, exploratory testing',
    '',
    'Pointer Lock API, WebM EME support for Widevine',
    'New Awesome Bar',
    'Zoom indicator, Downloads dropmaker',
    'WebGL2,  FLAC support,  Indicator for device permissions,  Zoom Indicator',
    'WebGL2, Zoom Indicator, Flash support']

ck_df.testcases_list = ""

included = []
for i,tc in testcases.iterrows():
    for j,row in ck_df.iterrows(): 
        if row['testday'] in tc['TestDay']:
            if ck_df.at[j,'testcases_list'] == "":
                ck_df.at[j,'testcases_list'] = str(tc.TC_Number)
            else:
                ck_df.at[j,'testcases_list'] = ck_df.at[j,'testcases_list'] + " " + str(tc.TC_Number)
            if tc.TC_Number not in included:
                included.append(tc.TC_Number)

ck_df.head(20)

Unnamed: 0,testday,f_version,features_released,testcases_list
0,20160603,48 Branch,"Awesome Bar Search, Awesome Bar Icons - Left, ...",13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2...
1,20160624,48 Branch,"Awesome Bar Search, Awesome Bar Icons - Left, ...",13 14 15 16 17 18 19 20 21 22 23 24 25
2,20160708,48 Branch,"apz, Scrolling using different devices (wired ...",37 38 39 40 41 42 43 44 45 46 47
3,20160722,49 Branch,"context menu - exploratory testing, context me...",59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 7...
4,20160812,49 Branch,"windows 10 compatibility, text to speech in re...",104 105 106 107 108 109 110 111 112 113 114 11...
5,20160826,49 Branch,"webgl compatibility, exploratory testing",120 121 122 123 124
6,20160909,50 Branch,,
7,20160930,50 Branch,"Pointer Lock API, WebM EME support for Widevine",125 126 127 128 129 130 131 132 133 134 135 13...
8,20161014,50 Branch,New Awesome Bar,13 14 15 16 17 18 19 20 21
9,20161028,51 Branch,"Zoom indicator, Downloads dropmaker",142 143 144 145 146 147 148 149 150 151 152 15...


In [38]:
ck_df.to_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/TD_2_FVersion/testday_to_fversion.csv')

## Checking Values [3]

Analyze Amount of Different Test Cases by Firefox Version

In [39]:
testcases.TestDay.value_counts()

20160722                          45
20160603                          22
20161125                          22
20161028                          18
20160930                          17
20160812                          16
20161125 + 20170106               13
20181221                          12
20161028 + 20161125 + 20170106    11
20160603 + 20160708               11
20160603 + 20160624 + 20161014     9
20160826                           5
20160603 + 20160624                4
20170106                           2
Name: TestDay, dtype: int64

In [40]:
testcases.TestDay.value_counts().sum()

207

In [41]:
b48_0 = ck_df.loc[0].testcases_list.split(' ')
b48_1 = ck_df.loc[1].testcases_list.split(' ')
b48_2 = ck_df.loc[2].testcases_list.split(' ')
b48 = b48_0 + b48_1 + b48_2
print('Amount Different Test Cases - 48 Branch: {}'.format(len(set(b48))))

b49_0 = ck_df.loc[3].testcases_list.split(' ')
b49_1 = ck_df.loc[4].testcases_list.split(' ')
b49_2 = ck_df.loc[5].testcases_list.split(' ')
b49 = b49_0 + b49_1 + b49_2
print('Amount Different Test Cases - 49 Branch: {}'.format(len(set(b49))))

#b50_0 = ck_df.loc[6].testcases_list.split(' ')
b50_1 = ck_df.loc[7].testcases_list.split(' ')
b50_2 = ck_df.loc[8].testcases_list.split(' ')
b50 = b50_1 + b50_2
print('Amount Different Test Cases - 50 Branch: {}'.format(len(set(b50))))

b51_0 = ck_df.loc[9].testcases_list.split(' ')
b51_1 = ck_df.loc[10].testcases_list.split(' ')
b51_2 = ck_df.loc[11].testcases_list.split(' ')
b51 = b51_0 + b51_1 + b51_2
print('Amount Different Test Cases - 51 Branch: {}'.format(len(set(b51))))

print('Total Amount TCs Sets Union (len(set(b48) | set(b49) | set(b50) | set(b51))): {}'.format(len(set(b48) | set(b49) | set(b50) | set(b51))))

Amount Different Test Cases - 48 Branch: 46
Amount Different Test Cases - 49 Branch: 66
Amount Different Test Cases - 50 Branch: 26
Amount Different Test Cases - 51 Branch: 66
Total Amount TCs Sets Union (len(set(b48) | set(b49) | set(b50) | set(b51))): 195


## Checking Values [4]

Checking amount of positive and negative links in oracle

In [42]:
oracle = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/trace_matrix_final.csv')
oracle.set_index('tc_name', inplace=True, drop=True)

print(oracle.shape)

(207, 35336)


In [43]:
num_pos = 0
num_neg = 0

for i in range(len(testcases)):
    counts = oracle.iloc[i, :].value_counts()
    if len(counts) == 2:
        num_neg = num_neg + counts[0]
        num_pos = num_pos + counts[1]
    else:
        num_neg = num_neg + counts[0]
    
print('Num Positive Links: {}'.format(num_pos))
print('Num Negative Links: {}'.format(num_neg))

Num Positive Links: 86434
Num Negative Links: 7228118


## Checking Values [5]

Checking subset of oracle

In [44]:
oracle = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/firefox_v1/trace_matrix_final.csv')
oracle.set_index('tc_name', inplace=True, drop=True)

print(oracle.shape)

(207, 35336)


In [45]:
bugreports_subset_df = bugreports_final[(bugreports_final.Version == '50 Branch') | (bugreports_final.Version == '60 Branch')].sample(15, random_state=42)
bugreports_subset_df[bugreports_subset_df.Version == '50 Branch'].loc[:, ['Bug_Number','Version']].head(100)

Unnamed: 0,Bug_Number,Version
14902,1319983,50 Branch
14763,1318407,50 Branch
12484,1287109,50 Branch
14981,1320548,50 Branch
15367,1325288,50 Branch
12059,1280856,50 Branch


In [46]:
testcases_subset_df = testcases[(testcases.TestDay.str.contains('20161014')) | (testcases.TestDay.str.contains('20161028'))].sample(10, random_state=1000)

selected_testcases = ['TC_{}_TRG'.format(tc_num) for tc_num in [13,14,15,16,17,18,19,20,21]]  # should link with 50 Branch
aux_tc = testcases[testcases.tc_name.isin(selected_testcases)]

tc_subset_df = testcases_subset_df.append(aux_tc)
tc_subset_df.drop_duplicates(inplace=True)

tc_subset_df[tc_subset_df.TestDay.str.contains('20161014')].loc[:,['TC_Number','TestDay']].head(100)

Unnamed: 0,TC_Number,TestDay
18,19,20160603 + 20160624 + 20161014
15,16,20160603 + 20160624 + 20161014
14,15,20160603 + 20160624 + 20161014
12,13,20160603 + 20160624 + 20161014
13,14,20160603 + 20160624 + 20161014
16,17,20160603 + 20160624 + 20161014
17,18,20160603 + 20160624 + 20161014
19,20,20160603 + 20160624 + 20161014
20,21,20160603 + 20160624 + 20161014


In [47]:
testcases_names_subset = tc_subset_df.tc_name
bug_reports_names_subset = bugreports_subset_df.br_name
orc_subset_df = oracle.loc[testcases_names_subset, bug_reports_names_subset]

print('TestCases Subset Shape: {}'.format(tc_subset_df.shape))
print('BugReports Subset Shape: {}'.format(bugreports_subset_df.shape))
print('Oracle Subset Shape: {}'.format(orc_subset_df.shape))

TestCases Subset Shape: (16, 12)
BugReports Subset Shape: (15, 18)
Oracle Subset Shape: (16, 15)


In [48]:
aux_functions.highlight_df(orc_subset_df)

Unnamed: 0_level_0,BR_1441532_SRC,BR_1319983_SRC,BR_1443343_SRC,BR_1464815_SRC,BR_1318407_SRC,BR_1468122_SRC,BR_1445895_SRC,BR_1459431_SRC,BR_1287109_SRC,BR_1320548_SRC,BR_1469153_SRC,BR_1325288_SRC,BR_1463768_SRC,BR_1469753_SRC,BR_1280856_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TC_165_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_19_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
TC_152_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_16_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
TC_160_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_169_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_145_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_15_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
TC_149_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_167_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
