# Introduction

Notebook to test a **boosting** model in the context of traceability between test cases and bug reports.

# Load Libraries and Datasets

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from modules.models_runner.tc_br_models_runner import TC_BR_Runner
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval
from modules.utils import similarity_measures as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import ExtraTreesClassifier

from imblearn.over_sampling import SMOTE, ADASYN

from xgboost import XGBClassifier

from enum import Enum
from collections import Counter

import warnings; warnings.simplefilter('ignore')

# Run All Models

## Volunteers Only Strategy

In [2]:
# Runner for the test case <-> bug report traceability models (project class).
models_runner_4 = TC_BR_Runner()
# Run each retrieval model once, in this order (the log lines printed below follow it).
# Every returned model object is later queried through get_sim_matrix().
lsi_model_4 = models_runner_4.run_lsi_model()
lda_model_4 = models_runner_4.run_lda_model()
bm25_model_4 = models_runner_4.run_bm25_model()
w2v_model_4 = models_runner_4.run_word2vec_model()

TestCases.shape: (195, 12)
SelectedBugReports.shape: (91, 18)
Running LSI Model ------
Running LDA Model -----
Running BM25 Model -----
Running W2V Model ------


# Ensemble Model

## Transform Results Matrices to Vectors

In [3]:
def transform_sim_matrix_to_sim_vec(sim_matrix_df, model_name):
    """Flatten a similarity matrix into a single-column DataFrame.

    Every cell of ``sim_matrix_df`` becomes one row labelled
    ``'<row label>_<column label>'``. Rows are emitted column by column:
    all row labels for the first column, then all for the second, and so on.

    Parameters
    ----------
    sim_matrix_df : pandas.DataFrame
        Matrix to flatten.
    model_name : str
        Name given to the single column of the result.

    Returns
    -------
    pandas.DataFrame
        Frame with one column named ``model_name`` and one row per matrix
        cell. Values keep the dtype of the matrix data.
    """
    # Build all labels and values at once; growing a DataFrame one cell at a
    # time re-allocates on every insert and is quadratic in the cell count.
    labels = ['{}_{}'.format(idx, col)
              for col in sim_matrix_df.columns
              for idx in sim_matrix_df.index]
    # order='F' walks the underlying array column by column, which matches
    # the label order produced above.
    values = sim_matrix_df.values.ravel(order='F')
    return pd.DataFrame({model_name: values}, index=labels)

# Flatten each model's similarity matrix into a one-column frame named after the model.
# All four go through the same helper, so they share the '<row>_<col>' index format
# that the merges in the following cells join on.
sim_vec_lsi = transform_sim_matrix_to_sim_vec(lsi_model_4.get_sim_matrix(), 'lsi')
sim_vec_lda = transform_sim_matrix_to_sim_vec(lda_model_4.get_sim_matrix(), 'lda')
sim_vec_bm25 = transform_sim_matrix_to_sim_vec(bm25_model_4.get_sim_matrix(), 'bm25')
sim_vec_wv = transform_sim_matrix_to_sim_vec(w2v_model_4.get_sim_matrix(), 'wv')

## Transform Vectors to DataFrame

In [4]:
# Start from an empty 'pred' column keyed by the same index as the similarity vectors.
ensemble_input_df = pd.DataFrame(columns=['pred'], index=sim_vec_lsi.index)

# Join the four model scores onto that frame, one index-aligned merge per model.
out_df = ensemble_input_df
for sim_vec in (sim_vec_lsi, sim_vec_lda, sim_vec_bm25, sim_vec_wv):
    out_df = pd.merge(out_df, sim_vec, left_index=True, right_index=True)

# Order columns by name (features first, 'pred' last) rather than by position,
# so the layout does not silently depend on the merge order.
out_df = out_df[['lsi', 'lda', 'bm25', 'wv', 'pred']]

ensemble_input_df = out_df.copy()
ensemble_input_df.head()

Unnamed: 0,lsi,lda,bm25,wv,pred
TC_13_TRG_BR_1248267_SRC,0.160448,0.126582,5.40608,0.914992,
TC_14_TRG_BR_1248267_SRC,0.175316,0.317495,11.2707,0.947215,
TC_15_TRG_BR_1248267_SRC,0.0816225,0.10915,3.63892,0.919391,
TC_16_TRG_BR_1248267_SRC,0.168325,0.113487,25.3307,0.927666,
TC_17_TRG_BR_1248267_SRC,0.195552,0.110505,21.3448,0.911958,


## Insert Oracle Data

In [5]:
# Flatten the volunteers oracle matrix with the same helper used for the model scores,
# so its index lines up with the '<row>_<col>' labels of ensemble_input_df.
orc_vec_df = transform_sim_matrix_to_sim_vec(fd.Tc_BR_Oracles.read_oracle_volunteers_df(), 'oracle')

# Drop any 'oracle' column left by a previous run of this cell: merging twice would
# otherwise produce 'oracle_x'/'oracle_y' and break the column selection below.
ensemble_input_df = pd.merge(
    ensemble_input_df.drop(columns=['oracle'], errors='ignore'),
    orc_vec_df,
    left_index=True,
    right_index=True,
)

# Explicit column order: features, then the ground truth, then the prediction slot.
ensemble_input_df = ensemble_input_df[['lsi', 'lda', 'bm25', 'wv', 'oracle', 'pred']]

ensemble_input_df.head(15)

OracleVolunteers.shape: (195, 91)


Unnamed: 0,lsi,lda,bm25,wv,oracle,pred
TC_13_TRG_BR_1248267_SRC,0.160448,0.126582,5.40608,0.914992,0.0,
TC_14_TRG_BR_1248267_SRC,0.175316,0.317495,11.2707,0.947215,0.0,
TC_15_TRG_BR_1248267_SRC,0.0816225,0.10915,3.63892,0.919391,0.0,
TC_16_TRG_BR_1248267_SRC,0.168325,0.113487,25.3307,0.927666,0.0,
TC_17_TRG_BR_1248267_SRC,0.195552,0.110505,21.3448,0.911958,0.0,
TC_18_TRG_BR_1248267_SRC,0.066099,0.107986,3.78247,0.902639,0.0,
TC_19_TRG_BR_1248267_SRC,0.118581,0.127082,7.96864,0.919673,0.0,
TC_20_TRG_BR_1248267_SRC,0.101477,0.105766,5.56926,0.918608,0.0,
TC_21_TRG_BR_1248267_SRC,0.103599,0.103972,5.7166,0.922776,0.0,
TC_22_TRG_BR_1248267_SRC,0.101755,0.186272,5.87195,0.921291,0.0,


## Split Data into Train and Test Sets and Balance the Training Set

In [6]:
# Let pandas pick concrete dtypes for any object-typed columns before modelling.
ensemble_input_df = ensemble_input_df.infer_objects()

feature_cols = ['lsi', 'lda', 'bm25', 'wv']
X = ensemble_input_df[feature_cols]
y = ensemble_input_df['oracle']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Oversample the training split only, so the test split keeps the original class ratio.
# SMOTE is seeded like the split above to make the resampled set reproducible.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
print("Train SMOTE: {}".format(sorted(Counter(y_train).items())))

# Re-wrap the resampled data as DataFrames with the feature names restored.
X_train = pd.DataFrame(X_train, columns=feature_cols)
y_train = pd.DataFrame(y_train)

(14196, 4) (14196,) (3549, 4) (3549,)
Train SMOTE: [(0.0, 13247), (1.0, 13247)]


## Discretizer Function

In [7]:
def discretizer(x, threshold=0.5):
    """Turn a score into a binary label.

    Parameters
    ----------
    x : float
        Score to discretize (the notebook passes positive-class probabilities).
    threshold : float, optional
        Decision cut-off; defaults to 0.5.

    Returns
    -------
    int
        0 when ``x < threshold``, otherwise 1 (a score equal to the threshold maps to 1).
    """
    return 0 if x < threshold else 1

## Logistic Regressor

In [8]:
# scikit-learn expects a 1-D target; y_train is a single-column frame, so flatten it
# explicitly instead of relying on an implicit (and here silenced) conversion.
ensemb_model = LogisticRegressionCV(cv=3, scoring='recall').fit(X_train, np.ravel(y_train))

# Positive-class probability for each test pair, turned into 0/1 labels.
preds = ensemb_model.predict_proba(X_test)[:,1]
preds = [discretizer(p) for p in preds]

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score  - Test Data: {:2.3%}'.format(fscore))

# One coefficient per input feature, in the column order of X_train.
print(ensemb_model.coef_)

Recall - Test Data: 59.375%
Precision - Test Data: 16.540%
F-Score  - Test Data: 25.872%
[[ 1.99360827  1.12402267  0.01687037 -0.23420447]]


# Test with Other Model Types

## XGBoost

In [9]:
# `random_state` is the scikit-learn style seed argument of XGBClassifier (replaces the
# older `seed` spelling); the target is flattened because y_train is a single-column frame.
xgb = XGBClassifier(random_state=42).fit(X_train, np.ravel(y_train))

# Positive-class probability for each test pair, turned into 0/1 labels.
preds = xgb.predict_proba(X_test)[:,1]
preds = [discretizer(p) for p in preds]

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

# One importance value per input feature, in the column order of X_train.
print(xgb.feature_importances_)

Recall - Test Data: 58.984%
Precision - Test Data: 21.821%
F-Score - Test Data: 31.857%
[0.6021871  0.16914523 0.110857   0.11781062]


## Extra Trees Classifier

In [10]:
# scikit-learn expects a 1-D target; y_train is a single-column frame, so flatten it
# explicitly instead of relying on an implicit (and here silenced) conversion.
et = ExtraTreesClassifier(random_state=42).fit(X_train, np.ravel(y_train))

# Positive-class probability for each test pair, turned into 0/1 labels.
preds = et.predict_proba(X_test)[:,1]
preds = [discretizer(p) for p in preds]

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

# One importance value per input feature, in the column order of X_train.
print(et.feature_importances_)

Recall - Test Data 58.594%
Precision - Test Data: 22.222%
F-Score - Test Data: 32.223%
[0.30758181 0.25412486 0.25001155 0.18828178]
