# Introduction

Notebook to test a **boosting** model in the context of traceability between features and bug reports.

# Load Libraries and Datasets

In [2]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from modules.models_runner.feat_br_models_runner import Feat_BR_Models_Runner
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval
from modules.utils import similarity_measures as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegressionCV

from imblearn.over_sampling import SMOTE, ADASYN

from enum import Enum
from collections import Counter

import warnings; warnings.simplefilter('ignore')

# Run All Models

## Volunteers Only Strategy

In [3]:
models_runner_4 = Feat_BR_Models_Runner()
lsi_model_4 = models_runner_4.run_lsi_model()
lda_model_4 = models_runner_4.run_lda_model()
bm25_model_4 = models_runner_4.run_bm25_model()
w2v_model_4 = models_runner_4.run_word2vec_model()

Features.shape: (19, 8)
SelectedBugReports.shape: (91, 18)
Running LSI model -----
Running LDA model -----
Running BM25 model -----
Running W2V model -----


# Ensemble Model

## Tranform Results Matrices to Vectors

In [4]:
def transform_sim_matrix_to_sim_vec(sim_matrix_df, model_name):
    sim_vec_df = pd.DataFrame(columns=[model_name])
    for col in sim_matrix_df.columns:
        for idx, row in sim_matrix_df.iterrows():
            artfs_names = '{}_{}'.format(idx, col)
            sim_vec_df.at[artfs_names, model_name] = row[col]
    return sim_vec_df

sim_vec_lsi = transform_sim_matrix_to_sim_vec(lsi_model_4.get_sim_matrix(), 'lsi')
sim_vec_lda = transform_sim_matrix_to_sim_vec(lda_model_4.get_sim_matrix(), 'lda')
sim_vec_bm25 = transform_sim_matrix_to_sim_vec(bm25_model_4.get_sim_matrix(), 'bm25')
sim_vec_wv = transform_sim_matrix_to_sim_vec(w2v_model_4.get_sim_matrix(), 'wv')

## Transform Vectors to DataFrame

In [5]:
ensemble_input_df = pd.DataFrame(columns=['pred'], index=sim_vec_lsi.index)

out_df = pd.merge(ensemble_input_df, sim_vec_lsi, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_lda, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_bm25, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_wv, left_index=True, right_index=True)

new_order = [1,2,3,4,0]
out_df = out_df[out_df.columns[new_order]]

ensemble_input_df = out_df.copy()
ensemble_input_df.head()

Unnamed: 0,lsi,lda,bm25,wv,pred
new_awesome_bar_1248267,0.319831,0.819873,34.8719,0.936612,
windows_child_mode_1248267,0.0576609,0.38372,13.1232,0.901049,
apz_async_scrolling_1248267,0.00131195,0.555395,0.431554,0.879576,
browser_customization_1248267,0.0284819,0.084284,1.94456,0.808218,
pdf_viewer_1248267,0.00595923,0.181913,0.908809,0.808188,


## Insert Oracle Data

In [6]:
orc_vec_df = transform_sim_matrix_to_sim_vec(fd.Feat_BR_Oracles.read_feat_br_volunteers_df().T, 'oracle')

ensemble_input_df = pd.merge(ensemble_input_df, orc_vec_df, left_index=True, right_index=True)

new_order = [0,1,2,3,5,4]
ensemble_input_df = ensemble_input_df[ensemble_input_df.columns[new_order]]

ensemble_input_df.head(15)

Feat_BR Volunteers Matrix shape: (91, 19)


Unnamed: 0,lsi,lda,bm25,wv,oracle,pred
new_awesome_bar_1248267,0.319831,0.819873,34.8719,0.936612,0.0,
windows_child_mode_1248267,0.0576609,0.38372,13.1232,0.901049,0.0,
apz_async_scrolling_1248267,0.00131195,0.555395,0.431554,0.879576,0.0,
browser_customization_1248267,0.0284819,0.084284,1.94456,0.808218,0.0,
pdf_viewer_1248267,0.00595923,0.181913,0.908809,0.808188,0.0,
context_menu_1248267,0.974364,0.817121,66.323,0.939618,1.0,
w10_comp_1248267,0.190952,0.820732,16.6925,0.903249,0.0,
tts_in_desktop_1248267,0.0197604,0.0569175,1.71863,0.889628,0.0,
tts_in_rm_1248267,0.0262042,0.0571287,1.87935,0.887032,0.0,
webgl_comp_1248267,0.0180232,0.823955,1.70011,0.79095,0.0,


## Balancing Dataset and Split Data on Train and Test

In [7]:
ensemble_input_df = ensemble_input_df.infer_objects()

X = ensemble_input_df[['lsi','lda','bm25','wv']]
y = ensemble_input_df['oracle']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print("Train SMOTE: {}".format(sorted(Counter(y_train).items())))

X_train = pd.DataFrame(X_train, columns=['lsi','lda','bm25','wv'])
y_train = pd.DataFrame(y_train)

(1383, 4) (1383,) (346, 4) (346,)
Train SMOTE: [(0.0, 1310), (1.0, 1310)]


## Discretizer Function

In [8]:
def discretizer(x):
    return 0 if x < 0.5 else 1

## Logistic Regressor

In [9]:
ensemb_model = LogisticRegressionCV(cv=3, scoring='recall').fit(X_train, y_train)

preds = ensemb_model.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))
    
precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score  - Test Data: {:2.3%}'.format(fscore))

print(ensemb_model.coef_)

Recall - Test Data: 60.000%
Precision - Test Data: 14.286%
F-Score  - Test Data: 23.077%
[[ 2.97455201 -0.07044734  0.0354807  -7.0425815 ]]


# Test with Other Model Types

## XGBoost

In [11]:
from xgboost import XGBClassifier

xgb = XGBClassifier(seed=42).fit(X_train, y_train)

preds = xgb.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

print(xgb.feature_importances_)

Recall - Test Data: 55.000%
Precision - Test Data: 15.942%
F-Score - Test Data: 24.719%
[0.3989541  0.21247967 0.19149093 0.19707526]


## Extra Trees Classifier

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)

preds = et.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

print(et.feature_importances_)

Recall - Test Data 30.000%
Precision - Test Data: 10.909%
F-Score - Test Data: 16.000%
[0.29441487 0.2049435  0.27618138 0.22446025]
