# Introduction - Hyperparameters Search

In this notebook we demonstrate the use of the **LDA (Latent Dirichlet Allocation)** generative statistical model as an Information Retrieval technique to recover trace links between Test Cases and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together compose a single query.
* We use the content of each test case as an entire document that must be returned for the query made.

## Import Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from sklearn.externals.joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from modules.utils import plots
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval

from modules.models.lda import LDA
from modules.models.model_hyperps import LDA_Model_Hyperp

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

## Load Dataset

In [2]:
# Read the full test-case and bug-report tables through the project dataset helpers.
test_cases_df = fd.Datasets.read_testcases_df()
bug_reports_df = fd.Datasets.read_selected_bug_reports_2_df()

# Textual descriptions: test cases form the corpus, bug reports form the queries.
corpus = test_cases_df['tc_desc']
query = bug_reports_df['br_desc']

# Identifiers used to label both sides of the recovered trace links.
test_cases_names = test_cases_df['tc_name']
bug_reports_names = bug_reports_df['br_name']

# Oracle (ground truth) relating test cases to bug reports; it is later indexed
# by test-case names on the rows and bug-report names on the columns.
orc = fd.Tc_BR_Oracles.read_oracle_expert_volunteers_df()

TestCases.shape: (207, 12)
BugReports.shape: (93, 19)
Oracle.shape: (207, 93)


### Select Subset

In [3]:
# Bug reports: keep the two Firefox versions under study and draw a reproducible
# (fixed-seed) sample of 15 reports.
bugreports_subset_df = bug_reports_df[
    (bug_reports_df.Version == '48 Branch') | (bug_reports_df.Version == '60 Branch')
].sample(15, random_state=42)

# Test cases: keep the two selected test days and draw a reproducible sample of 10.
testcases_subset_df = test_cases_df[
    (test_cases_df.TestDay.str.contains('20161014')) | (test_cases_df.TestDay.str.contains('20161028'))
].sample(10, random_state=1000)

# Hand-picked test cases that are always included on top of the random sample.
selected_testcases = ['TC_{}_TRG'.format(tc_num) for tc_num in [13, 14, 15, 16, 17, 18]]  # should link with 48 Branch
aux_tc = test_cases_df[test_cases_df.tc_name.isin(selected_testcases)]

# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
# drop_duplicates() is chained instead of applied in place so that re-running the
# cell always rebuilds tc_subset_df from its inputs; it removes hand-picked test
# cases that the random sample already contained.
tc_subset_df = pd.concat([testcases_subset_df, aux_tc]).drop_duplicates()

corpus_subset = tc_subset_df.tc_desc
query_subset = bugreports_subset_df.br_desc
testcases_names_subset = tc_subset_df.tc_name
bug_reports_names_subset = bugreports_subset_df.br_name

# Restrict the oracle to the selected test cases (rows) and bug reports (columns).
orc_subset_df = orc.loc[testcases_names_subset, bug_reports_names_subset]

print('TestCases Subset Shape: {}'.format(tc_subset_df.shape))
print('BugReports Subset Shape: {}'.format(bugreports_subset_df.shape))
print('Oracle Subset Shape: {}'.format(orc_subset_df.shape))

TestCases Subset Shape: (14, 10)
BugReports Subset Shape: (15, 12)
Oracle Subset Shape: (14, 15)


### Find The Best Model

In [4]:
# Search space: every key is an LDA_Model_Hyperp member and every value lists the
# candidates tried for that hyperparameter. Key order is kept unchanged because
# each combination's position is later used to name its model.
all_hyperparams = {
    LDA_Model_Hyperp.TOP.value: [3, 5],
    # (similarity measure, minimum threshold) pairs: all cosine entries first, then all jsd.
    LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value: [
        (measure, threshold)
        for measure in ('cosine', 'jsd')
        for threshold in (.75, .85, .95)
    ],
    LDA_Model_Hyperp.LDA_MODEL_N_COMPONENTS.value: [5, 10, 20],
    LDA_Model_Hyperp.LDA_MODEL_RANDOM_STATE.value: [2],
    LDA_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: [(1, 1), (1, 2)],
    LDA_Model_Hyperp.VECTORIZER.value: [
        TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True),
        CountVectorizer(stop_words='english'),
    ],
    LDA_Model_Hyperp.VECTORIZER_TOKENIZER.value: [
        tok.PorterStemmerBased_Tokenizer(),
        tok.LancasterStemmerBased_Tokenizer(),
        tok.WordNetBased_LemmaTokenizer(),
        tok.SnowballStemmerBased_Tokenizer(),
    ],
}

# Expand the grid into one keyword-argument set per combination.
# NOTE(review): presumably a Cartesian product of the lists above — confirm in aux_functions.
hyperparams = aux_functions.generate_params_comb_list(**all_hyperparams)

print('Performing model hyperparameters search...')

def run_model(idx, **hyperp):
    """Build, run and evaluate one LDA configuration on the subset data.

    The model is named ``LDA_Model_<idx>``, asked to recover links between the
    subset corpus (test cases) and the subset queries (bug reports), and then
    evaluated against the subset oracle. Both the model and the evaluator are
    dumped through the evaluator.

    Parameters
    ----------
    idx : position of this hyperparameter combination; used in the model name.
    **hyperp : keyword arguments forwarded unchanged to ``LDA``.

    Returns
    -------
    list
        ``[mean precision, mean recall, mean fscore, model name, top value,
        vectorizer type, similarity measure, minimum threshold,
        model dump path, evaluator dump path]``.
    """
    lda_model = LDA(**hyperp)
    lda_model.set_name('LDA_Model_{}'.format(idx))
    lda_model.recover_links(corpus_subset, query_subset, testcases_names_subset, bug_reports_names_subset)

    evaluator = m_eval.ModelEvaluator(orc_subset_df, lda_model)
    evaluator.evaluate_model()
    evaluator.dump_model()
    evaluator.dump_evaluator()

    # Aggregate scores first, in the order expected by the results columns.
    row = [
        evaluator.get_mean_precision(),
        evaluator.get_mean_recall(),
        evaluator.get_mean_fscore(),
    ]

    # Descriptive fields are read from the model held by the evaluator.
    evaluated_model = evaluator.get_model()
    row.append(evaluated_model.get_name())
    row.append(evaluated_model.get_top_value())
    row.append(evaluated_model.get_vectorizer_type())

    # The similarity setting is a (measure, minimum threshold) pair.
    sim_setting = evaluated_model.get_sim_measure_min_threshold()
    row.append(sim_setting[0])
    row.append(sim_setting[1])

    row.append(evaluated_model.get_model_dump_path())
    row.append(evaluator.get_evaluator_dump_path())
    return row

# Number the combinations so that every model gets a stable, unique name.
tasks = list(enumerate(hyperparams))

# Evaluate every combination in parallel (n_jobs=-1), one run_model call per task.
results = Parallel(n_jobs=-1, verbose=1)(
    delayed(run_model)(idx, **hp) for idx, hp in tasks
)

# One row per evaluated model; the dump-path columns are cast to str.
results_df = pd.DataFrame(
    data=results,
    columns=['precision', 'recall', 'fscore', 'model_name', 'top_value', 'vectorizer', 'metric', 'metric_value', 'model_dump', 'evaluator_dump'],
).astype(dtype={'model_dump' : str, 'evaluator_dump' : str})

Performing model hyperparameters search...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed:  1.0min finished


### Report

In [5]:
# Print the search report and keep the returned best model for the cells below.
# NOTE(review): the selection criterion lives in aux_functions.report_best_model — confirm there.
best_model = aux_functions.report_best_model(results_df)

------------ Report -------------------

Total of Analyzed Hyperparameters Combinations: 576

Best Model Hyperparameters Combination Found:

{'Measures': {'Mean FScore of LDA_Model_290': 0.2222222222222222,
              'Mean Precision of LDA_Model_290': 0.26666666666666666,
              'Mean Recall of LDA_Model_290': 0.1904761904761905},
 'Setup': [{'Name': 'LDA_Model_290'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.75)},
           {'Top Value': 5},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': 'batch',
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 5,
                       

### Save Similarity Matrix

In [6]:
# Persist the similarity matrix of the best model.
# NOTE(review): the output location is decided inside save_sim_matrix — confirm there.
best_model.save_sim_matrix()

#### Best Model for TOP 3 and 5 - Cosine 0.75

In [7]:
# Report on the search results for the 'cosine' measure at the 0.75 threshold.
# NOTE(review): per the helper's name this covers top values 3 and 5 — confirm in aux_functions.
aux_functions.print_report_top_3_and_5_v2(results_df, 0.75, 'cosine')

{'Measures': {'Mean FScore of LDA_Model_2': 0.16,
              'Mean Precision of LDA_Model_2': 0.26666666666666666,
              'Mean Recall of LDA_Model_2': 0.11428571428571428},
 'Setup': [{'Name': 'LDA_Model_2'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.75)},
           {'Top Value': 3},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': 'batch',
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 5,
                          'n_jobs': -1,
                          'n_topics': None,
                          'perp_tol': 0.1,
                          'random_state': 2,
             

#### Best Model for TOP 3 and 5 - JSD 0.75

In [8]:
# Report on the search results for the 'jsd' measure at the 0.75 threshold.
# NOTE(review): per the helper's name this covers top values 3 and 5 — confirm in aux_functions.
aux_functions.print_report_top_3_and_5_v2(results_df, 0.75, 'jsd')

{'Measures': {'Mean FScore of LDA_Model_144': 0.0,
              'Mean Precision of LDA_Model_144': 0.0,
              'Mean Recall of LDA_Model_144': 0.0},
 'Setup': [{'Name': 'LDA_Model_144'},
           {'Similarity Measure and Minimum Threshold': ('jsd', 0.75)},
           {'Top Value': 3},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': 'batch',
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 5,
                          'n_jobs': -1,
                          'n_topics': None,
                          'perp_tol': 0.1,
                          'random_state': 2,
                          'topic_word_pri