# Load all packages

In [1]:
# Install the latest Tensorflow version.
# !pip3 install --upgrade tensorflow-gpu
# Install TF-Hub.
# !pip3 install tensorflow-hub
# !pip3 install seaborn

In [2]:
from absl import logging
import sys
import tensorflow as tf
import tensorflow_hub as hub

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

import bisect
from numba import jit 

import gensim
import struct
import binascii
import bisect
import nltk
from nltk.tokenize import word_tokenize
from collections import OrderedDict

import time

from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
sys.path.append('../utility')
from text_preprocessing import *
from word2vec_def_ref import *
from transformer_def_ref import *
from tfidf_def_ref import *

In [4]:
DATA_DIR = '../data/'
INPUT_DATA_FILE = 'angularjs_processed_withimagetext.csv'
OUTPUT_DATA_FILE = 'angularjs_w2v.csv'
MODEL_DIR = '../models/'
MODEL_FILES = ['glove-wiki-gigaword-300.bin','word2vec-google-news-300.bin','fasttext-wiki-news-subwords-300.bin']
W2V_COLS = ['W2V_glove','W2V_google','W2V_fasttext']
VECTOR_SIZE = 300

INPUT_DATA_DIR = '../data/model_perf/'
INPUT_DATA_FILE = 'angularjs_w2v.csv'

KEYS = ['W2V_glove','W2V_google','W2V_fasttext']
MAX_SIMILAR = 5
VECTOR_SIZE = 300

In [5]:
# for transformer
TR_DATA_DIR = '../data/model_perf/'
TR_INPUT_FILE = 'angularjs_processed_withimagetext.csv'
TR_OUTPUT_FILE = 'transformer_vec.csv'
TR_OUTPUT_DATA_FILE ='transformer_cos_sim.csv'
TR_MAX_SIMILAR = 5
TR_VECTOR_SIZE = 768
TR_model = 'Transformer_vector'

In [6]:
TFIDF_DATA_DIR = '../data/model_perf/'
TFIDF_INPUT_FILE = 'angularjs_processed_withimagetext.csv'
TFIDF_OUTPUT_FILE = 'tfidf_vec.csv'
TFIDF_OUTPUT_DATA_FILE ='tfidf_cos_sim.csv'
TFIDF_MAX_SIMILAR = 5
TFIDF_VECTOR_SIZE = 300
TFIDF_MODEL = 'tfidf_vector'

# Define functions

In [7]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [8]:
#@jit(nopython=True)
def find_sim_score(df,modelname,vector='default',issue_to_compare=4555):
    temp_dict ={}
    model_output_dic ={}
#     issue_to_compare=4555
   
    if modelname in ('W2V_glove','W2V_google','W2V_fasttext','Transformer_vector','tfidf_vector'):
        vectdict = {}
        zeroVec = list(np.zeros((VECTOR_SIZE,), dtype="float32"))
        for indx in df.index:
            binval = binascii.a2b_base64(df[modelname][indx])
            val = np.array(struct.unpack('f'*VECTOR_SIZE, binval))
            if list(val) == zeroVec:
                continue
            vectdict[df['Number'][indx]] = val

        for key in vectdict.keys():
            if key != issue_to_compare:
#                 print(vectdict[Issue_to_Compare].shape)
#                 print(vectdict[key].shape)
                sim = cosine_similarity_numba(vectdict[issue_to_compare], vectdict[key])
                temp_dict.update({key: round(sim,5)})
                
    else:
        index_lst = df.index[df['Number'].isin([issue_to_compare])].tolist()
#         print(index_lst)

        for key ,vec in zip(df.Number,vector):

                if key != issue_to_compare:
                    if modelname == 'tfidf':
                        sim = cosine_similarity(tfidf_vectors[index_lst[0]], vec)
                        temp_dict.update({key: round(sim[0][0],5)})
                    else:
                        sim = cosine(vector[index_lst[0]],vec)
                        temp_dict.update({key: round(sim,5)})
     
    model_output_dic.update({modelname:temp_dict})
    return(model_output_dic)

In [9]:
# model_comparison_lst : list of dic of dic
def create_consolidated_df(model_comparison_lst):
    df_consolidated = None
    for dic in model_comparison_lst:
        if df_consolidated is None:
            df_consolidated = pd.DataFrame.from_dict(dic, orient="index").reset_index()
        else:
            temp_df = pd.DataFrame.from_dict(dic, orient="index").reset_index()
            df_consolidated = pd.concat([df_consolidated, temp_df])

    df_final_size = df_consolidated.shape[1]-1
#     print(df_final_size)
    return(df_consolidated,df_final_size)

In [10]:
def create_empty_df(size,cols = ['models']):
    #create empty dataframe for storing sorted defects for each model
    for i in range(size):
        cols = cols + ['defectnum_' + str(i)] + ['defect_score_' + str(i)]
    df = pd.DataFrame()
    df[cols] = None
    return (df)

In [11]:
def create_final_sorted_consolidated_df(df_size,model_comparison_lst):
    df = create_empty_df(df_size)
    models_lst =[]

    for row in range(len(model_comparison_lst)):
        for key in model_comparison_lst[row].keys():
            models_lst.append(key)
    print(models_lst)

    df['models'] = models_lst
    
    for indx in (df.index):
    #     print(indx)
        for key, dict in model_comparison_lst[indx].items():

            simScore = []
            simNumber = []

            models_lst.append(key)
    #         print(f'original dict : {dict}')
            d_sorted_by_value = OrderedDict(sorted(dict.items(), key=lambda x: x[1],reverse = True))
    #         print(len(d_sorted_by_value))

            for k, v in d_sorted_by_value.items():
                simNumber.append(k)
                simScore.append(v)

            for i in range(len(d_sorted_by_value)):
                df['defectnum_'  + str(i)][indx] = simNumber[i]
                df['defect_score_' + str(i)][indx] = simScore[i]
            print(d_sorted_by_value)
    return(df)

# Load and create filtered Dataset

In [12]:
df = pd.read_csv('../data/angularjs_processed_withimagetext.csv')
columns = ['Title_new','Description_new','AttachmentText_new','Comments_new']
# remove NAN 
count_nan_values(df,columns,"")
# df['Number'] = pd.to_numeric(df['Number'])

df[columns].tail(3)

Title_new has 88 NAN values
Description_new has 1604 NAN values
AttachmentText_new has 16626 NAN values
Comments_new has 1451 NAN values

 ------- 

Title_new has 0 NAN values
Description_new has 0 NAN values
AttachmentText_new has 0 NAN values
Comments_new has 0 NAN values


Unnamed: 0,Title_new,Description_new,AttachmentText_new,Comments_new
17042,script tag version compatible jquery,note post jusing jquery execute script tag partial template try stripe embed checkout feature script tag require use embed form stripe server example script tag execute jquery investigate issue jquery execute script tag condition meet evaluate executable script document insertionfor hasscript node scripts node globaleval doc node,improve doc view source function module ng overview wrap raw dom element html string jquery element jquery available alias jquery function jquery available delegate angularbuilt subset jquery call jquery lite jqlite jqlite tiny api compatible subset jquery allow angular manipulate dom cross browser compatible way jqlite implement commonly need functionality goal small footprint use jquery simply ensure load angular js file use ngjq directive specify jqlite jquery use specific version jquery multiple version exist page note element reference angular wrap jquery jqlite element argument directivecompile link function raw dom reference note mind function element tag css selector lookup tag try instead document use standard dom apis document queryselectorall angularjqlite jqlite provide follow jquery method addclass support function argument append attr support function parameter bind deprecate use support namespace selector eventdata child support selector clone content xc status type initiator size time waterfall mas kb mst oth heanetinn ecuman â€œr kb ms tent injeciyo ms jsfi disk cach gppongmhjkpfnbhagpmijfkannfbllamg js ok sorip script disk cache ms xc,duplicate
17043,jquery conflict browser script tagging,rcleanscript problem condition evaluate doc node believe issue introduce animation create document actually contain template ng include currentelement null element response time jquery try evaluate script decide document actually contain element currentelement null element document contain element dom,improve doc view source function module ng overview wrap raw dom element html string jquery element jquery available alias jquery function jquery available delegate angularbuilt subset jquery call jquery lite jqlite jqlite tiny api compatible subset jquery allow angular manipulate dom cross browser compatible way jqlite implement commonly need functionality goal small footprint use jquery simply ensure load angular js file use ngjq directive specify jqlite jquery use specific version jquery multiple version exist page note element reference angular wrap jquery jqlite element argument directivecompile link function raw dom reference note mind function element tag css selector lookup tag try instead document use standard dom apis document queryselectorall angularjqlite jqlite provide follow jquery method addclass support function argument append attr support function parameter bind deprecate use support namespace selector eventdata child support selector clone content xc status type initiator size time waterfall mas kb mst oth heanetinn ecuman â€œr kb ms tent injeciyo ms jsfi disk cach gppongmhjkpfnbhagpmijfkannfbllamg js ok sorip script disk cache ms xc,
17044,java script fail unstable version jquery critical issue report,note post jusing jquery execute script tag partial template,improve doc view source function module ng overview wrap raw dom element html string jquery element jquery available alias jquery function jquery available delegate angularbuilt subset jquery call jquery lite jqlite jqlite tiny api compatible subset jquery allow angular manipulate dom cross browser compatible way jqlite implement commonly need functionality goal small footprint use jquery simply ensure load angular js file use ngjq directive specify jqlite jquery use specific version jquery multiple version exist page note element reference angular wrap jquery jqlite element argument directivecompile link function raw dom reference note mind function element tag css selector lookup tag try instead document use standard dom apis document queryselectorall angularjqlite jqlite provide follow jquery method addclass support function argument append attr support function parameter bind deprecate use support namespace selector eventdata child support selector clone content xc status type initiator size time waterfall mas kb mst oth heanetinn ecuman â€œr kb ms tent injeciyo ms jsfi disk cach gppongmhjkpfnbhagpmijfkannfbllamg js ok sorip script disk cache ms xc,thank pull request look contribution google open source project look pull request need sign contributor license agreement cla information open cla check pull request


In [13]:
# defects =[17180,9996,14427,4749,5021,10450,8490,9954,12926,10421,1646,7813,11416,5706,3558,9027,7391,16877,
#           7134,15887,12187,12496,6548,9092,2258]
defects =['4555','4598','4596','4394','4462','19100','19101','19102','19103','7134']
# df_new = df[df['Number'].isin([defects])] 
    
df_new = df[df['Number'].isin(defects)]
df_new.shape

(10, 21)

In [14]:
df_new.shape

(10, 21)

In [15]:
df_new['IssueText'] = df_new[columns].agg(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['IssueText'] = df_new[columns].agg(' '.join, axis=1)


In [16]:
df_new.columns


Index(['Unnamed: 0', 'Id', 'Number', 'Title', 'LabelsNames',
       'LabelDescriptions', 'State', 'CreatedDate', 'ClosedDate', 'IsDraft',
       'IssueType', 'Description', 'StateReason', 'AttachmentText',
       'Duplicate', 'Comments', 'SimilarityScore', 'Title_new',
       'AttachmentText_new', 'Description_new', 'Comments_new', 'IssueText'],
      dtype='object')

In [17]:
df_new.head(2)

Unnamed: 0.1,Unnamed: 0,Id,Number,Title,LabelsNames,LabelDescriptions,State,CreatedDate,ClosedDate,IsDraft,...,StateReason,AttachmentText,Duplicate,Comments,SimilarityScore,Title_new,AttachmentText_new,Description_new,Comments_new,IssueText
9912,9912,31646776,7134,Prevent Ghost Cliking for old browser ( like android 2.3 ) with ngTouch,type: bug;frequency: low;cla: yes;component: ngTouch;needs: test;severity: broken expected use;,,closed,2014-04-16T14:35:02Z,2016-01-27T19:25:38Z,N,...,,,,Thanks for the PR! Please check the items below to help us merge this faster. See the [contributing docs](https://github.com/angular/angular.js/blob/master/CONTRIBUTING.md#contributing-to-angularjs) for more information.- [ ] Uses the issue template ([#7134](http://issuetemplate.com/#/angular/angular.js/issue/7134))If you need to make changes to your pull request you can update the commit with git commit --amend.Then update the pull request with git push -f.Thanks again for your help!#;;+1#;;Im sorry but I wasnt able to verify your Contributor License Agreement (CLA) signature. CLA signature is required for any code contributions to AngularJS.Please [sign our CLA](https://github.com/angular/angular.js/blob/master/CONTRIBUTING.md#signing-the-cla) and _ensure that the CLA signature email address and the email address in this PRs commits match_. If you signed the CLA as a corporation please let us know the companys name.Thanks a bunch! PS: If you signed the CLA in the past then most likely the email addresses dont match. Please sign the CLA again or update the email address in the commit of this PR. PS2: If you are a Googler please sign the CLA as well to simplify the CLA verification process.#;;CLA signature verified! Thank you!Someone from the team will now triage your PR and it will be processed based on the determined priority (doc updates and fixes with tests are prioritized over other changes).#;;,,prevent ghost clike old browser android ngtouch,,request type bughow reproduce bug android ngtouch exemple ngtouch ngtouchpatchedngtouchimpact smallcomplexity smallthis issue relate detailed description comment prevent ghost clike old browser support stopimmediatepropagation android ghost click appear click element select element bug android ngtouch exemple ngtouch ngtouchpatche,thank pr check item help merge fast contribute doc information use issue template angular need change pull request update commit git commit update pull request git push help m sorry able verify contributor license agreement cla signature cla signature require code contribution sign cla ensure cla signature email address email address prs commits match sign cla corporation let know companys bunch ps sign cla past likely email address match sign cla update email address commit pr ps googler sign cla simplify cla verification process cla signature verify thank team triage process base determined priority doc update fix test prioritize change,prevent ghost clike old browser android ngtouch request type bughow reproduce bug android ngtouch exemple ngtouch ngtouchpatchedngtouchimpact smallcomplexity smallthis issue relate detailed description comment prevent ghost clike old browser support stopimmediatepropagation android ghost click appear click element select element bug android ngtouch exemple ngtouch ngtouchpatche thank pr check item help merge fast contribute doc information use issue template angular need change pull request update commit git commit update pull request git push help m sorry able verify contributor license agreement cla signature cla signature require code contribution sign cla ensure cla signature email address email address prs commits match sign cla corporation let know companys bunch ps sign cla past likely email address match sign cla update email address commit pr ps googler sign cla simplify cla verification process cla signature verify thank team triage process base determined priority doc update fix test prioritize change
12447,12447,21435391,4598,1.2.0-rc3 regression - $animate.enter uses raw DOM manipulation leaving script blocks unexecuted even if jQuery is included,resolution: duplicate;,,closed,2013-10-23T06:12:41Z,2014-01-09T00:41:12Z,N,...,completed,,Y,Duplicate of #4555.#;;,,regression use raw dom manipulation leave script block unexecuted jquery include,,raw dom manipulation attach dom fragment instead jqlite jquery available enter function element parent var afternode var parentnode parent parent afternode undefined pass null var afternextsible afternode null foreach element function node node afternextsibling timeout false cause problem case rely understand jquery specific behavior automatic execution script block attach document nasty surprise change function use jqlite jquery wrapper restore behavior previous version,duplicate,regression use raw dom manipulation leave script block unexecuted jquery include raw dom manipulation attach dom fragment instead jqlite jquery available enter function element parent var afternode var parentnode parent parent afternode undefined pass null var afternextsible afternode null foreach element function node node afternextsibling timeout false cause problem case rely understand jquery specific behavior automatic execution script block attach document nasty surprise change function use jqlite jquery wrapper restore behavior previous version duplicate


In [18]:
df_new.to_csv('../data/model_perf/original_filtered_defects.csv', index=False, encoding='utf-8')

In [19]:
df2 = pd.read_csv('../data/model_perf/original_filtered_defects.csv', encoding='utf-8')

In [20]:
df2 = df2[['Number','IssueText']]
df2.reset_index(drop=True, inplace=True)

In [21]:
df2.head(2)

Unnamed: 0,Number,IssueText
0,7134,prevent ghost clike old browser android ngtouch request type bughow reproduce bug android ngtouch exemple ngtouch ngtouchpatchedngtouchimpact smallcomplexity smallthis issue relate detailed description comment prevent ghost clike old browser support stopimmediatepropagation android ghost click appear click element select element bug android ngtouch exemple ngtouch ngtouchpatche thank pr check item help merge fast contribute doc information use issue template angular need change pull request update commit git commit update pull request git push help m sorry able verify contributor license agreement cla signature cla signature require code contribution sign cla ensure cla signature email address email address prs commits match sign cla corporation let know companys bunch ps sign cla past likely email address match sign cla update email address commit pr ps googler sign cla simplify cla verification process cla signature verify thank team triage process base determined priority doc update fix test prioritize change
1,4598,regression use raw dom manipulation leave script block unexecuted jquery include raw dom manipulation attach dom fragment instead jqlite jquery available enter function element parent var afternode var parentnode parent parent afternode undefined pass null var afternextsible afternode null foreach element function node node afternextsibling timeout false cause problem case rely understand jquery specific behavior automatic execution script block attach document nasty surprise change function use jqlite jquery wrapper restore behavior previous version duplicate


In [22]:
df2.to_csv('../data/model_perf/filtered_defects.csv', index=False, encoding='utf-8')

# create vectors from different models

## word2vec vectorization

In [23]:
df = pd.read_csv('../data/model_perf/filtered_defects.csv')

In [24]:
df.shape

(10, 2)

In [25]:
df_word2vec = calcVectors(df,W2V_COLS,MODEL_DIR,MODEL_FILES)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db[W2V_COLS[i]][indx] = valStr


In [26]:
df_word2vec.to_csv('../data/model_perf/word2vec_filtered_defects.csv', index=False, encoding='utf-8')


## universal encoder - vectorization

In [27]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
uni_enc_model = hub.load(module_url)
print (f'module {module_url} loaded' )

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [28]:
def uni_enc_model_embedding(input):
  return uni_enc_model(input)

In [29]:
# Reduce logging output.
logging.set_verbosity(logging.ERROR)
combined_IssueText_embeddings = uni_enc_model_embedding(df['IssueText'])
combined_IssueText_embeddings.shape

TensorShape([10, 512])

In [30]:
for i, combined_IssueText_embedding in enumerate(np.array(combined_IssueText_embeddings).tolist()[0:2]):
  print("defect : {} ".format(df['Number'][i]))
#   print("IssuesText : {} ".format(df2['IssueText'][i]))
  print("Embedding size: {}".format(len(combined_IssueText_embedding)))
  print(f'Embedding embedding snippet: {[", ".join(str(x) for x in combined_IssueText_embedding[:2])]} \n')

defect : 7134 
Embedding size: 512
Embedding embedding snippet: ['-0.030751705169677734, -0.055367063730955124'] 

defect : 4598 
Embedding size: 512
Embedding embedding snippet: ['0.004607335664331913, -0.05738189071416855'] 



## tfidf - vectorization

In [31]:
# tfidfvectoriser=TfidfVectorizer()
# tfidf_vectors= tfidfvectoriser.fit_transform(df_new['IssueText'])

In [32]:
df = pd.read_csv('../data/model_perf/filtered_defects.csv')
df.columns

Index(['Number', 'IssueText'], dtype='object')

In [33]:
df_tfidf = calculate_tfidf_vectors(df,TFIDF_MODEL,TFIDF_VECTOR_SIZE)

sanitizing Vectors (zero vector check)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db[model][i] = valStr


In [34]:
df_tfidf.head(2)

Unnamed: 0,Number,tfidf_vector
0,7134,AAAAAAAAAAAAAAAAAAAAAAWCbj4AAAAAAAAAAAAAAAAAAAAAAAAAAOXASj0AAAAAAAAAAAAAAAAAAAAABYJuPo+G6DwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABStZ09UrWdPQWCbj0AAAAA5cBKPQAAAAAAAAAAAAAAAAAAAADlwMo95cBKPQAAAAAAAAAAAAAAAB5x/T4Fgu49BYLuPQAAAAAAAAAA5cBKPQAAAACD4TI+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOXASj3lwEo95cBKPQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACPhmg9AAAAAAAAAAAAAAAAAAAAAI+GaD0Fgm4+AAAAAAAAAABStR09AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWC7j0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApmIxPQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAg+EyPgWC7j0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAUrWdPQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADlwEo9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADNwFM9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA5cBKPQAAAADlwEo9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABYLuPQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABStR09AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQxGVPgAAAAAAAAAAAAAAAAAAAAAFgu49AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA5cDKPeXAyj0AAAAABYLuPQAAAAAFgu495cDKPQAAAAAAAAAAAAAAAAAAAAAAAAAA5cBKPQAAAAD9CQU+UrUdPQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFgm49AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHnF9PgWCbj4Fgm49AAAAAAAAAAAAAAAABYJuPQWCbj0AAAAAAAAAAAAAAAAFgm49AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFK1HT0AAAAAj4boPAAAAADlwEo9qwmAPQAAAAAAAAAAAAAAAAAAAAAFgm49AAAAAAAAAACGoQ09AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFgm4+AAAAAOcrwTwAAAAAAAAAAAAAAAAFgm49BYLuPQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
1,4598,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAH5P9T1+T/U9nvu3PgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAH5PdT4AAAAAirCDPU6JUD4AAAAAAAAAAAAAAABOiVA+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE6J0D1OidA9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWihvPc+IRT7VNKI9AAAAAFoo7z0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE6J0D0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAfk/1PQAAAAAAAAAAQ14zPgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIqwgz0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADVNKI9AAAAAAAAAAAAAAAA1TQiPoqwgz4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAH5PdT4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKarET4AAAAA1TQiPgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACe+7c+AAAAAAAAAAAAAAAAAAAAAAAAAACKsIM9AAAAAAAAAAAAAAAAAAAAAAAAAADVNCI+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB+T/U9AAAAAAAAAAAAAAAAAAAAAOzK2T0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADVNKI9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB+T/U9AAAAAH5P9T1+T/U9AAAAAAAAAAAAAAAAAAAAAFmuxj0AAAAAAAAAAPpmnD4AAAAAAAAAANU0oj0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAH5P9T0AAAAAAAAAAAAAAAAAAAAA


## transformer vectorization

In [35]:
df = pd.read_csv('../data/model_perf/filtered_defects.csv')
df.head(1)

Unnamed: 0,Number,IssueText
0,7134,prevent ghost clike old browser android ngtouch request type bughow reproduce bug android ngtouch exemple ngtouch ngtouchpatchedngtouchimpact smallcomplexity smallthis issue relate detailed description comment prevent ghost clike old browser support stopimmediatepropagation android ghost click appear click element select element bug android ngtouch exemple ngtouch ngtouchpatche thank pr check item help merge fast contribute doc information use issue template angular need change pull request update commit git commit update pull request git push help m sorry able verify contributor license agreement cla signature cla signature require code contribution sign cla ensure cla signature email address email address prs commits match sign cla corporation let know companys bunch ps sign cla past likely email address match sign cla update email address commit pr ps googler sign cla simplify cla verification process cla signature verify thank team triage process base determined priority doc update fix test prioritize change


In [36]:
df_trans = create_transformer_vec(df,TR_DATA_DIR,TR_OUTPUT_FILE)

total time taken 2.7602360248565674
(10, 768)


In [37]:
df_trans.head(2)

Unnamed: 0,Number,Transformer_vector
0,7134,SIDBvkdRoD8iimY/eHwyvkTncj/p0tO9yBOSP5v/or7xDG8++pC7vhMpAL/POWU/V66evvhWwL1q1lG/Ki30PoPEv77IZjg9GGVZPfy+0L3oPMq+4yQgvxtolD7OmRI+dIRVPx3KCD/A26g+tkohP+/01b+PsWs+qHAgvwIftD7LnCK+tMAav1r9Qr8HPZM/jaTyPZS1iD0AHko/JPugPzS0bD2wkHq+5EUfP244FT9RryC/LUAzvj0kFb/5QRg/N3PSPmPs7L5YWYk+9pCaP/ZYqz/agyw+F3VivmaOGz17Bf4+REmyv4bbHL85QrC+3HS4v+DHwz4+p6Y+Jay6PpxSpb90qLe+5i3BvQRelD4Ki5q/Ca1QvoK85T4SJ4K/cC6GvwpwwD4PPQE/+RCgPX2zl74XiCc/uZtfP9Db7z6Jprk8dyzIPKtRwj3K4hS/zXdhP9A0xDs1848/YBoEP2p4h79r5TA/U3lUPzfTsz7ZIyE/RoOPvlWH3b04y9G9ulCTvo/FGz2nAmA9GMlUP6RMvb2KQm+9uRr2viDwOT+E+Du//k5PvjV2YD1fqx6/NkiRv6+Eqz0EEIE/cZpuPwa5pL4go3M983dWPnZ1fL/5x7i/fiE7PvI4fT9gz6U+vvwpPx7For7+3Te+/DgzvlY4iD8hbKY/bozKPsYs5j4WKZu+suBQvUfWBD9MY0w/S4CQPo1Ug7/AAOq9UjGRPqjt9T0F2ju/c90UPzom276tGQK/nm1lPN5vNj5mhjq/+OWjvuklnL6Z/1s9SFBzvpuPrjyMWT8/mZojv0aiub2enYC/fA+aPj6zN79+exG/7xoyv/47dL8w1dO7OFdiPijkaz6btvI+x6g6vwI7UD92dFk/nYF9vj9s/j5DfPG+5NsTPh2dIb/FWw+/IvsZPYIi3z/NCdy+PDCsP3N79z5fFsI/1CQiv4r/pL5pZfy+29OEvuYeib6YnZi+Zb7yPgok/L7c4UW+bT0GP92cjr78Syk/yJsFv91RzL6Auay+bnanP0r+Er85Vp6+/ZsWPjmzi799EX8/dm8mPi2IwT50Xtk+TxWgPp5/RD6pqU+/voSUvY4FQz9IUA+/LCBqvKRZy7/wcgm6UcRpv2xXXjzyo627JkzaPTlxMD8Wi5C/qnU6P+U0Fb9ZXKY9GlokP4HuqL6XKki9uugavxb3Ij9vEB2/CkHaPjrEsz5WIyS/BNAFPzFZST9lpqs+zkbSvT4mPb8jASM+nM5vPrH0Fz41OFa/zKiaPth5uj7U1T6+uAX0vU0GgD+1/S2+Np8LP9zwBb+AKIW/uPcPPv/fsr5Msyy/V+TVvuW+N7/Xy/c9NtA8P9SZgb8T1pM+ziPYvll0ub5P12o+tv8Dv/pFDr6Q/yU+hzZPvuSshD6BlkA/B8tnPsoloTyAbBM/S3iqPiN0ML+CgCQ+uMTsvoJ8AL/XL5W+gVwaP89lxTwsPZi+E6vgvpQ//T4wMYW9b5O8PpK0Fb90wo4/e/Civz6AND9Is2C/+z88PvBn3z5VTkC/lZ6Kvxbggz/E/bC9PngFPrzZnz44YPY+LnYiPvUBO79DUTQ+6DQhv5gjoz6rshm/NeSCvpw9GL5Lin2/PJvcPA6rlL7ePYQ9ORSmve8mKz5yjNy+FS82v9PsAz5QOEO/DFvtvkCAaj/+6B09xpEeveuC6L0v8Bc/i9HPPc3H1L7QvSq+DkrWvio3/T3+FsQ9v4LyPmyRrb78SiY9LsKZPe77qry2Hb08yhwlP8UurL4gNxI/CQcHP9MxEb9qjjs/AgQBvzJDC74QtxI/gULBvvrqgj/G9X8947kGvvLFcz8Ehxc+gE+3voApij6oldo+Rg2YveAWGz85nAm+AgUNvQ6aAj8AR629TbMPPmNntL6s9Ai8DnXbPgJ/pr9sgYs9Pz1kv1PHmz9i9lo+38QnvxusUL48gZm9xUi5vnyngb5RLRU/im6MPra1Sr5s2CE/tkzOvgrwmj3ETz2+kFP4vilgJL82uya/oo81vwJBHL6dcQU+fJ1mv8L1bL3exdO+a/iivuIjqD0c8h+/UCDevhJZ9r7Ydss+WbcdPw2yQz/ot+292uLwvRF5CL+KL7K/FtbHPjqCyb5e/MU+841Hvw1+bb5CEGG+RdtTP2VRij7TYwc/n053vth8ez7x8oe+BM/EPr2NUL9cIqs+CHQevzDYUDyt09k+VdEVv2gUmj4Tt0E77s4Zv7YcdL/yRho/Uoq9PhwEkz4ypv++WDSMvCpsAT+7dj8/NgpNvyMUCb5o+5i+kLHKPrqDdT2V34W+mHbJvpxPZz4ObKc+z6HiPhXCBD8kNOu+86hFP6hZR79ptfC8he+PvyBklD+Y8ri+Gve3vviL0D70nUS+DopaPtVcWz52EEi+2v3kPs/UO78uPis/T70oPy7Xij+uAbG9WJiOv42lv77X8Mu/fzCpvX1SAT/f4pK9CvYJvin50b1SJDM/zO4AP/YswD3kwe6+Lp6jPkr7nr58iK09H9NPvrd1BL541Ac/Eii2O9Z7Kr+isEm/uHKoPoUVcL5WKTg/ZCw4v/zIoL6Lv0g/zO0nPsQhWb6YLwK/VEvAPnDSaT/f6Yk+jm5fvbauoT6f/9e+aZoDvvcWjb/Bz+s+Ww0bv4AFNj2O1He+XLMLv9yh2r2eDcI/kTu2vXjwAT/DSRG/sASgPyyHJj7ogy++DM+Bvwqx7D5t/qw9JSeEvwQWND8PiQI/EW5rvWbXLr/M5oc+jik3PR47gL29XHI+ayyJv3u6x71MRsQ+2LKQP9HIRL+sGag+3nxTPthR+j1/R6c+0v/uPGnpPz1lvgC/wdYBPwdgGj8+AHw/ptyLu4zf3j6/3DK/UCwyvsc1RD/G1uk9xSixvb2byz5wkXA/ymt6vyV0Lb9MA0c9fnh+PnryKL5+kjO/DtPmvdrTyb6eP/W+6vNqPx7OHj7ElJ28pgdlvp6UjL89DDE/OARIvWRb7T7MroU+LZnuPWIvWL/Drue+GH09vjTQ477R5D+/WS0pvkRVM7/761e+KB2bP6sXl76snew9iKtzP9EtjL9OXE6/x/ZAPiMGDL+076k9W6kDP5E2oj9gXO4+GG+ev9IUV77jGde+Z5qePrsEiz7Q2YY+4EAfPoP2jb3yjiq/UcswvusyOD/NEbC+cJuJP/z9PT0t4Vc+S7BjPGe3hD4xqwQ9ZQxavdUKgr1FmZi/8Qh6P2jtUL8iZa6+ChaMve35yr4R9CW/jCXmvbTwOj+cayS/HTywvulvBT28m76+FP/5vlto475bsSO/JFVyveC0kT7WXtI+dHN4P49xi72yBt299qcrv8f4jr6Ttiq+McRKPuRJir/kw34+1nUCP8eOiD4K/5k+8l1/P4OhZL53PnQ/SrnSvSXXST4dbNu+5fDCvijFBb/baI8/WRO1vsb7GT5/Y2M/qnoev7yMEr/O3yS9xJ7GPaz7TT0Vofu9p6vpPvnjFL6Ytzo7UrCFPfh0HT1qdSQ/FrhaP4uTML7t8J8/AYe8vj2WcD5iAYu+BZM1PuPaMj+ucjg/johHvU+D8z54P8E77eTsvglMX74ST7Y8kfTTvK3NNr8SEjo/aNnVOvNfID8aps08sWYJP0K5lb6Nnag+F8KDvpIPNL6guze/Ai65P/zfvj6NaDe/YgdyPtgTjL8/RWE/7/XlPoBNwD2TW4W/gvyav6yYWL61/Ye+4d0dPl5uzrwtmw+/rEcXvQtzGr+126I+4RFZv4JzHb/s2Fo/SGpJPrK03r45E+8+kelzvS+XcL/mPwA+8cbKP4W+l77LYhM/MOEMvyFbiD+Gq+e7TJ2OvuPsIL72puo+9yFivLELzr5FnDy/1vx2v9hXJb4Argi+sSS4PaKxeT08XQk+mHpWvys2xT1uBEG/UJxevm9FiT5WrwS/PkQTP6tmPj+xKzq/J46Qviqjb78PUYa+DgsnvlqeUr+UbjS/AXqPPuDGnz6saKM+qnUDPvXgY7+LnI0+V7n0PoE6GTy3KUY+9hhhPwqjVT6dbho/HIMRPhX0BL9H59S+Y+DXvttGdr+3fXq/OiwTvfVvlL/y2p89r2EFv7eNDL8dkRQ+
1,4598,KCUXvgh1Kj/pCf4+058SPs3OKj95WD2/t5dfPw6UA79HTPE+0ueRPiiRN7/AI5M+ciU4vReYmD7oeaW/NHUIP/9Gw75CDTa+mPXWPqmzTL7qnai+efsQv1OzFb9mxOU+tnOIP/QcHT/b2dQ+gZc7vxUq27/j/3w+ZQY4vyw74D5BLq4+XNAav9xr777Ad6E/43SwPrBDjL0Vsp8+Pkr4PbqWVT57g9m+lLIZP/oChr3PuF6/RtqFvnJ2mzzpEuE+TEDpPtBtAr+MF/u+qIBBPx/tgD8RCDO+qHYGPuqUFL+M9BA/oS/Fv3PwCr67it++VpOrv8rzJT7Jdwo/DA6FP8w9Nr6hMWU+Pxeiu602ND57Fce/5h9iv19Aoz47TBm/stAvv5yc8T0AtDk+4drXvdMTNL9ar/g+sq1UPwcsqD5XngA+YqpUPtvs4T5vlkS/HZJ2PyvrwD6P2sA/neoVP5xKkr9MbEA/9/dVP/aDQz/7EeM+6Tm/vtH+8r7IRHO+OnYLPmmcsT5tVyk+gOojP3+OM78qH/M+E8MUvlJpgT4dsy6/szxyvpUDYT2OPgi/iHxzvzakJT9Y8F8/eqyIP0Ff1L2MZC+9tjiYvu+dZ7/w5oW/SAVdP/MwDj/tk+Q+HWEvP8VDAb02yG++8QDGPq7ihj9kFVo/Oe3gPV55RD/f+ES+8RCPPqo53T6BZwc/gHmjPmUM/r7QY6W+6Eb8Puo+uzxz/8C+TCUUP7DXTL+GNmG/YpNtPSkWYD4zP+O+pOmXviOSWD3Bhbg+RYqIvY8Unj37bhI/Y89rvpwiMT4BvzK/ynKGPiqOOr+sJdS+xZrIvvaoiL5P8Lu+VB79Pn6Ghb2JHyI/0DzRvT6V1T7I7kk/0D8Yv6wstD3tQLS9a1VMPuR+b79m4oG/MgavvYnWvz+kmSO/b1ZuPxPWDj7brzA/m2jZvheE0jz29yW+VGyvvmwlLD/iVR2/ymGoPgxvBb9HSC0+upb6PaGTvr5TKOg+zh8Gv0Lq9773L12+xhCcPlfcW7+kF1W/W8G+PpRCmb9YQsI/+zyRugzrZD5Ccwg//fezPh/gVr4IQ9S+HhwUPp+ICz8HYiG+NJW9PhBJmr/7BFu+VWuRv9tSLj9iXO87Q6FMP0+Bvj49Tje/9+PJPuI7iL+imoI/5Z/6PuIln74j+Ve+HtkqvwE7qD6z3Vy/qoqPP59kFj9wS3y/eosVPpTcQj/xxqg+nxWJvno2PL+Jj8y9o1c3PvgHEj8aRki/Vw+VPiY8Cj807bE+7FwLPrZHXT8h7l4/xzVRP00UIr+QSPG+b+UcPqKX670o7lW/+eYWPYb1Hb830VY+g9yEP19qbb+am9A+USxpvQwOSb/3jYM+6g2bvvI4xD67LYQ+X/5vPiFfJb8BOio/5CtrvTrSCb8sVbI+SZJxP4I5DL/Makk9lGsmvz6mer9PyoW+gwrDPiPQJL/9T2m+95YqvxC3oz4apCq6laFtPUUL4r45iGk/WvO3vxXXKD7cMii/I4sHP4akMT4meFy/j2+xvmTXGT8e06C9cOkwvomwpTxVZD4+zvYQP/ZmCr9TAdM+OSVdvxUMpDzongC/gr5dvoOeMj7tdIG/dk23Pp5xeb69xg69KG/RPhYNRz5qHAe/fm1dvzdAf77JPL2+8os5Pu2Aej8xmkM+w0Gfvp4vCL9tAoE/4g52Pr1mS7+Bscg9tvh7POgtkz7Qe2u+OLBHPz5I9b3G1qc+fumCPpUc2r423bu93WhpP9djRr9rDX4+0fSrPcUZET8qjA4/OOVwv583Ab8vO5c+du8Tvxxfrz/sOcI9nhxBvptSGT8iebQ+DMAvvlyQdD6BVq2992I/vZA+0L7wsbU+ZidxvoAynz24jBM/FndtPr6Dk76wy42+s4I1P/l7Pb82tbu9hgWevn+LMT8NuhG/06Iuv+W5v72QM5O+mFwHvwMa8j4FHog//8TzvUzpVz6Cg449FT8yviwZpL57tT2/RaiJvvoYob4Q2LO+3zIhv+MEXb8wEH++Qb8bvzjArT6hDvq97J6oPZXVT75BFp09PqsXvtX08z1hcvu9AI8tP7r3sz6alBg+drulPg8dvj0PR4q/KLkNPZdCJT0SlJI+qfIEPk01JL4KGqS9+zDxPl8HLj7fbJk+V6mWvBY67D54M4y+6Nhjvo74rr5pdwk+3Q9FvwkLWL6No288Vtj1vt08Zj4qdDq+LDxevyTDyr5IjYs/FDBrPiFrHr8TJnU6vZBZPiLAbT638zQ/BSeSv1utqr6XRA6+TuGCPjTaHr+Tvq+9SRzRvgVvX77f5fA+XtrNPn6bKj95rjG/m8APP7w3g7/Xg0I+lQB+v1wPXz+667a900kDvzeM2j4MkEq/xSvLvok6KT+NiEu+xuTAPrpnrr4erYA9azz5PpUkij/mgfW+4ndDv+mmIr/R8kK/GP+xvltEgT/ajGg+wVMxvz6mgL3XQoU+LWp3Pvi9hj7qEni/6QMnu1T2Y77mTB++vzMuvl9OFz+7jx4/8t9cPWipAr/j7YS/sAjLvBG/Br93SNc+toEDv9Ee6D4Xx4A/GnZdPwwxh7vkGna/G4R9Pp5ezT6Ywwc/yhwEPv3cEz54zaa+VkyOvqX5dL+y0ak9ImuNvgH7Qb802T+88sWUvE+J8bxl/dE/zhLOPs/9oj1sdHi+w4KGP7Qvez9pX8M+DC8qvis3Ez/7e8M9yDESvaV5K73qBr4+PpHbvn+EQr+A5iE/QnKGvHi4lb3qlLQ+1DQpvzDxub7xQmo+M5jEP7blnb9ILLC8UypvvgFTPbzwVhE/ZvEpP09L0b29ZrG7vVSxPluoPD4rPQI/ySC6veF60b3Up9S+WMYzPYEaHz/AlUy+B+tpvqwcjD53W1Q/+rZ+v7tW/76o8pE+0HmHvqEgMr8zLMC+D21avtrTm77MnNu+nx8eP5+vx76rUJS+OIeSPYlSDb87Q2Q/H+4gvxz+Qj/+zWU+YQCQPk0iJb4xavu+N6Inv5uKZb/QRle/FVanvsp44r4/jVK+msplP57egb74USQ/bTxRP18GT7/+Jv+91Y4uPs9yb7/J1iK+3QNxvSj3FD+T70c/JJAIv20RG7/+Q+O+9ADUvujkHz/KseY85WSDvnux4b6ai7W+F5cxvkm3ND5/gao9gxU4PzhJhL5m5N49l11XPzlm2D1FO9o94550PpWDpb46jrm/q1cvP3mdl7/fQ7++j8zlvqj9Cr4v/Ai/YTQtvpAaLT+QbWC/B0zyvr4ixr19n9C+xorXvoPWsb7Xpk8+l/zOPipxlz1nyAs/peqIPrElD79kYa6+AXLEPhp4Jb8mDlw+L5IJPt9oKb/bV7k9A3KfPlTuOD3brx4+jJ15P1zXoL5GeOk+qSMVPhrJlb6FwqW+L/ruPBEaqL7FI3Y/t8YUv2FUob74OsU+bCyFvQ7DYT6bUQC8kBzJPtFnRD+JtgA/xR8cPp5oYL2ya5c9xwurPRvLOz2o5hg/334rP44Fr74hcQY/N8v7PhWYpb3G1Tg++y0aPmpUAD+JKCA/aFoYP6jB9T33AHu+TtdyPe+bA7+0tbO9Px8MP8GbtL9jgY4/JiumPomC3z7GQ5s+paaoPpisSr9r9wg+kXm9vqjRRr6C13W/KiuwP+hDlT4vsGS/aqGVPnvksb6m5u4+xeOIPr7/BT4ki0S/KJ+Sv7nLPr7uzAW/78WTPVEdMz6dIfa+nmDrOlsdlr9AHeE+cQoOv8LdNj7pIbE+Iaaavm/3Kr82Yj+9w2MKvSyjBz4G44W9DZaTP8p/Ub5t6Cg+fjgFv2sNiD+s/YO+G9HmPcCUcj57Oe8+tNeivWmclr6lA4S/ALakv4nzB77MNw2+AhfbPbd7aT46VC+/nIo/vvHXGT/pNHe/DoO0vtBHhz7VEFk+QQNeP6i6HDwr4Bq/sSUhv4Uegb+XqmM8D1UPP4p0Hb9UFPS+qDGCPjpm5z7sCwk/3r8ZPmxqH78hUXA/MikCPylxkb1KGrs+v8gLPwA6PLzybF283lFjvSX/Jb8+A8e98uwxvuRfaL+2j0C/O/ccvj/rS7+sxg6//puTvoiaEb/RDgU+


In [38]:
# TR_MAX_SIMILAR = df_trans.shape[0]-1
# print(TR_MAX_SIMILAR)
# df_trans_res = find_similar(TR_model,df_trans,TR_MAX_SIMILAR,TR_DATA_DIR,TR_OUTPUT_DATA_FILE)

# Model comparison - consolidated score

In [39]:
model_comparison_lst = []

In [40]:
VECTOR_SIZE = 300
dic1 = find_sim_score(df_word2vec,'W2V_fasttext')
print(dic1)
dic2 = find_sim_score(df_word2vec,'W2V_google')
print(dic2)
dic3 = find_sim_score(df_word2vec,'W2V_glove')
print(dic3)
dic4 = find_sim_score(df_tfidf,'tfidf_vector')
print(dic4)
VECTOR_SIZE=TR_VECTOR_SIZE
dic5 = find_sim_score(df,'universal_encoder',combined_IssueText_embeddings)
print(dic5)
dic6 = find_sim_score(df_trans,'Transformer_vector')
print(dic6)

{'W2V_fasttext': {7134: 0.33128, 4598: 0.28372, 4596: 0.99999, 4462: 0.3464, 4394: 0.37923, 19100: 1.0, 19101: 0.28302, 19102: 0.01845, 19103: 0.46001}}
{'W2V_google': {7134: 0.04432, 4598: 0.06809, 4596: 0.99998, 4462: 0.05484, 4394: 0.23169, 19100: 1.0, 19101: 0.06911, 19102: 0.04996, 19103: 0.15608}}
{'W2V_glove': {7134: 0.20532, 4598: 0.00264, 4596: 0.99997, 4462: 0.13661, 4394: 0.21942, 19100: 1.0, 19101: 0.00177, 19102: -0.04559, 19103: 0.24931}}
{'tfidf_vector': {7134: 0.04094, 4598: 0.23668, 4596: 0.05701, 4462: 0.17166, 4394: 0.12458, 19100: 0.96352, 19101: 0.54862, 19102: 0.47077, 19103: 0.32758}}
{'universal_encoder': {7134: 0.37555, 4598: 0.65752, 4596: 0.4021, 4462: 0.58658, 4394: 0.55063, 19100: 0.97752, 19101: 0.59721, 19102: 0.6051, 19103: 0.57482}}
{'Transformer_vector': {7134: 0.81352, 4598: 0.8444, 4596: 0.88083, 4462: 0.89869, 4394: 0.84949, 19100: 0.99237, 19101: 0.92586, 19102: 0.95526, 19103: 0.90458}}


In [41]:
model_comparison_lst.extend([dic1,dic2,dic3,dic4,dic5,dic6])

In [42]:
df_consolidated ,df_final_size = create_consolidated_df(model_comparison_lst)
df_consolidated.head(6)

Unnamed: 0,index,7134,4598,4596,4462,4394,19100,19101,19102,19103
0,W2V_fasttext,0.33128,0.28372,0.99999,0.3464,0.37923,1.0,0.28302,0.01845,0.46001
0,W2V_google,0.04432,0.06809,0.99998,0.05484,0.23169,1.0,0.06911,0.04996,0.15608
0,W2V_glove,0.20532,0.00264,0.99997,0.13661,0.21942,1.0,0.00177,-0.04559,0.24931
0,tfidf_vector,0.04094,0.23668,0.05701,0.17166,0.12458,0.96352,0.54862,0.47077,0.32758
0,universal_encoder,0.37555,0.65752,0.4021,0.58658,0.55063,0.97752,0.59721,0.6051,0.57482
0,Transformer_vector,0.81352,0.8444,0.88083,0.89869,0.84949,0.99237,0.92586,0.95526,0.90458


In [43]:
df_consolidated.to_csv('../data/model_perf/4555_model_consolidated_results.csv')

# Consolidated  summary of Comparison of similarity scores

In [44]:
df_final = create_final_sorted_consolidated_df(df_final_size,model_comparison_lst)
df_final.head(6)

['W2V_fasttext', 'W2V_google', 'W2V_glove', 'tfidf_vector', 'universal_encoder', 'Transformer_vector']
OrderedDict([(19100, 1.0), (4596, 0.99999), (19103, 0.46001), (4394, 0.37923), (4462, 0.3464), (7134, 0.33128), (4598, 0.28372), (19101, 0.28302), (19102, 0.01845)])
OrderedDict([(19100, 1.0), (4596, 0.99998), (4394, 0.23169), (19103, 0.15608), (19101, 0.06911), (4598, 0.06809), (4462, 0.05484), (19102, 0.04996), (7134, 0.04432)])
OrderedDict([(19100, 1.0), (4596, 0.99997), (19103, 0.24931), (4394, 0.21942), (7134, 0.20532), (4462, 0.13661), (4598, 0.00264), (19101, 0.00177), (19102, -0.04559)])
OrderedDict([(19100, 0.96352), (19101, 0.54862), (19102, 0.47077), (19103, 0.32758), (4598, 0.23668), (4462, 0.17166), (4394, 0.12458), (4596, 0.05701), (7134, 0.04094)])
OrderedDict([(19100, 0.97752), (4598, 0.65752), (19102, 0.6051), (19101, 0.59721), (4462, 0.58658), (19103, 0.57482), (4394, 0.55063), (4596, 0.4021), (7134, 0.37555)])
OrderedDict([(19100, 0.99237), (19102, 0.95526), (19101,

Unnamed: 0,models,defectnum_0,defect_score_0,defectnum_1,defect_score_1,defectnum_2,defect_score_2,defectnum_3,defect_score_3,defectnum_4,defect_score_4,defectnum_5,defect_score_5,defectnum_6,defect_score_6,defectnum_7,defect_score_7,defectnum_8,defect_score_8
0,W2V_fasttext,19100,1.0,4596,0.99999,19103,0.46001,4394,0.37923,4462,0.3464,7134,0.33128,4598,0.28372,19101,0.28302,19102,0.01845
1,W2V_google,19100,1.0,4596,0.99998,4394,0.23169,19103,0.15608,19101,0.06911,4598,0.06809,4462,0.05484,19102,0.04996,7134,0.04432
2,W2V_glove,19100,1.0,4596,0.99997,19103,0.24931,4394,0.21942,7134,0.20532,4462,0.13661,4598,0.00264,19101,0.00177,19102,-0.04559
3,tfidf_vector,19100,0.96352,19101,0.54862,19102,0.47077,19103,0.32758,4598,0.23668,4462,0.17166,4394,0.12458,4596,0.05701,7134,0.04094
4,universal_encoder,19100,0.97752,4598,0.65752,19102,0.6051,19101,0.59721,4462,0.58658,19103,0.57482,4394,0.55063,4596,0.4021,7134,0.37555
5,Transformer_vector,19100,0.99237,19102,0.95526,19101,0.92586,19103,0.90458,4462,0.89869,4596,0.88083,4394,0.84949,4598,0.8444,7134,0.81352


In [45]:
df_final.to_csv('../data/model_perf/4555_model_sorted_consolidated_results.csv')