In [1]:
import numpy as np
import pandas as pd
import progressbar as pb
import time
import warnings
import text_processing as tp
warnings.filterwarnings('ignore')

In [2]:
# # Import dataset
# data = pd.read_csv('https://zenodo.org/record/400614/files/apache.csv?download=1',sep=',')
# # Remove nan from the main dataset
# remove_nan(data)

In [3]:
# # Preprocessing the short_desc
# processed_data_df = tp.text_preprocessing(data,"short_desc")
# # Save to file to save time
# processed_data_df.to_csv('~/Desktop/Google-Drive/Colab Notebooks/processed_data_df.csv',index=False)

In [4]:
# Load the preprocessed DataFrame from file to save time
# (the path matches the one written by the commented-out preprocessing cell above).
processed_data_df = pd.read_csv('~/Desktop/Google-Drive/Colab Notebooks/processed_data_df.csv',sep=',')

In [5]:
# Model-1: Similarity Score - Word2vec -------------------------------------------------------------------------
import pandas as pd
import spacy
import time
import progressbar as pb

# spaCy pipeline used by word2vec_preprocess to parse every short description.
nlp = spacy.load('en_core_web_lg')
# 0 until word2vec_preprocess has run; word2vec_similarity checks it to preprocess only once.
execution_count_word2vec = 0
# Placeholder; word2vec_preprocess replaces it with a DataFrame (id, product, short_desc_processed).
processed_data_nlp_df = []

# Convert each short_desc string to a spaCy Doc once, up front, to speed up the similarity computation
def word2vec_preprocess(df):
    """Parse every report's processed short description with the spaCy pipeline.

    Iterates over `df` (reads the `id`, `product` and `short_desc_processed`
    attributes of each row), runs `nlp` on the description and stores the
    result in the module-level `processed_data_nlp_df` DataFrame with columns
    `id`, `product` and `short_desc_processed` (the latter holding the parsed
    objects). Also increments `execution_count_word2vec` so callers can tell
    that preprocessing has already been done.

    A progress bar is advanced once per processed row.
    """
    global processed_data_nlp_df, execution_count_word2vec

    print('Convert short_desc str to nlp format')
    bar = pb.ProgressBar(maxval=len(df)).start()
    parsed_rows = []
    for rows_done, row in enumerate(df.itertuples(), start=1):
        parsed_rows.append((row.id, row.product, nlp(row.short_desc_processed)))
        bar.update(rows_done)

    processed_data_nlp_df = pd.DataFrame(
        parsed_rows,
        columns=['id', 'product', 'short_desc_processed'],
    )
    execution_count_word2vec += 1

# Calculate the cosine similarity score
def word2vec_similarity(id,df):
    """Score report `id` against every report of the same product with spaCy.

    Parameters
    ----------
    id : identifier of the query report (matched against the `id` column).
    df : DataFrame of processed reports. It is only used on the first call,
        to build the cached `processed_data_nlp_df`; later calls reuse that
        cache even if a different `df` is passed.

    Returns
    -------
    DataFrame with columns `id` and `word2vec_score`, one row per cached
    report whose `product` equals the query report's product (the query
    report itself is included), in cache order.

    Raises
    ------
    IndexError if no cached report has the given `id`.
    """
    if execution_count_word2vec == 0:
        word2vec_preprocess(df)

    # Locate the query report once; `.array[0]` raises IndexError for an unknown id.
    is_main = processed_data_nlp_df['id'] == id
    product_main = processed_data_nlp_df.loc[is_main, 'product'].array[0]
    short_desc_processed_main = processed_data_nlp_df.loc[is_main, 'short_desc_processed'].array[0]

    # The row tuple already carries its own product, so read it directly.
    # Re-querying the whole frame by id for every row made this loop quadratic.
    similarities_score_list = [
        (doc.id, doc.short_desc_processed.similarity(short_desc_processed_main))
        for doc in processed_data_nlp_df.itertuples()
        if doc.product == product_main
    ]

    # Convert to a DataFrame (a freshly built frame already has a 0..n-1 index).
    word2vec_similarities_score_df = pd.DataFrame(similarities_score_list, columns=['id','word2vec_score'])
    return word2vec_similarities_score_df

In [6]:
# Model-2: Similarity Score - TF-idf ----------------------------------------------------------------------------
import pandas as pd
import time
import progressbar as pb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Placeholder; tfidf_preprocess replaces it with the pairwise similarity matrix.
tfidf_cosine_similarities = []
# 0 until tfidf_preprocess has run; tfidf_similarities checks it to preprocess only once.
execution_count_tfidf = 0

def tfidf_preprocess(df):
    """Vectorise all processed descriptions with TF-idf and cache their pairwise scores.

    Fits a `TfidfVectorizer` on `df['short_desc_processed']`, then stores
    `linear_kernel` of the resulting matrix with itself in the module-level
    `tfidf_cosine_similarities` and its number of rows in `shape_tfidf`.
    Finally increments `execution_count_tfidf` so the work is not repeated.
    """
    global tfidf_cosine_similarities, shape_tfidf, execution_count_tfidf

    corpus = df['short_desc_processed']
    print('TF-idf Vectorization and similarity score computation')

    # Vectorise the corpus, then compute every document-vs-document score at once.
    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)
    tfidf_cosine_similarities = linear_kernel(tfidf_matrix)
    shape_tfidf = tfidf_cosine_similarities.shape[0]
    print('TF-idf preprocess done')

    execution_count_tfidf += 1

def tfidf_similarities(id,df):
    """Return the cached TF-idf scores between report `id` and same-product reports.

    Parameters
    ----------
    id : identifier of the query report (matched against `df['id']`).
    df : DataFrame of processed reports with `id` and `product` columns. On
        the first call it is also passed to `tfidf_preprocess`; its row order
        must match the order used to build the cached similarity matrix.

    Returns
    -------
    DataFrame with columns `id` and `tfidf_score`, one row per report among
    the first `shape_tfidf` rows of `df` whose `product` equals the query
    report's product (the query report itself is included), in row order.

    Raises
    ------
    IndexError if no row of `df` has the given `id`.
    """
    if execution_count_tfidf == 0:
        tfidf_preprocess(df)

    # The similarity matrix is addressed by row *position*, so look up the
    # position of the query row rather than its index label (the two only
    # coincide for a default RangeIndex). `[0]` raises IndexError if absent.
    index_main   = np.flatnonzero((df['id'] == id).to_numpy())[0]
    product_main = df['product'].iloc[index_main]

    # Only the rows that were vectorised have a column in the matrix.
    scored_rows  = df.iloc[:shape_tfidf]
    # One vectorised comparison replaces a per-row `df.iloc[i][...]` lookup.
    same_product = (scored_rows['product'] == product_main).to_numpy()

    tfidf_cosine_similarities_score_df = pd.DataFrame({
        'id': scored_rows['id'].to_numpy()[same_product],
        'tfidf_score': tfidf_cosine_similarities[index_main, same_product],
    })
    return tfidf_cosine_similarities_score_df

In [7]:
# Model-3: Similarity Score - BM25 (Okapi, via rank_bm25) --------------------------------------------------------
import pandas as pd
import time
import progressbar as pb
from rank_bm25 import BM25Okapi

# Tokenised descriptions (one list of tokens per report); filled by bm24_preprocess.
processed_corpus_list = []
# Placeholder; bm24_preprocess replaces it with a BM25Okapi object built on the corpus.
bm25 = []
# 0 until bm24_preprocess has run; bm24_similarity checks it to preprocess only once.
execution_count_bm25 = 0

# preprocess - tokenize the short_desc to token
def bm24_preprocess(df):
    """Tokenise every processed description and build the BM25 index.

    Splits each row's `short_desc_processed` on single spaces, stores the
    resulting token lists in the module-level `processed_corpus_list`, builds
    a `BM25Okapi` object over them in the module-level `bm25`, and increments
    `execution_count_bm25` so the work is not repeated.
    """
    global processed_corpus_list, bm25, execution_count_bm25

    print('preprocess - tokenize the short_desc to token')
    # One token list per report, in the same order as the rows of `df`.
    processed_corpus_list = [
        row.short_desc_processed.split(" ") for row in df.itertuples()
    ]
    # Create a BM25 object with the corpus
    bm25 = BM25Okapi(processed_corpus_list)
    execution_count_bm25 += 1
    
# Calculate the similarity score
def bm24_similarity(id,df):
    """Return BM25 scores between report `id` and every same-product report.

    Parameters
    ----------
    id : identifier of the query report (matched against `df['id']`).
    df : DataFrame of processed reports with `id` and `product` columns. On
        the first call it is also passed to `bm24_preprocess`; its row order
        must match the order of the cached tokenised corpus.

    Returns
    -------
    DataFrame with columns `bm24_score` and `id` (in that order), one row per
    report whose `product` equals the query report's product (the query
    report itself is included), in row order with a fresh 0..n-1 index.
    The `id` column keeps the dtype of `df['id']`; the previous row-by-row
    assignment upcast integer ids to float.

    Raises
    ------
    IndexError if no row of `df` has the given `id`.
    """
    if execution_count_bm25 == 0:
        bm24_preprocess(df)

    # The corpus list and the score array are addressed by row *position*, so
    # look up the position of the query row rather than its index label (the
    # two only coincide for a default RangeIndex). `[0]` raises IndexError if absent.
    index_main   = np.flatnonzero((df['id'] == id).to_numpy())[0]
    product_main = df['product'].iloc[index_main]

    # Score the query's own tokens against every document in the corpus.
    query      = processed_corpus_list[index_main]
    doc_scores = np.asarray(bm25.get_scores(query))

    # Keep only reports of the same product, using one vectorised comparison
    # instead of per-row lookups and cell-by-cell assignment.
    scored_rows  = df.iloc[:len(doc_scores)]
    same_product = (scored_rows['product'] == product_main).to_numpy()

    doc_scores_df = pd.DataFrame({
        'bm24_score': doc_scores[same_product],
        'id': scored_rows['id'].to_numpy()[same_product],
    })
    return doc_scores_df

Assessing the Model Accuracy

In [8]:
# Assessing the model accuracy ----------------------------------------------------------------------------------
# Known duplicate relations. Original source (a local copy is read below):
# duplicate_df = pd.read_csv('https://zenodo.org/record/400614/files/apache.relations.csv?download=1',sep=',')
duplicate_df = pd.read_csv('~/Desktop/Google-Drive/Colab Notebooks/duplicate_df.csv',sep=',')

In [9]:
# Calculate the similarity scores and return the top n scores for each model
def similarity_score(id, df,top_n):
    """Return the `top_n` best-scoring reports for `id` under each of the three models.

    Calls `word2vec_similarity`, `tfidf_similarities` and `bm24_similarity`
    (in that order) with `id` and `df`, sorts each result by its score column
    in descending order and keeps the first `top_n` rows.

    Returns a tuple `(word2vec_df, tfidf_df, bm24_df)`.
    """
    def _best(scores_df, score_column):
        # Highest scores first, truncated to the requested number of rows.
        ranked = scores_df.sort_values(by=[score_column], ascending=False)
        return ranked.head(top_n)

    word2vec_top = _best(word2vec_similarity(id, df), 'word2vec_score')
    tfidf_top    = _best(tfidf_similarities(id, df), 'tfidf_score')
    bm24_top     = _best(bm24_similarity(id, df), 'bm24_score')
    return word2vec_top, tfidf_top, bm24_top

In [10]:
# Calculate the similarity score for the reported duplicated bug report
import progressbar as pb
import time
# Wall-clock start, used for the total-time report printed at the end of the cell.
start_time  = time.time()
sample_size = len(duplicate_df)
progress    = pb.ProgressBar(maxval = sample_size).start()
progvar     = 0
# One entry per known duplicate pair: [id, dup, word2vec_df, tfidf_df, bm24_df].
duplicated_similarity_score_list = []

# NOTE: the two scoring lines below are commented out, so this loop currently only
# advances the progress bar and duplicated_similarity_score_list stays EMPTY.
# Uncomment them to recompute the scores (top 20 per model for every duplicate pair).
for tup in duplicate_df.itertuples():
#     word2vec_similarity_df, tfidf_similarity_df, bm24_similarity_df = similarity_score(tup.id, processed_data_df,20)
#     duplicated_similarity_score_list.append([tup.id,tup.dup,word2vec_similarity_df,tfidf_similarity_df,bm24_similarity_df])
    progress.update(progvar + 1)
    progvar += 1
    
print("-Total- %s seconds ---" % (time.time() - start_time))

N/A% (0 of 500) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--

-Total- 0.009871721267700195 seconds ---


In [11]:
import os
import pickle

score_cache_path = "duplicated_similarity_score_list.txt"

# Save the list to a file since it takes 8 hours to create it.
# Only (re)write the cache when there is something to save, or when no cache
# exists yet. If the scoring loop was skipped the list is empty, and dumping it
# unconditionally would overwrite the previously saved results with [].
if duplicated_similarity_score_list or not os.path.exists(score_cache_path):
    with open(score_cache_path, "wb") as fp:   # Pickling
        pickle.dump(duplicated_similarity_score_list, fp)

# Read the list from the file.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a cache file that you created yourself.
with open(score_cache_path, "rb") as fp:   # Unpickling
    duplicated_similarity_score_list = pickle.load(fp)

In [12]:
# Count how many of the known duplicate reports a given algorithm found
def result(name_of_algorthem, score_list=None):
    """Print how many known duplicates an algorithm found in its top results.

    Parameters
    ----------
    name_of_algorthem : 'word2vec', 'tfidf' or 'bm24'.
    score_list : optional list of entries `[id, dup, word2vec_df, tfidf_df,
        bm24_df]`; defaults to the module-level
        `duplicated_similarity_score_list`.

    For every entry, the duplicate id (`entry[1]`) counts as found when it
    appears in the `id` column of the selected algorithm's DataFrame. Prints
    the found / not-found counts and the recall in percent, computed over the
    number of entries actually evaluated (0.0 when there are none).

    Returns
    -------
    None after printing, or the string "Wrong selection" for an unknown
    algorithm name (nothing is printed in that case).
    """
    # Position of the chosen algorithm's DataFrame inside each entry.
    if name_of_algorthem == 'word2vec':
        select_algorithm = 2
        print('word2vec result:')
    elif name_of_algorthem == 'tfidf':
        select_algorithm = 3
        print('TF-idf result:')
    elif name_of_algorthem == 'bm24':
        select_algorithm = 4
        print('bm24 result:')
    else:
        return "Wrong selection"

    if score_list is None:
        score_list = duplicated_similarity_score_list

    found_counter = 0
    not_found_counter = 0
    for x in score_list:
        dup = x[1]
        df = x[select_algorithm]
        if df.loc[df['id'] == dup].empty:
            not_found_counter +=1
        else:
            found_counter +=1

    # Use the real number of evaluated pairs instead of a hard-coded 500, so the
    # recall stays correct for any sample size; guard the empty case.
    total = found_counter + not_found_counter
    recall = found_counter/total*100 if total else 0.0

    print('Num of duplicated report found: ',found_counter)
    print('Num of duplicated report not found: ',not_found_counter)
    print('Recall (TP/TP+FN): ', recall,'%\n')

In [13]:
# Check the results
# Report the recall of each model in turn.
for algorithm_name in ('word2vec', 'tfidf', 'bm24'):
    result(algorithm_name)

word2vec result:
Num of duplicated report found:  150
Num of duplicated report not found:  350
Recall (TP/TP+FN):  30.0 %

TF-idf result:
Num of duplicated report found:  178
Num of duplicated report not found:  322
Recall (TP/TP+FN):  35.6 %

bm24 result:
Num of duplicated report found:  174
Num of duplicated report not found:  326
Recall (TP/TP+FN):  34.8 %

