<a href="https://colab.research.google.com/github/ghasemieh/Duplicate-Bug-Identification-System/blob/master/Model_1_V05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Model-1-V5
# - Load data set
# - Remove NaN in short_desc
# - Convert to lowercase
# - Split the words using 1. ASCII character identification for English 2. Split by space 3. wordninja
# - Apply normalise
# - Apply contractions/expansions
# - Remove punctuation
# - Remove tags
# - Remove special characters and digits
# - Stemming/lemmatisation
# - Prepare the whole data set
# - Calculate the similarity between every pair of bug reports for the same product

In [0]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [0]:
# Load the Apache bug-report dump and keep only the columns used by the models.
data_org = pd.read_csv('https://zenodo.org/record/400614/files/apache.csv?download=1',sep=',')
# Take an explicit copy: `data` is later modified in place (rows are dropped),
# and doing that on a bare column selection of `data_org` is what pandas flags
# with SettingWithCopyWarning (currently hidden by warnings.filterwarnings).
data = data_org[['id','product','short_desc']].copy()

In [0]:
def remove_nan(df):
    """Drop, in place, every row whose ``short_desc`` is unusable.

    A row is unusable when its ``short_desc`` is not a ``str`` (this is how
    NaN cells are detected) or is an empty string. The per-column null counts
    are printed before and after the drop.

    Args:
        df: DataFrame with a ``short_desc`` column; it is modified in place.

    Returns:
        None.
    """
    print("Before removing the NaN:")
    print(df.isnull().sum())

    # Collect the index labels of all rows without a non-empty string description.
    unusable_labels = [
        label
        for label, text in zip(df.index, df['short_desc'])
        if type(text) != str or not text
    ]

    df.drop(unusable_labels, inplace=True)
    print("\nAfter removing the NaN:\n",df.isnull().sum())

In [0]:
# Drop the rows of `data` that have a missing or empty short_desc (modifies `data` in place)
remove_nan(data)

Before removing the NaN:
id             0
product        0
short_desc    58
dtype: int64

After removing the NaN:
 id            0
product       0
short_desc    0
dtype: int64


In [0]:
# conda install -c conda-forge spacy
# !pip install wordninja
# !pip install normalise
# !pip install pycontractions
# conda install -c conda-forge spacy-lookups-data
# !python -m spacy download en_core_web_lg

In [0]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')
import nltk
# nltk.download('brown')
# nltk.download('names')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
import wordninja as nj #for spliting the words in each documents
import normalise as ns 
from pycontractions import Contractions # for expansion and contrations
import contractions
import re # remove tags.

def preprocessing(df,id):
    """Return the cleaned, lemmatised ``short_desc`` of one bug report.

    Pipeline: lowercase -> wordninja split -> normalise -> expand
    contractions -> strip tags -> keep letters only -> drop punctuation,
    stop words and one-character tokens -> lemmatise with spaCy.

    Args:
        df: DataFrame with ``id`` and ``short_desc`` columns.
        id: Bug-report id to look up (converted with ``int``).

    Returns:
        The preprocessed description as a single space-separated string.

    Raises:
        KeyError: if no row of ``df`` has the requested id.
    """
    # Read the cell value directly. Going through Series.to_string() and then
    # slicing off the first character assumes the rendered text starts with
    # exactly one padding space, which depends on pandas' display formatting.
    matches = df.loc[df['id'] == int(id), 'short_desc']
    if matches.empty:
        raise KeyError("no bug report with id %r in the supplied dataframe" % (id,))
    short_desc_raw = str(matches.iloc[0])

    # Convert to lowercase
    short_desc_lowercase = short_desc_raw.lower()

    # Split the text into words with wordninja
    short_desc_splited = nj.split(short_desc_lowercase)

    # Apply normalise, then join the resulting tokens back into one string
    short_desc_normalised = ns.normalise(short_desc_splited,verbose=False)
    short_desc_normalised_str = ' '.join(map(str, short_desc_normalised))

    # Apply contractions/expansions
    short_desc_contract = contractions.fix(short_desc_normalised_str)

    # Remove tags
    short_desc_removed_tag = re.sub(r'<.*?>', '', str(short_desc_contract))

    # Replace every run of non-letter characters (digits, symbols, newlines)
    # with one space. The result stays a plain string so spaCy is not fed the
    # brackets and quotes of a Python list repr.
    short_desc_letters_only = re.sub(r"[^a-zA-Z]+", ' ', short_desc_removed_tag).strip()

    # Remove punctuation, stop words and one-character tokens
    kept_words = [
        token.text
        for token in nlp(short_desc_letters_only)
        if not token.is_punct and not token.is_stop and len(token) > 1
    ]
    short_desc_string = ' '.join(kept_words)

    # Lemmatisation (second spaCy pass over the filtered text)
    short_desc_lemmata = [token.lemma_ for token in nlp(short_desc_string)]
    short_desc_preprocessed = ' '.join(map(str, short_desc_lemmata))

    return short_desc_preprocessed

In [0]:
# Run the preprocessing pipeline over every bug report and store the result
import time
import progressbar as pb
start_time = time.time()

# To try the pipeline on a sample only, set a sample size, size the progress
# bar with it and break out of the loop once `progvar` reaches it.
dataset_length = len(data)
progress = pb.ProgressBar(maxval = dataset_length).start()

progvar = 0
processed_data_list = []

for progvar, report in enumerate(data.itertuples(), start=1):
    cleaned_desc = preprocessing(data, report.id)
    processed_data_list.append((report.id, report.product, cleaned_desc))

    # Show the progress in the output
    progress.update(progvar)

# Convert list to dataframe
processed_data_df = pd.DataFrame(processed_data_list, columns=['id','product','short_desc'])
del processed_data_list

# Show the elapsed time, save the preprocessed data and preview it
print("--- %s seconds ---" % (time.time() - start_time))
processed_data_df.to_csv('~/Desktop/Google-Drive/Colab Notebooks/processed_data_df.csv',index=False)
processed_data_df.head()

  5% (2212 of 43991) |                   | Elapsed Time: 0:02:38 ETA:   0:40:10

In [0]:
def index_to_id(df, id_source=None):
    """Replace the row positions in ``id1``/``id2`` with real bug-report ids, in place.

    The positions stored in ``df`` are row numbers of the corpus the scores
    were computed from. That corpus is ``processed_data_df``, which no longer
    contains the rows with a missing ``short_desc``; resolving the positions
    against ``data_org`` (which still has those rows) would shift every id
    after the first dropped row.

    Args:
        df: DataFrame with integer position columns ``id1`` and ``id2``;
            both columns are overwritten with bug ids.
        id_source: DataFrame whose ``id`` column is in corpus row order.
            Defaults to the notebook-level ``processed_data_df``.

    Returns:
        None.
    """
    if id_source is None:
        id_source = processed_data_df

    id_by_position = id_source['id'].to_numpy()
    for column in ('id1', 'id2'):
        # One vectorised lookup per column instead of a .loc write per row.
        positions = df[column].astype('int64').to_numpy()
        df[column] = id_by_position[positions]

In [0]:
def remove_diff_product_score(df, product_source=None):
    """Drop, in place, every scored pair whose two bug reports belong to different products.

    Products are looked up once through an id -> product mapping and all
    mismatching rows are removed with a single drop, instead of scanning the
    whole source frame and dropping one row per pair.

    Args:
        df: DataFrame with bug-id columns ``id1`` and ``id2``; modified in place.
        product_source: DataFrame with ``id`` and ``product`` columns.
            Defaults to the notebook-level ``data_org``. When an id occurs
            more than once, its first row is used.

    Returns:
        None.

    Note:
        A pair is also dropped when either id has no product in
        ``product_source`` (missing values never compare equal).
    """
    if product_source is None:
        product_source = data_org

    product_by_id = product_source.drop_duplicates('id').set_index('id')['product']
    product1 = df['id1'].map(product_by_id)
    product2 = df['id2'].map(product_by_id)

    mismatched = df.index[(product1 != product2).to_numpy()]
    df.drop(mismatched, inplace=True)

Model-1: Similarity Score - Word2vec

In [0]:
# Calculate similarity scores with spaCy's Doc.similarity (pipeline loaded as `nlp`)
start_time = time.time()

# Parse every preprocessed short_desc once, so each spaCy Doc is reused for
# all of its comparisons instead of being re-parsed per pair.
processed_data_nlp = [
    (tup.id, tup.product, nlp(tup.short_desc))
    for tup in processed_data_df.itertuples()
]
processed_data_nlp_df = pd.DataFrame(processed_data_nlp, columns=['id','product','short_desc'])
del processed_data_nlp

# Only bug reports of the same product are compared, so pair them up inside
# each product group rather than walking every pair of the whole data set
# and discarding the cross-product ones.
similarities_score_list = []

for _, product_group in processed_data_nlp_df.groupby('product', sort=False):
    ordered_group = product_group.sort_values('id')
    group_ids = ordered_group['id'].tolist()
    group_docs = ordered_group['short_desc'].tolist()
    for pos1 in range(len(group_ids)):
        for pos2 in range(pos1 + 1, len(group_ids)):
            # Strict "<" keeps the original rule: a pair is (smaller id, larger id)
            # and reports sharing the same id are never paired.
            if group_ids[pos1] < group_ids[pos2]:
                similarity_score = group_docs[pos1].similarity(group_docs[pos2])
                similarities_score_list.append((group_ids[pos1], group_ids[pos2], similarity_score))

# Sort by score, highest first
sorted_similarities_score = sorted(similarities_score_list, key=lambda tup: tup[2], reverse=True)
del similarities_score_list

# Convert to dataframe
word2vec_similarities_score_df = pd.DataFrame(sorted_similarities_score, columns=['id1','id2','score'] )
del sorted_similarities_score

print("--- %s seconds ---" % (time.time() - start_time)) # show the time of process
word2vec_similarities_score_df.shape

In [0]:
# Model-2: Similarity Score - tf-idf

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
start_time = time.time()

X_train = processed_data_df['short_desc']

# Vectorization
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(X_train)

# Pairwise similarity of every document with every other document
tfidf_cosine_similarities = linear_kernel(tfidf)

shape = tfidf_cosine_similarities.shape[0]

# Take the strict upper triangle (index2 > index1) in one vectorised step.
# np.triu_indices yields the pairs in the same row-major order as a nested
# "for index1 / for index2" loop would.
pair_index1, pair_index2 = np.triu_indices(shape, k=1)
pair_scores = tfidf_cosine_similarities[pair_index1, pair_index2]

# Sort by score, highest first. A stable sort on the negated scores keeps
# tied pairs in their original order, as sorted(..., reverse=True) does.
descending = np.argsort(-pair_scores, kind='stable')

# Convert to dataframe
tfidf_cosine_similarities_score_df = pd.DataFrame(
    {
        'id1': pair_index1[descending],
        'id2': pair_index2[descending],
        'score': pair_scores[descending],
    },
    columns=['id1','id2','score'],
)
del pair_index1, pair_index2, pair_scores, descending

# Convert index to id
index_to_id(tfidf_cosine_similarities_score_df)

# Remove record of diff product
remove_diff_product_score(tfidf_cosine_similarities_score_df)

print("--- %s seconds ---" % (time.time() - start_time)) # show the time of process

tfidf_cosine_similarities_score_df.shape

Model-3: Similarity Score - BM25 (Okapi)

In [0]:
from rank_bm25 import BM25Okapi
start_time = time.time()

# Tokenise each preprocessed short_desc on single spaces
processed_corpus_list = [
    short_desc.split(" ") for short_desc in processed_data_df['short_desc']
]

# Create a BM25 object with the corpus
bm25 = BM25Okapi(processed_corpus_list)

# Number of documents, taken from the corpus itself rather than from a
# variable left over by the scoring loop.
shape = len(processed_corpus_list)

# Only the pairs with index2 > index1 are kept, so store just that part of
# each score row instead of holding every full row until the end.
pair_count = shape * (shape - 1) // 2
pair_index1 = np.empty(pair_count, dtype=np.int64)
pair_index2 = np.empty(pair_count, dtype=np.int64)
pair_scores = np.empty(pair_count, dtype=np.float64)

offset = 0
for index1, query in enumerate(processed_corpus_list):
    # Score of every document in the corpus against this document used as the query
    doc_scores = bm25.get_scores(query)
    width = shape - index1 - 1
    pair_index1[offset:offset + width] = index1
    pair_index2[offset:offset + width] = np.arange(index1 + 1, shape)
    pair_scores[offset:offset + width] = doc_scores[index1 + 1:]
    offset += width

# Sort by score, highest first. A stable sort on the negated scores keeps
# tied pairs in their original order, as sorted(..., reverse=True) does.
descending = np.argsort(-pair_scores, kind='stable')

# Convert to dataframe
BM24_similarity_score_sort_df = pd.DataFrame(
    {
        'id1': pair_index1[descending],
        'id2': pair_index2[descending],
        'score': pair_scores[descending],
    },
    columns=['id1','id2','score'],
)
del pair_index1, pair_index2, pair_scores, descending

# Convert index to id
index_to_id(BM24_similarity_score_sort_df)

# Remove record of diff product
remove_diff_product_score(BM24_similarity_score_sort_df)

print("--- %s seconds ---" % (time.time() - start_time)) # show the time of process

BM24_similarity_score_sort_df.shape

Assessing the Model Accuracy

In [0]:
# Load the known duplicate relations of the Apache data set; the assessment
# cell reads each row's `id` and `dup` fields.
duplicate_org = pd.read_csv(
    'https://zenodo.org/record/400614/files/apache.relations.csv?download=1',
    sep=',',
)

In [0]:
word2vec_df = word2vec_similarities_score_df
tfidf_df = tfidf_cosine_similarities_score_df
BM24_df = BM24_similarity_score_sort_df


def _pair_score_lookup(score_df):
    """Map each unordered id pair ``(low id, high id)`` to its score.

    The arrays are walked back to front so that, if a pair occurs more than
    once, the score of its first row is the one left in the dict.
    """
    first_ids = score_df['id1'].to_numpy()
    second_ids = score_df['id2'].to_numpy()
    low_ids = np.minimum(first_ids, second_ids)[::-1].tolist()
    high_ids = np.maximum(first_ids, second_ids)[::-1].tolist()
    scores = score_df['score'].to_numpy()[::-1].tolist()
    return dict(zip(zip(low_ids, high_ids), scores))


# Build the lookups once instead of scanning every score table per duplicate.
word2vec_lookup = _pair_score_lookup(word2vec_df)
tfidf_lookup = _pair_score_lookup(tfidf_df)
BM24_lookup = _pair_score_lookup(BM24_df)
short_desc_by_id = data_org.drop_duplicates('id').set_index('id')['short_desc'].to_dict()

duplicate_score_table_list = []

for tup in duplicate_org.itertuples():
    # The score tables hold each pair once, in whichever order it was generated,
    # so look the pair up independent of order. A pair that was never scored
    # (e.g. the two reports belong to different products, or one of them had no
    # short_desc) gets NaN instead of raising IndexError.
    pair_key = (min(tup.id, tup.dup), max(tup.id, tup.dup))
    word2vec_score = word2vec_lookup.get(pair_key, np.nan)
    tfidf_score = tfidf_lookup.get(pair_key, np.nan)
    BM24F_score = BM24_lookup.get(pair_key, np.nan)

    # Original (unprocessed) descriptions; None when the id is not in data_org
    short_desc1 = short_desc_by_id.get(tup.id)
    short_desc2 = short_desc_by_id.get(tup.dup)

    duplicate_score_table_list.append([tup.id,tup.dup,word2vec_score,tfidf_score,BM24F_score,short_desc1,short_desc2])

duplicate_score_table_df = pd.DataFrame(duplicate_score_table_list, columns=['id1','dup','word2vec_score','tfidf_score','BM24F_score','short_desc1','short_desc2'])
del duplicate_score_table_list, word2vec_lookup, tfidf_lookup, BM24_lookup

In [0]:
# Display the per-duplicate score table built in the previous cell
# (`score_table` is not defined anywhere in this notebook).
duplicate_score_table_df