In [256]:
# Model-1-V3

In [257]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [258]:
data = pd.read_csv('https://zenodo.org/record/400614/files/apache.csv?download=1',sep=',')

In [260]:
data.head()

Unnamed: 0,id,product,component,reporter,bug_status,resolution,priority,bug_severity,version,short_desc,opendate,dup_list,root_id,disc_id
0,2,Log4j -,Layout,bugzilla,CLOS,FIXE,P3,nor,unspe,Just testing the Boogzeela setup for log4j,2001-01-08,,,
1,3,Log4j -,Appender,matthew_scully,RESO,FIXE,P3,nor,unspe,file sharing,2001-01-08,,,
2,22,Apache h,All,greenrd,CLOS,WONT,P5,enh,2.0-H,Need hooks for user-defined error handling,2001-01-09,,,
3,27,Log4j -,Layout,richard.mccarthy,RESO,INVA,P1,nor,1.0,Error in the formatting of the (%F:%L) layout ...,2001-01-10,,,
4,29,Log4j -,Other,woge,RESO,FIXE,P3,nor,1.0,"Calling Category.error(Object, Throwable) thro...",2001-01-11,,,


**Remove NaN from the short_desc**

In [261]:
data.isnull().sum()

id                  0
product             0
component           0
reporter            0
bug_status          0
resolution          0
priority            0
bug_severity        0
version             0
short_desc         58
opendate            0
dup_list        41531
root_id         38016
disc_id         38016
dtype: int64

In [263]:
df = data[['id','product','short_desc','dup_list']]
df.head()

Unnamed: 0,id,product,short_desc,dup_list
0,2,Log4j -,Just testing the Boogzeela setup for log4j,
1,3,Log4j -,file sharing,
2,22,Apache h,Need hooks for user-defined error handling,
3,27,Log4j -,Error in the formatting of the (%F:%L) layout ...,
4,29,Log4j -,"Calling Category.error(Object, Throwable) thro...",


Remove Blank Records

In [264]:
blanks = []

for index,id,product,short_desc,dup_list in df.itertuples():
    if type(short_desc)==str:
        if short_desc.isspace():
            blanks.append(index)
    else:
        blanks.append(index)
len(blanks)  

58

In [265]:
df.drop(blanks,inplace=True)
df.isnull().sum()

id                0
product           0
short_desc        0
dup_list      41474
dtype: int64

**Similarity Methods:**

In [266]:
#conda install -c conda-forge spacy
#conda install -c conda-forge spacy-lookups-data
#!python -m spacy download en_core_web_lg

In [267]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')

In [330]:
def tokenize(df,id):
    s = df[df['id'] == int(id)]
    s = s.short_desc.to_string(index=False)
    s = s[1:]
    # print ('\n'+ s)
    doc = nlp(s)
    #for token in doc:
    #    print(token.text, end=' | ')
    #displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})    
    return doc

In [281]:
doc1 = tokenize(df,355)
doc2 = tokenize(df,475)

**Remove Puntuations**

In [307]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{20}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [308]:
#Remove Puntuations
def remove_puntuation(doc):
    #show_lemmas(doc)
    s = []
    
    for token in doc:
        if token.is_punct == False:
            s.append(token) 
            
    #Convert list to string
    str1 = ""    
    for ele in s:  
        str1 += str(ele) +' '    
        
    return nlp(str1[:-1]) #return doc 

In [309]:
doc_removed_puntuation = remove_puntuation(doc1)

In [310]:
doc1.text

"request.getCookies() doesn't work BugRat Repor..."

In [311]:
doc_removed_puntuation.text

"request.getCookies does n't work BugRat Repor"

In [312]:
doc2 = remove_puntuation(doc2)

In [313]:
show_lemmas(doc1)

request.getCookies   NOUN   3252721639640231478    request.getcookie
(                    PUNCT  12638816674900267446   (
)                    PUNCT  3842344029291005339    )
does                 AUX    2158845516055552166    do
n't                  PART   447765159362469301     not
work                 VERB   10038440415813069799   work
BugRat               PROPN  7680755314642684400    BugRat
Repor                PROPN  9807858581063934278    Repor
...                  PUNCT  10875615029400813363   ...


**Remove Stop Words**

In [314]:
#Remove Stop Words
def remove_stop_word(doc): # accept string
    s = []
    for token in doc:
        if token.is_stop == False:
            s.append(token) 
    #Convert list to string
    str1 = ""    
    for ele in s:  
        str1 += str(ele) +' '     
    return nlp(str1[:-1]) #return doc 

In [315]:
doc1 = remove_stop_word(doc1)
doc1.text

'request.getCookies ( ) work BugRat Repor ...'

In [316]:
doc2 = remove_stop_word(doc2)
doc2.text

'SimpleTcpConnector work BugRat Report#802'

In [317]:
print('Similarity: ' + str(doc1.similarity(doc2)))

Similarity: 0.5940727380955374


**Build a function of all previous work**

In [318]:
# Put all prevouse work together in a function
def prep_func(df,id):
    s = tokenize(df,id)
    s = remove_puntuation(s)
    doc = remove_stop_word(s)
    return doc

In [332]:
# Prepare the vectors for each document
import time

start_time = time.time()
c = 0
sample = 1000 # Number of samples
prepared = []
l = len(df.index)

#prepared = map(prep_func, df, df['id'])

for i,id,pro,sd,db in df.itertuples():
    doc = prep_func(df,id)
    prepared.append((id,pro,doc))   
    # Show the progress in the output
    c = c + 1
    if c % 100 == 0:
        print('Progress: %s %%' % round(c/sample*100, 2), end="\r", flush=True)
    if c > sample:
        print('\nDone')
        break
        
print("--- %s seconds ---" % (time.time() - start_time))  # show the time of process

Progress: 100.0 %
Done
--- 30.052579879760742 seconds ---


In [333]:
#prepared = list(prepared)

In [334]:
c = 0
for i in prepared:
    print(i[0], '\t',i[1], '\t', i[2].text)
    c = c + 1
    if c > 5:
        break

2 	 Log4j - 	 testing Boogzeela setup log4j
3 	 Log4j - 	 file sharing
22 	 Apache h 	 Need hooks user defined error handling
27 	 Log4j - 	 Error formatting F:%L layout
29 	 Log4j - 	 Calling Category.error(Object Throwable thro
31 	 Log4j - 	 Conversion log4j Priority types NT Event


In [122]:
# Calculate one by one similarity score
# x[0]: id , x[1]: product, x[2]: short_desc

start_time = time.time()
similarities = []
l = len(prepared)
c = 0

for i in prepared:
    c = c + 1
    for j in prepared:
        if i[0] != j[0] and i[1] == j[1]: # if two bug reports belong to the same product then check the similarity
            score = i[2].similarity(j[2])
            similarities.append((i[0],j[0],score))
            
    # Show the progress in the output        
    if c % 10 == 0:
        progress = c/l*100
        print('Progress: %s %%' % round(progress, 1), end="\r", flush=True)
print('\nDone')
print("--- %s seconds ---" % (time.time() - start_time)) # show the time of process

Progress: 99.8 %
Done
--- 76.72808694839478 seconds ---


In [252]:
# Sort based on the score
sorted_similarities = sorted(similarities, key=lambda tup: tup[2], reverse=True)
result_df = pd.DataFrame(sorted_similarities, columns=['id1','id2','score'] )
result_df.head()

Unnamed: 0,id1,id2,score
0,357,385,1.0
1,385,357,1.0
2,2,2,1.0
3,3,3,1.0
4,22,22,1.0


In [250]:
result = {}
for i,id1,id2,score in result_df.itertuples():
    result = 
    
result_df.to_dict()

TypeError: list expected at most 1 arguments, got 2

In [249]:
len(result)

1

**Test the outputs**

In [227]:
def test(df,id1,id2):
    d1 = tokenize(df,id1)
    print('s1: ' + d1.text)
    d2 = tokenize(df,id2)
    print('s2: ' + d2.text)
    d1 = remove_puntuation(d1)
    d2 = remove_puntuation(d2)
    doc1 = remove_stop_word(d1)
    print('\nDoc1: ' + doc1.text + '\t')
    doc2 = remove_stop_word(d2)
    print('Doc2: ' + doc2.text + '\t')
    print('\nSimilarity: ' + str(doc1.similarity(doc2)))

In [236]:
test(df,317,341)

s1: Tomcat 3.2 reports 404 when submitting a jsp i...
s2: Tomcat 3.2 reports 404 when submitting a jsp B...

Doc1: Tomcat 3.2 reports 404 submitting jsp	
Doc2: Tomcat 3.2 reports 404 submitting jsp B	

Similarity: 0.9673615744389031


**Assessment of the accuracy**