## Detecting novel messages based on message reconstruction errors

In [9]:
import pandas as pd
import numpy as np
import re
from numpy import save, load
import sqlalchemy as s
import unicodedata
import nltk
import string
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer
from sqlalchemy import create_engine
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [2]:
# Connection to local Postgres database

In [7]:
repo_id = 25827

# Fetch PR and issue messages of repo_id
join_SQL = s.sql.text("""
       select message.msg_id, msg_timestamp,  msg_text from augur_data.message
left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id 
left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id
where repo_id = :repo_id 
UNION
select message.msg_id, msg_timestamp, msg_text from augur_data.message
left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id 
left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id
where repo_id = :repo_id""")

df_message = pd.read_sql_query(join_SQL, engine, params={'repo_id': repo_id, 'begin': '2020-03-11 17:21:43'})

In [8]:
df_message.head()

Unnamed: 0,msg_id,msg_timestamp,msg_text
0,1880837,2020-03-11 17:21:43,> Isn't that the point of stabilising an IDE d...
1,2054646,2020-04-06 19:06:06,goodbye friend!
2,1883945,2020-02-28 18:46:02,"> No, at least not intentionally.\r\n\r\nCould..."
3,1883956,2020-03-10 10:54:39,@mauromol Thanks a lot for trying the latest C...
4,1933586,2018-12-05 06:07:18,Doesn't seem to be necessary. Extension pack l...
...,...,...,...
1861,1859406,2020-02-25 11:00:08,"Hey @sandorApati, hard to say remotely what ex..."
1862,1933712,2018-10-09 15:25:58,> I can confirm that the Spring-Data JPA Conte...
1863,1942947,2020-04-06 19:03:29,![image](https://user-images.githubusercontent...
1864,1922858,2020-04-01 16:27:00,Closing because of no response from @reszy wit...


In [5]:
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [6]:
## Preprocessing text

import nltk
import string
from nltk.tokenize import word_tokenize

punc=list(string.punctuation)

stopword = nltk.corpus.stopwords.words('english')
snowBallStemmer = SnowballStemmer("english")

# Expanding contractions
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# Removing stop words, punctuations, spaces, stemming...
def clean_text(text):
    soup = BeautifulSoup(text)
    for tag in soup.find_all('strong'):
        tag.replaceWith('')
        text = soup.get_text()

    text = re.sub('\n', ' ', text)
    text = re.sub('\r', ' ', text)
    text = re.sub('[()){}]', ' ', text)
    text = re.sub('\<[^<>]*\>', '', text)
    text = re.sub('\`[^``]*\`', '', text)
    
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Removes personal mentions like @abc, and email addresses
    s = re.sub(r'\w*@\w*', ' ', text)   

    # Removes attached links
    s=' '.join(word for word in s.split(' ') if not word.startswith('http'))   

    # Separates joint words
    # Removes punctuation
    s = re.sub('[\.\-\_\\/&]', ' ', s)
    s = "".join([word.lower() for word in s if word not in punc])
    s = word_tokenize(s)

    # Stemming
    s = " ".join([snowBallStemmer.stem(word) for word in s if len(word)<=30 and word not in stopword])              

    # Tokenization
    s = re.sub('[0-9]+', '', s)
    s = re.sub('lgtm', 'look good', s)
    return s

# Normalize corpus
def normalize_corpus(text,contraction_expansion=True,clean=True):
    if contraction_expansion:
        text = expand_contractions(text)
    if clean:
        text = clean_text(text)
    return text

In [7]:
df_message['cleaned_msg_text'] = df_message['msg_text'].map(lambda x: normalize_corpus(x))

In [8]:
df_message

Unnamed: 0,msg_id,msg_timestamp,msg_text,cleaned_msg_text
0,1880837,2020-03-11 17:21:43,> Isn't that the point of stabilising an IDE d...,point stabilis ide despit usag extern depend l...
1,2054646,2020-04-06 19:06:06,goodbye friend!,goodby friend
2,1883945,2020-02-28 18:46:02,"> No, at least not intentionally.\r\n\r\nCould...",least intent could batch event work areadi don...
3,1883956,2020-03-10 10:54:39,@mauromol Thanks a lot for trying the latest C...,thank lot tri latest ci build attach log great...
4,1933586,2018-12-05 06:07:18,Doesn't seem to be necessary. Extension pack l...,seem necessari extens pack list extens without...
...,...,...,...,...
1861,1859406,2020-02-25 11:00:08,"Hey @sandorApati, hard to say remotely what ex...",hey hard say remot exact go wrong doubl check ...
1862,1933712,2018-10-09 15:25:58,> I can confirm that the Spring-Data JPA Conte...,confirm spring data jpa content assist work an...
1863,1942947,2020-04-06 19:03:29,![image](https://user-images.githubusercontent...,imag mean understand
1864,1922858,2020-04-01 16:27:00,Closing because of no response from @reszy wit...,close respons info tri sort thing recent seem ...


### Training Doc2Vec from custom dataset messages

In [10]:
df_all = pd.read_excel('./mod_train.xlsx', names=['Text','score'])

In [11]:
df_all = df_all.drop(['score'],axis=1)

In [12]:
df_all['cleaned_msg_text'] = df_all['Text'].map(lambda x: normalize_corpus(x))



### Split past present data

In [13]:
df_message['msg_timestamp'] = [d.date() for d in df_message['msg_timestamp']]
df_message = df_message.sort_values(by='msg_timestamp')

In [14]:
df_past = df_message[df_message['msg_timestamp'].astype(str)< '2020-04-01']
df_present = df_message[df_message['msg_timestamp'].astype(str)>= '2020-04-01']

In [15]:
df_present.shape

(566, 4)

In [16]:
df_past.shape

(1300, 4)

### Making word embeddings using Doc2Vec

In [254]:
# Transformer which converts text corpus into document vectors.

from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import preprocess_string
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

def build_model(max_epochs, vec_size, alpha, tag_data):    
    model = Doc2Vec(vector_size=vec_size, alpha=alpha,min_alpha=0.00025, min_count=2, dm=1)
    model.build_vocab(tag_data)

    for epoch in range(max_epochs):
        model.train(skl_utils.shuffle(tag_data),
                   total_examples=model.corpus_count,
                   epochs=model.epochs)

        model.alpha -= 0.0002

        model.min_alpha = model.alpha

    model.save("doc2vec.model")
    print("Model Saved")
    return model


In [418]:
modelqq = Doc2Vec.load("doc2vec.model")

In [255]:
df_x = pd.DataFrame(df_all['cleaned_msg_text'])
tag_data = [TaggedDocument(str(row['cleaned_msg_text']).split(), [index]) for index, row in df_x.iterrows()]
# print(tag_data)
model = build_model(max_epochs=100, vec_size=300, alpha=0.01, tag_data=tag_data)
doc2vec_vectors = np.array([model.infer_vector(str(row['cleaned_msg_text']).split())for index, row in df_past.iterrows()])

Model Saved


In [256]:
model.wv.similar_by_word('bad')

[('scream', 0.48170191049575806),
 ('ugh', 0.4529797434806824),
 ('typedef', 0.44843924045562744),
 ('terribl', 0.44335469603538513),
 ('commitupdatecommand', 0.4413297772407532),
 ('readabl', 0.44015777111053467),
 ('unsaf', 0.4396229684352875),
 ('blooper', 0.435542494058609),
 ('indexof', 0.4317985773086548),
 ('slerp', 0.43164515495300293)]

### Using autoencoders

In [269]:
from keras.layers import Input, Dense
from keras.models import Model

def autoencoder(vec_input, train):

    input_dim = Input(shape = (vec_input, ))
    encoded1 = Dense(vec_input//2, activation = 'sigmoid')(input_dim)
    encoded2 = Dense(1, activation = 'sigmoid')(encoded1)

    # Decoder Layers
    decoded1 = Dense(vec_input//2, activation = 'tanh')(encoded2)
    decoded2 = Dense(vec_input, activation = 'tanh')(decoded1)

    # Combine Encoder and Deocder layers
    model = Model(inputs = input_dim, outputs = decoded2)

    # Compile the Model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
    model.summary()
    model.fit(train, train, epochs = 50)
    return model

In [270]:
ae1 = autoencoder(300, doc2vec_vectors)
pred_train = ae1.predict(doc2vec_vectors)

Model: "model_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 300)               0         
_________________________________________________________________
dense_70 (Dense)             (None, 150)               45150     
_________________________________________________________________
dense_71 (Dense)             (None, 1)                 151       
_________________________________________________________________
dense_72 (Dense)             (None, 150)               300       
_________________________________________________________________
dense_73 (Dense)             (None, 300)               45300     
Total params: 90,901
Trainable params: 90,901
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/

### Calculate reconstruction error & otsu threshold

In [296]:
def reconstruction(pred, val):
    rec_error = []
    for i in range(len(pred)):
        rec_error.append(np.linalg.norm(pred[i] - val[i]))
    rec_error = np.array(rec_error)
    return rec_error

def thresholding(rec_error, val):
    threshold = threshold_otsu(rec_error)
    normals = []
    for i in range(len(rec_error)):
        if rec_error[i] < threshold:
            normals.append(val[i])
    normals = np.array(normals)
    return threshold, normals

In [279]:
len(doc2vec_vectors)

1300

### Identify normal data from 1st AE

In [280]:
rec_error1 = reconstruction(pred_train, doc2vec_vectors)
threshold1, normal_data = thresholding(rec_error1, doc2vec_vectors)

In [281]:
len(normals)

1054

In [32]:
from sklearn.externals import joblib
# Save to file in the current working directory
joblib_file = "{r_id}_uniq.pkl".format(r_id = repo_id)
joblib.dump(auto_encoder, joblib_file)



['25827_uniq.pkl']

In [282]:
# Fitting on present data
doc2vec_vectors_test = np.array([model.infer_vector(str(row['cleaned_msg_text']).split())for index, row in df_present.iterrows()])

### Train 2nd AE with normal data alone

In [284]:
ae2 = autoencoder(300, normal_data)
predicted_vectors1 = ae2.predict(doc2vec_vectors)

Model: "model_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        (None, 300)               0         
_________________________________________________________________
dense_78 (Dense)             (None, 150)               45150     
_________________________________________________________________
dense_79 (Dense)             (None, 1)                 151       
_________________________________________________________________
dense_80 (Dense)             (None, 150)               300       
_________________________________________________________________
dense_81 (Dense)             (None, 300)               45300     
Total params: 90,901
Trainable params: 90,901
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/

In [292]:
rec_error2 = reconstruction(predicted_vectors1, doc2vec_vectors)
threshold2, normal_data = thresholding(rec_error2, doc2vec_vectors)

In [293]:
threshold2

0.20612182

### Calculate novel count on present data

In [362]:
predicted_vectors_test = ae2.predict(doc2vec_vectors_test)
rec_error_test = reconstruction(predicted_vectors_test, doc2vec_vectors_test)
c=0
for i in range(len(rec_error_test)):
    if rec_error_test[i] > threshold2:
        c+=1

### Cosine similarity analysis

In [298]:
# Determining the cosine similarity between predicted and Doc2Vec vectors, to get reconstruction errors

from scipy.spatial.distance import cosine

def key_cosine_similarity(tupple):
    return tupple[1]

def get_computed_similarities(vectors, predicted_vectors, reverse=False):
    data_size = len(df_present)
    cosine_similarities = []
    cosine_sim_values = []
    for i in range(data_size):
        cosine_sim_val = (1 - cosine(vectors[i], predicted_vectors[i]))
        cosine_similarities.append((df_present['msg_id'].iloc[i], cosine_sim_val))
        cosine_sim_values.append(cosine_sim_val)
    
    df_present['uniqueness_score'] = cosine_sim_values
    return sorted(cosine_similarities, key=key_cosine_similarity,reverse=reverse)

def display_unique(sorted_cosine_similarities):
    i=0
    unique_message_list=[]
    cos_val = []
    index, cosine_sim_val = sorted_cosine_similarities[0]
    while cosine_sim_val<=-0.1:
        if cosine_sim_val not in cos_val:
            unique_message_list.append(index)
            cos_val.append(cosine_sim_val)
            print('Message id: ', index)  
            print('Cosine Sim Val :', cosine_sim_val)
        i+=1    
        index, cosine_sim_val = sorted_cosine_similarities[i]
        
    return unique_message_list

In [299]:
# Fetching message IDs with cosine similarity <= -0.09

sorted_cosine_similarities = get_computed_similarities(vectors=doc2vec_vectors_test, predicted_vectors=predicted_vectors_test)
unique_message_list = display_unique(sorted_cosine_similarities)


Message id:  2012691
Cosine Sim Val : -0.2495095133781433
Message id:  2047053
Cosine Sim Val : -0.22886519134044647
Message id:  2032146
Cosine Sim Val : -0.21972420811653137
Message id:  2051407
Cosine Sim Val : -0.21733559668064117
Message id:  2054638
Cosine Sim Val : -0.1771014779806137
Message id:  1920180
Cosine Sim Val : -0.17360268533229828
Message id:  2085361
Cosine Sim Val : -0.16967616975307465
Message id:  2012725
Cosine Sim Val : -0.1646515429019928
Message id:  1933596
Cosine Sim Val : -0.14073914289474487
Message id:  1942982
Cosine Sim Val : -0.137152761220932
Message id:  2069007
Cosine Sim Val : -0.13674315810203552
Message id:  2059090
Cosine Sim Val : -0.13608397543430328
Message id:  2078076
Cosine Sim Val : -0.11127150803804398
Message id:  2012726
Cosine Sim Val : -0.10907010734081268
Message id:  2059111
Cosine Sim Val : -0.1054145097732544
Message id:  2073893
Cosine Sim Val : -0.10324677079916
Message id:  1942946
Cosine Sim Val : -0.10256258398294449
Messag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [300]:
# Estimated count of unique messages
print('Unique messages count: '+str(len(unique_message_list)))

Unique messages count: 19


In [301]:
# List of anomalous messages
messages = df_present[df_present['msg_id'].isin(unique_message_list)]['msg_text'].tolist()
message_dates = df_present[df_present['msg_id'].isin(unique_message_list)]['msg_timestamp'].tolist()

In [314]:
messages[12]

'Hi Martin, \r\nI could retest my app, and STS does exactly, what I was looking for. I see my cxf path, my servlet produced by ServletRegistrationBean, and also added a Servlet which is registered by @ServletComponentScan - all three appear in Request Mapping view and in the path search.\r\nThanks again, looking forward to install the stable version!!!'