### Sentiment analysis on messages

In [1]:
import pandas as pd
import numpy as np
import re
from numpy import save, load
import sqlalchemy as s
import unicodedata
import time
from sqlalchemy import create_engine
# nltk.download('all')

In [2]:
# Connection to local Postgres database

In [3]:
repo_id = 25774

# Fetch PR and issue messages of repo_id
join_SQL = s.sql.text("""
       select message.msg_id, msg_timestamp,  msg_text from augur_data.message
left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id 
left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id
where repo_id = :repo_id
UNION
select message.msg_id, msg_timestamp, msg_text from augur_data.message
left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id 
left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id
where repo_id = :repo_id
""")

# Transfer to Pandas df
df_message = pd.read_sql_query(join_SQL, engine, params={'repo_id': repo_id})

In [4]:
df_message

Unnamed: 0,msg_id,msg_timestamp,msg_text
0,1732709,2014-03-10 21:20:54,"Hi Artem, can you take another pass over this ..."
1,1734278,2014-05-01 17:01:48,Found one thing I want to change before review...
2,1728841,2013-09-23 15:18:57,We probably need to look into some options to ...
3,1727849,2013-08-12 17:35:10,Merged after review and testing. The tab compl...
4,1731943,2014-01-28 14:13:52,"> Also, looking at JobsController, those are n..."
...,...,...,...
4417,1742283,2015-04-24 23:15:38,"LGTM, rebased and merged into master 8a40e9d22..."
4418,1740237,2015-01-14 15:21:06,LGTM Starting Acceptance Test Pass.\n
4419,1726954,2013-07-23 17:52:57,I meant that your solution does modules->dirt....
4420,1736136,2014-06-23 18:10:57,Good catch. LGTM; merging.\n


In [5]:
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [6]:
## Preprocessing text

import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup

punc=list(string.punctuation)
# Retaining to identify personal mentions, sentiment
punc.remove('!')
punc.remove('?')

stopword = nltk.corpus.stopwords.words('english')
snowBallStemmer = SnowballStemmer("english")

# Expanding contractions
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# Removing stop words, punctuations, spaces, stemming...
def clean_text(text, remove_emojis):
    soup = BeautifulSoup(text)
    for tag in soup.find_all('strong'):
        tag.replaceWith('')
        text = soup.get_text()

    text = re.sub('\n', ' ', text)
    text = re.sub('\r', ' ', text)
    text = re.sub('[()){}]', ' ', text)
    text = re.sub('\<[^<>]*\>', '', text)
    text = re.sub('\`[^``]*\`', '', text)
    
    # emojis, ?, !, no/not are retained for sentiment analysis else removed
    if remove_emojis:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        punc.extend(['!','?'])
        stopword.extend(['no','not'])
    # Removes personal mentions like @abc, and email addresses
    s = re.sub(r'\w*@\w*', ' ', text)   

    # Removes attached links
    s=' '.join(word for word in s.split(' ') if not word.startswith('http'))   

    # Separates joint words
    # Removes punctuation
    s = re.sub('[\.\-\_\\/&]', ' ', s)
    s = "".join([word.lower() for word in s if word not in punc])
    s = word_tokenize(s)

    # Stemming and removing commit hashes
    if remove_emojis:
        s = " ".join([snowBallStemmer.stem(word) for word in s if len(word)<=30])
    else:
        s = " ".join([word for word in s if len(word)<=30])

    # Tokenization
    s = re.sub('[0-9]+', '', s)
    s = re.sub('lgtm', 'look good', s)
    return s

# Normalize corpus
def normalize_corpus(text,contraction_expansion=True,clean=True,remove_emojis=False):
    if contraction_expansion:
        text = expand_contractions(text)
    if clean:
        text = clean_text(text,remove_emojis)
    return text

In [7]:
df_message['cleaned_msg_text'] = df_message['msg_text'].map(lambda x: normalize_corpus(x))

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


In [8]:
df_message

Unnamed: 0,msg_id,msg_timestamp,msg_text,cleaned_msg_text
0,1732709,2014-03-10 21:20:54,"Hi Artem, can you take another pass over this ...",hi artem can you take another pass over this i...
1,1734278,2014-05-01 17:01:48,Found one thing I want to change before review...,found one thing i want to change before review
2,1728841,2013-09-23 15:18:57,We probably need to look into some options to ...,we probably need to look into some options to ...
3,1727849,2013-08-12 17:35:10,Merged after review and testing. The tab compl...,merged after review and testing the tab comple...
4,1731943,2014-01-28 14:13:52,"> Also, looking at JobsController, those are n...",also looking at jobscontroller those are not h...
...,...,...,...,...
4417,1742283,2015-04-24 23:15:38,"LGTM, rebased and merged into master 8a40e9d22...",look good rebased and merged into master cherr...
4418,1740237,2015-01-14 15:21:06,LGTM Starting Acceptance Test Pass.\n,look good starting acceptance test pass
4419,1726954,2013-07-23 17:52:57,I meant that your solution does modules->dirt....,i meant that your solution does modules dirt h...
4420,1736136,2014-06-23 18:10:57,Good catch. LGTM; merging.\n,good catch look good merging


In [9]:
# Sentiment analysis using vader lexicon

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment_vader_lexicon(statement, positive_threshold=0.5,negative_threshold=-0.5):   
    # analyze the compound sentiment for statement
    score = analyzer.polarity_scores(statement)['compound']
    # -1:negative, 0:neutral, 1:positive
    if score >= positive_threshold:
        sentiment = 1
    else:
        sentiment = -1 if score< negative_threshold else 0
    return score,sentiment

start_time = time.time()
siz = df_message.shape[0]
i = 0
pred = []
val = []
while (i<siz):
    score, sentiment = analyze_sentiment_vader_lexicon(df_message.iloc[i]['cleaned_msg_text'], positive_threshold=0.5, negative_threshold=-0.4)
    val.append(score)
    pred.append(sentiment)
    i+=1
val = np.array(val)
pred = np.array(pred)
print("--- %s seconds ---" % (time.time() - start_time))

df_message['senti_score'] = val
df_message['senti'] = pred

--- 1.5860824584960938 seconds ---


In [10]:
df_message.groupby('senti').count()

Unnamed: 0_level_0,msg_id,msg_timestamp,msg_text,cleaned_msg_text,senti_score
senti,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,238,238,238,238,238
0,3196,3196,3196,3196,3196
1,988,988,988,988,988


In [11]:
# Sentiment analysis using textblob

from textblob import TextBlob

# Get the polarity score using below function
def get_textBlob_score(statement, positive_threshold=0.5,negative_threshold=-0.5):
    score = TextBlob(statement).sentiment.polarity
    # -1:negative, 0:neutral, 1:positive
    if score >= positive_threshold:
        sentiment = 1
    else:
        sentiment = -1 if score < negative_threshold else 0
    return score,sentiment

start_time = time.time()
siz = df_message.shape[0]
i = 0
pred = []
val = []
while (i<siz):
    score, sentiment = get_textBlob_score(df_message.iloc[i]['cleaned_msg_text'], positive_threshold=0.5, negative_threshold=-0.3)    
    val.append(score)
    pred.append(sentiment)
    i+=1
val = np.array(val)
pred = np.array(pred)
print("--- %s seconds ---" % (time.time() - start_time))    

df_message['txtblob_senti_score'] = val
df_message['txtblob_senti'] = pred

--- 1.541060447692871 seconds ---


In [12]:
df_message.groupby('txtblob_senti').count()

Unnamed: 0_level_0,msg_id,msg_timestamp,msg_text,cleaned_msg_text,senti_score,senti,txtblob_senti_score
txtblob_senti,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,76,76,76,76,76,76,76
0,3582,3582,3582,3582,3582,3582,3582
1,764,764,764,764,764,764,764


In [13]:
df_message

Unnamed: 0,msg_id,msg_timestamp,msg_text,cleaned_msg_text,senti_score,senti,txtblob_senti_score,txtblob_senti
0,1732709,2014-03-10 21:20:54,"Hi Artem, can you take another pass over this ...",hi artem can you take another pass over this i...,0.4404,0,0.200000,0
1,1734278,2014-05-01 17:01:48,Found one thing I want to change before review...,found one thing i want to change before review,0.0772,0,0.000000,0
2,1728841,2013-09-23 15:18:57,We probably need to look into some options to ...,we probably need to look into some options to ...,0.8055,1,0.500000,1
3,1727849,2013-08-12 17:35:10,Merged after review and testing. The tab compl...,merged after review and testing the tab comple...,0.7614,1,0.515000,1
4,1731943,2014-01-28 14:13:52,"> Also, looking at JobsController, those are n...",also looking at jobscontroller those are not h...,0.5423,1,0.083333,0
...,...,...,...,...,...,...,...,...
4417,1742283,2015-04-24 23:15:38,"LGTM, rebased and merged into master 8a40e9d22...",look good rebased and merged into master cherr...,0.4404,0,0.700000,1
4418,1740237,2015-01-14 15:21:06,LGTM Starting Acceptance Test Pass.\n,look good starting acceptance test pass,0.7096,1,0.350000,0
4419,1726954,2013-07-23 17:52:57,I meant that your solution does modules->dirt....,i meant that your solution does modules dirt h...,0.9372,1,0.066009,0
4420,1736136,2014-06-23 18:10:57,Good catch. LGTM; merging.\n,good catch look good merging,0.7003,1,0.700000,1


In [14]:
# Consider vader/textblob and do trend anlysis..