### Sentiment analysis on messages

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
from numpy import save, load
import sqlalchemy as s
import unicodedata
import time
from sqlalchemy import create_engine

# Importing SentiCR
from Sentiment import SentiCR
# Build the analyzer that every scoring cell below relies on.
# Performs training, or uses a trained model if one exists.
# The `algo` param selects the classifier, e.g. 'XGB' or 'GBT' (eventually the
# best one will be used); both are compared in the accuracy cells at the end.
sentiment_analyzer = SentiCR(algo = 'XGB' )


Using TensorFlow backend.


Using default train set
Reading data from oracle db..
Training classifier model..
Preprocessing done
Tfidf done
Training done


In [2]:
# Connection to local Postgres database.
# `engine` is required by the query cell below. The connection URL is read from
# the AUGUR_DB_URL environment variable so that no credentials are stored in the
# notebook, e.g. postgresql://user:password@localhost:5432/augur
import os
engine = create_engine(os.environ['AUGUR_DB_URL'])

In [3]:
# Id of the repository whose messages are analysed.
repo_id = 25774

# Fetch PR and issue messages of repo_id
# Two SELECTs are combined: messages linked to pull requests and messages
# linked to issues. UNION (rather than UNION ALL) drops duplicate rows.
# NOTE(review): the WHERE clause filters on repo_id after the LEFT OUTER JOINs;
# presumably repo_id comes from pull_requests / issues (confirm against the
# schema), in which case unmatched messages are discarded and the joins
# effectively behave like inner joins.
join_SQL = s.sql.text("""
       select message.msg_id, msg_timestamp,  msg_text from augur_data.message
left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id 
left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id
where repo_id = :repo_id
UNION
select message.msg_id, msg_timestamp, msg_text from augur_data.message
left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id 
left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id
where repo_id = :repo_id
""")

# Transfer to Pandas df; :repo_id is supplied as a bound parameter rather than
# being formatted into the SQL string.
df_message = pd.read_sql_query(join_SQL, engine, params={'repo_id': repo_id})

In [4]:
# Inspect the fetched messages (columns: msg_id, msg_timestamp, msg_text).
df_message

Unnamed: 0,msg_id,msg_timestamp,msg_text
0,1732709,2014-03-10 21:20:54,"Hi Artem, can you take another pass over this ..."
1,1734278,2014-05-01 17:01:48,Found one thing I want to change before review...
2,1728841,2013-09-23 15:18:57,We probably need to look into some options to ...
3,1727849,2013-08-12 17:35:10,Merged after review and testing. The tab compl...
4,1731943,2014-01-28 14:13:52,"> Also, looking at JobsController, those are n..."
...,...,...,...
4417,1742283,2015-04-24 23:15:38,"LGTM, rebased and merged into master 8a40e9d22..."
4418,1740237,2015-01-14 15:21:06,LGTM Starting Acceptance Test Pass.\n
4419,1726954,2013-07-23 17:52:57,I meant that your solution does modules->dirt....
4420,1736136,2014-06-23 18:10:57,Good catch. LGTM; merging.\n


In [5]:
# Function to get sentiment score
def get_senti_score(df, col, analyzer=None):
    """Score every value of ``df[col]`` and store the results in ``df['pred_senti']``.

    The frame is modified in place: a ``pred_senti`` column is added (or
    overwritten) with one score per row, in row order. The elapsed wall-clock
    time is printed once scoring has finished.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts to score; mutated in place.
    col : str
        Name of the column in ``df`` that contains the texts.
    analyzer : object, optional
        Any object exposing ``get_sentiment_polarity(text)``. Defaults to the
        notebook-level ``sentiment_analyzer``.

    Returns
    -------
    None
    """
    if analyzer is None:
        # Fall back to the analyzer built at the top of the notebook.
        analyzer = sentiment_analyzer

    start_time = time.perf_counter()
    # Iterate over the column itself: `df.iloc[i][col]` would build a full row
    # Series for every message just to read a single cell.
    scores = [analyzer.get_sentiment_polarity(text) for text in df[col]]
    # Assigned as an array so the values are placed positionally, regardless
    # of the frame's index labels.
    df['pred_senti'] = np.array(scores)
    print("--- %s seconds ---" % (time.perf_counter() - start_time))


In [6]:
# Getting senti score on our data.
# This adds a 'pred_senti' column to df_message in place and prints the
# elapsed time.
get_senti_score(df_message,'msg_text')

--- 10.944270372390747 seconds ---


In [7]:
# Inspect the messages again, now with the added 'pred_senti' column.
df_message

Unnamed: 0,msg_id,msg_timestamp,msg_text,pred_senti
0,1732709,2014-03-10 21:20:54,"Hi Artem, can you take another pass over this ...",1.0
1,1734278,2014-05-01 17:01:48,Found one thing I want to change before review...,0.0
2,1728841,2013-09-23 15:18:57,We probably need to look into some options to ...,0.0
3,1727849,2013-08-12 17:35:10,Merged after review and testing. The tab compl...,1.0
4,1731943,2014-01-28 14:13:52,"> Also, looking at JobsController, those are n...",0.0
...,...,...,...,...
4417,1742283,2015-04-24 23:15:38,"LGTM, rebased and merged into master 8a40e9d22...",1.0
4418,1740237,2015-01-14 15:21:06,LGTM Starting Acceptance Test Pass.\n,1.0
4419,1726954,2013-07-23 17:52:57,I meant that your solution does modules->dirt....,0.0
4420,1736136,2014-06-23 18:10:57,Good catch. LGTM; merging.\n,1.0


In [8]:
# Number of messages per predicted sentiment value (count() reports the
# non-null entries of every remaining column).
df_message.groupby('pred_senti').count()

Unnamed: 0_level_0,msg_id,msg_timestamp,msg_text
pred_senti,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,207,207,207
0.0,3274,3274,3274
1.0,941,941,941


In [6]:
# Getting scores on custom test set.
# The texts are read from the 'Text' column of mod_test.csv; the predictions
# are written to df_test['pred_senti'] in place.
df_test = pd.read_csv('mod_test.csv')
get_senti_score(df_test,'Text')

--- 1.3393032550811768 seconds ---


In [10]:
# Results using GradientBoostingTree as classifier
# NOTE(review): the analyzer built at the top of this notebook uses algo='XGB',
# so the accuracy printed below presumably comes from an earlier run with the
# GBT option. Re-create the analyzer with that option and re-score df_test
# before running this cell to reproduce it.
from sklearn.metrics import accuracy_score
# Compare the labels in df_test['score'] with the predictions in df_test['pred_senti'].
accuracy = accuracy_score(df_test['score'], df_test['pred_senti'])
print('Accuracy: '+str(accuracy))

Accuracy: 0.8908382066276803


In [7]:
# Results using XGB as classifier (the algo passed when the analyzer is built
# at the top of this notebook).
from sklearn.metrics import accuracy_score
# Compare the labels in df_test['score'] with the predictions in df_test['pred_senti'].
accuracy = accuracy_score(df_test['score'], df_test['pred_senti'])
print('Accuracy: '+str(accuracy))

Accuracy: 0.8635477582846004
