## Sentiment analysis on messages using modified SentiCR

#### Among the various classifiers tried, XGBoost and GradientBoostingClassifier performed best with accuracies ~0.86
#### The classifier currently predicts one of three sentiment labels: -1 (Negative), 0 (Neutral), 1 (Positive)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
from numpy import save, load
import sqlalchemy as s
import unicodedata
import time
from sqlalchemy import create_engine

# SentiCR lives in the local Sentiment module.
from Sentiment import SentiCR

# Instantiate the analyzer with the XGB algorithm; per the original note this
# performs training, or uses an already trained model if one exists.
sentiment_analyzer = SentiCR(algo="XGB")


In [None]:
# Connection to local Postgres database
import json  # needed by json.load below; not part of the notebook's import cell

# Credentials come from config.json, which must provide the keys
# user, password, host, port and database.
with open("config.json") as config_file:
    config = json.load(config_file)

# The dialect must be spelled "postgresql": SQLAlchemy removed the old
# "postgres" alias in 1.4, so "postgres+psycopg2" fails to load there.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    config['user'],
    config['password'],
    config['host'],
    config['port'],
    config['database'],
)

# Put the augur_data schema on the search path of every connection.
dbschema = 'augur_data'
engine = create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [None]:
# Repository whose messages are analysed; also used later in the CSV file name.
repo_id = 25774

# Fetch PR and issue messages of repo_id
# The statement is the UNION of two selects over augur_data.message: one joined
# through pull_request_message_ref to pull_requests, the other joined through
# issue_message_ref to issues. :repo_id is a bound parameter supplied below.
# NOTE(review): repo_id in the WHERE clauses is unqualified; if more than one
# of the joined tables has a repo_id column the database may reject it as
# ambiguous -- confirm against the schema.
join_SQL = s.sql.text("""
       select message.msg_id, msg_timestamp,  msg_text from augur_data.message
left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id 
left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id
where repo_id = :repo_id
UNION
select message.msg_id, msg_timestamp, msg_text from augur_data.message
left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id 
left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id
where repo_id = :repo_id
""")

# Transfer to Pandas df
# Resulting columns: msg_id, msg_timestamp, msg_text.
df_message = pd.read_sql_query(join_SQL, engine, params={'repo_id': repo_id})

In [None]:
# Display the messages fetched for repo_id.
df_message

### Get predicted sentiment labels

In [None]:
# Function to get sentiment score
def get_senti_score(df, col, label=False):
    """Run the sentiment analyzer over one column of a DataFrame.

    Every value of ``df[col]`` is passed, in row order, to
    ``sentiment_analyzer.get_sentiment_polarity(value, label)``. The elapsed
    wall-clock time is printed once all rows have been processed.

    Args:
        df: DataFrame holding the text to analyse.
        col: Name of the column in ``df`` that contains the text.
        label: When true, each analyzer result is unpacked as a
            ``(label, score)`` pair; otherwise the result is stored as the
            score as-is.

    Returns:
        ``(labels, scores)`` as two NumPy arrays when ``label`` is true,
        otherwise only the ``scores`` array. Arrays are empty for an empty
        DataFrame.
    """
    start_time = time.time()
    labels = []
    scores = []
    # Iterate the column directly instead of df.iloc[i][col], which builds a
    # full row Series on every iteration just to read a single cell.
    for text in df[col]:
        result = sentiment_analyzer.get_sentiment_polarity(text, label)
        if label:
            predicted_label, score = result
            labels.append(predicted_label)
            scores.append(score)
        else:
            scores.append(result)
    scores = np.array(scores)
    labels = np.array(labels)
    print("--- %s seconds ---" % (time.time() - start_time))
    if label:
        return (labels, scores)
    return scores

In [None]:
# Score every fetched message, then store the predicted label and the score
# as two new columns next to the text.
message_labels, message_scores = get_senti_score(df_message, 'msg_text', label=True)
df_message['senti_label'] = message_labels
df_message['senti_score'] = message_scores

In [None]:
# Load the custom test set and predict a label and score for its 'Text' column.
df_test = pd.read_csv('mod_test.csv')
test_labels, test_scores = get_senti_score(df_test, 'Text', label=True)
df_test['pred_senti_label'] = test_labels
df_test['pred_senti_score'] = test_scores

### Evaluating on custom test set

In [None]:
# Results using XGB as classifier
from sklearn.metrics import accuracy_score

# Compare the 'score' column of the test set with the predicted labels.
accuracy = accuracy_score(df_test['score'], df_test['pred_senti_label'])
print(f'Accuracy: {accuracy}')

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test set's 'score' column against the predictions.
confusion = confusion_matrix(df_test['score'], df_test['pred_senti_label'])
# Heading, a blank line, then the matrix itself.
print('Confusion Matrix\n', confusion, sep='\n')

In [None]:
# Non-null counts of every column per value of 'score' in the test set.
df_test.groupby('score').count()

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Look the two columns up once instead of in every metric call.
y_true = df_test['score']
y_pred = df_test['pred_senti_label']

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_true, y_pred)))

# Precision, recall and F1 under each averaging strategy. A blank line
# separates the groups, so the last group gets no trailing newline.
averages = ('micro', 'macro', 'weighted')
for position, average in enumerate(averages):
    title = average.capitalize()
    separator = '\n' if position < len(averages) - 1 else ''
    print('{} Precision: {:.2f}'.format(
        title, precision_score(y_true, y_pred, average=average)))
    print('{} Recall: {:.2f}'.format(
        title, recall_score(y_true, y_pred, average=average)))
    print('{} F1-score: {:.2f}{}'.format(
        title, f1_score(y_true, y_pred, average=average), separator))

print('\nClassification Report\n')
# NOTE(review): target_names assumes all three classes occur in the data, in
# the order Negative, Neutral, Positive -- confirm for small test sets.
print(classification_report(y_true, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

In [None]:
# Save the scored messages to senti_repo_<repo_id>.csv, without the index column.
df_message.to_csv(f'senti_repo_{repo_id}.csv', index=False)

In [None]:
# Display the messages together with their predicted sentiment columns.
df_message

### Proceed to timeseries analysis using sentiment trends

In [None]:
# Work on a separate frame: parse msg_timestamp into a datetime 'date' column
# and drop the raw timestamp column.
df = (
    df_message
    .assign(date=pd.to_datetime(df_message["msg_timestamp"]))
    .drop(columns=['msg_timestamp'])
)

In [None]:
# Number of distinct timestamps among the messages.
df['date'].nunique()

### 1. Grouping sentiments weekly for visualization

In [None]:
# Order the messages chronologically, then count the messages per sentiment
# label in each weekly bin; unstack() turns the labels into columns.
df = df.sort_values(by='date')
# "W" is the canonical weekly offset alias; the lowercase "w" spelling is not
# the documented form and is not reliable across pandas releases.
df1 = df.groupby(pd.Grouper(key='date', freq="W"))['senti_label'].value_counts().unstack()

In [None]:
# Replace missing counts with zero, then append the per-week row sum.
df1 = df1.fillna(0)
df1 = df1.assign(total=df1.sum(axis=1))
# NOTE(review): the rename below assumes exactly three label columns in the
# order -1, 0, 1 followed by the total -- confirm all labels occur in the data.
df1.columns = ['Negative','Neutral','Positive','Total']
# Keep only weeks that have at least one positive or negative message.
df1 = df1.loc[(df1['Positive'] + df1['Negative']).ne(0)]

In [None]:
# Display the weekly sentiment counts.
df1

In [None]:
import matplotlib.pyplot as plt

# One line per sentiment class over the weekly index; the column name doubles
# as the legend label.
fig, ax = plt.subplots(figsize=(20, 10))
for sentiment, colour in (('Neutral', 'blue'), ('Positive', 'green'), ('Negative', 'orange')):
    ax.plot(df1.index, df1[sentiment], color=colour, label=sentiment)
plt.title('Weekly Sentiment trend', fontsize=14)
plt.legend()
plt.show()

### 2. Anomaly detection based on trend

#### An Isolation Forest is applied to two features: the ratio of positive messages to total messages and the ratio of negative messages to total messages
#### Ratios are used instead of raw counts so that the total number of messages in each week is taken into account

In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Keep only weeks with at least one positive or negative message. The explicit
# copy makes the column assignments below write to an independent frame rather
# than to a filtered slice (which pandas flags as chained assignment).
df1 = df1[df1['Positive'] + df1['Negative'] != 0].copy()

# Share of positive and negative messages in each week's total.
df1['PosR'] = df1['Positive'] / df1['Total']
df1['NegR'] = df1['Negative'] / df1['Total']
# NOTE: weeks without negative messages yield inf here; this column is not
# used as a model feature below.
df1['PNRatio'] = df1['Positive'] / df1['Negative']

# Fit the isolation forest on the two ratios and label every week in one
# step; the fixed random_state keeps the result reproducible.
features = ['PosR', 'NegR']
clf = IsolationForest(n_estimators=100, max_samples='auto', max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
pred = clf.fit_predict(df1[features])
df1['anomaly'] = pred

# Rows labelled -1 are the ones plotted as anomalies.
anomaly = df1.loc[df1['anomaly'] == -1]

# Weekly totals with positive/negative counts, anomalous weeks marked in red.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(df1.index, df1['Total'], color='blue', label='Normal')
ax.plot(df1.index, df1['Positive'], color='green', label='Positive')
ax.plot(df1.index, df1['Negative'], color='orange', label='Negative')
ax.scatter(anomaly.index, anomaly['Total'], color='red', label='Anomaly')
plt.title('Anomaly in sentiment trend', fontsize=14)
plt.legend()
plt.show()


-------------------------------------

### 3. Next step is to calculate the overall sentiment score for every issue & PR

#### Study how sentiment evolves over the lifecycle of each issue and PR
#### This will also be used as an additional feature for qualitatively analyzing the issues and PRs of every repo