## Preparing a custom annotated dataset for sentiment analysis

In [1]:
import pandas as pd
import numpy as np
import re
from numpy import save, load

#### Considered the JIRA and oracle datasets
#### Jira from SentiEmoji, oracle from SentiCR to incorporate technical jargon
#### StackOverflow sentiments from SentiEmoji to incorporate general sentiment

In [61]:
oracle = pd.read_excel('./oracle.xlsx', names=['Text','score'])
jira = pd.read_csv('Jira.txt', sep="\t", header=None,names=['Text','score'])
so = pd.read_csv('StackOverflow.txt', sep="\t", header=None,names=['Text','score'])

In [62]:
oracle

Unnamed: 0,Text,score
0,- Should be like below:\ntextDirection = SWT.A...,0
1,"""""""create a vdsm.config.config clone, modified...",0
2,"""Add test(s) performing the static code analys...",0
3,"""apt-get"" is distro specific... perhaps make i...",0
4,"""easy"" is marketing; let the code speak for it...",0
...,...,...
1594,you'll need someone with some maven experience...,0
1595,Your memory is too smalll. Consider buying a R...,0
1596,You're preforming this check multiple times.\n...,0
1597,"You're right. Ivan,tenant_id is non-admin tena...",0


In [63]:
so

Unnamed: 0,Text,score
0,"Not without logging or tracing, I'm afraid",negative
1,Already answered to wrap it but here's an . Go...,positive
2,I want to be able to start External Applicatio...,neutral
3,If you also want to display the date it can be...,neutral
4,"Firstly, that's not how works. You have to spe...",neutral
...,...,...
4418,"er, can you explain how and why works?",neutral
4419,I now have this extremely ugly solution. Anybo...,negative
4420,Won't the guilty party simply increase the min...,neutral
4421,I really fail to see the use case... If you wi...,negative


In [64]:
jira

Unnamed: 0,Text,score
0,Committed. Thanks Ning,1
1,Sorry I meant ZOOKEEPER-1239.,-1
2,Hi Ted Matteo Thanks for the review. The co...,1
3,Thanks to both of you and to Deepesh for the ...,1
4,I just committed this. thanks steven!,1
...,...,...
2560,Hi Rupert! Sorry it took so long but i though...,-1
2561,Spark patch applied at revision r440604 than...,1
2562,Thanks Tom!,1
2563,+1. Code looks good.,1


### Replacing the labels
### Positive: 1, Neutral: 0, Negative: -1

In [65]:
so['score'] = so['score'].map({'positive': 1, 'neutral': 0, 'negative': -1})

### Generating the neutral class off the less postive ones using VADER 

In [67]:
# In order to get records of type: neutral, this is done

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment_vader_lexicon(statement):   
    # analyze the compound sentiment for statement
    score = analyzer.polarity_scores(statement)['compound']
    return score

siz = oracle.shape[0]
i = 0
scor = []
while (i<siz):
    score= analyze_sentiment_vader_lexicon(oracle.iloc[i]['Text'])
    scor.append(score)
    i+=1
scor = np.array(scor)

oracle['senti_score'] = scor

In [68]:
oracle['Score'] = oracle.apply(lambda x : 1 if x['senti_score'] >= 0.6 and x['score'] == 0 else x['score'], axis=1)

In [69]:
oracle = oracle.drop(['score', 'senti_score'], axis=1)

In [70]:
oracle.groupby('Score').count()

Unnamed: 0_level_0,Text
Score,Unnamed: 1_level_1
-1,398
0,1064
1,137


In [71]:
oracle

Unnamed: 0,Text,Score
0,- Should be like below:\ntextDirection = SWT.A...,0
1,"""""""create a vdsm.config.config clone, modified...",0
2,"""Add test(s) performing the static code analys...",0
3,"""apt-get"" is distro specific... perhaps make i...",0
4,"""easy"" is marketing; let the code speak for it...",0
...,...,...
1594,you'll need someone with some maven experience...,0
1595,Your memory is too smalll. Consider buying a R...,1
1596,You're preforming this check multiple times.\n...,1
1597,"You're right. Ivan,tenant_id is non-admin tena...",0


In [72]:
oracle.columns = ['Text','score']

### Combining all datasets

In [73]:
df1 = pd.concat([oracle, jira, so])

In [74]:
df1.groupby('score').count()

Unnamed: 0_level_0,Text
score,Unnamed: 1_level_1
-1,2362
0,3460
1,2765


In [75]:
df1

Unnamed: 0,Text,score
0,- Should be like below:\ntextDirection = SWT.A...,0
1,"""""""create a vdsm.config.config clone, modified...",0
2,"""Add test(s) performing the static code analys...",0
3,"""apt-get"" is distro specific... perhaps make i...",0
4,"""easy"" is marketing; let the code speak for it...",0
...,...,...
4418,"er, can you explain how and why works?",0
4419,I now have this extremely ugly solution. Anybo...,-1
4420,Won't the guilty party simply increase the min...,0
4421,I really fail to see the use case... If you wi...,-1


In [76]:
df1 = df1.sample(frac=1).reset_index(drop=True)

### Storing (90-10) train, test datsets for evaluating models

In [77]:
# Storing train, test separately
df_train = df1.sample(frac=0.9)
df_test = df1.loc[~df1.index.isin(df_train.index)]

In [78]:
df_train

Unnamed: 0,Text,score
2520,@Filip - because I was afraid I'd insult your ...,-1
4024,John it would be great!,1
5572,Excellent question.,1
3820,This whole DB is almost entirely read only so ...,1
3025,if the 2 structures variable are initialied wi...,0
...,...,...
6619,I have a gridview with rows that can be edited...,1
4609,> I think it's cleaner. We do not have to re-i...,-1
8052,"Waaaaay cool! I especially like the ""upload fi...",1
7408,Can we inherit singleton class?,0


In [79]:
df_train.groupby('score').count()

Unnamed: 0_level_0,Text
score,Unnamed: 1_level_1
-1,2118
0,3111
1,2499


In [80]:
df_train.to_excel('mod_train.xlsx',index=False,header=None)
df_test.to_csv('mod_test.csv',index=False)