# Preparing a custom annotated dataset for sentiment analysis

In [1]:
import pandas as pd
import numpy as np
import re
from numpy import save, load

### Considered the JIRA and oracle datasets
### Jira from SentiEmoji, oracle from SentiCR

In [2]:
#!pip install xlrd

oracle = pd.read_excel('./oracle.xlsx', names=['Text','score'])
jira = pd.read_csv('Jira.txt', sep="\t", header=None,names=['Text','score'])

In [3]:
oracle

Unnamed: 0,Text,score
0,- Should be like below:\ntextDirection = SWT.A...,0
1,"""""""create a vdsm.config.config clone, modified...",0
2,"""Add test(s) performing the static code analys...",0
3,"""apt-get"" is distro specific... perhaps make i...",0
4,"""easy"" is marketing; let the code speak for it...",0
...,...,...
1594,you'll need someone with some maven experience...,0
1595,Your memory is too smalll. Consider buying a R...,0
1596,You're preforming this check multiple times.\n...,0
1597,"You're right. Ivan,tenant_id is non-admin tena...",0


### Replacing the labels
### Positive: 1, Neutral: 0, Negative: -1

In [4]:
oracle['score'] = oracle['score'].replace([0],1)

### Generating the neutral class off the less postive ones using VADER 

In [5]:
# In order to get records of type: neutral, this is done

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment_vader_lexicon(statement, positive_threshold=0.5,negative_threshold=-0.5):   
    # analyze the compound sentiment for statement
    score = analyzer.polarity_scores(statement)['compound']
    # -1:negative, 0:neutral, 1:positive
    if score >= positive_threshold:
        sentiment = 1
    else:
        sentiment = -1 if score< negative_threshold else 0
    return score,sentiment

siz = oracle.shape[0]
i = 0
pred = []
val = []
while (i<siz):
    score, sentiment = analyze_sentiment_vader_lexicon(oracle.iloc[i]['Text'], positive_threshold=0.3, negative_threshold=-0.3)
    val.append(score)
    pred.append(sentiment)
    i+=1
val = np.array(val)
pred = np.array(pred)

oracle['senti_score'] = pred

In [6]:
oracle['Score'] = oracle.apply(lambda x : 0 if x['senti_score'] == 0 and x['score'] == 1 else x['score'], axis=1)

In [7]:
oracle = oracle.drop(['score', 'senti_score'], axis=1)

In [8]:
oracle = oracle.rename(columns={'Score':'score'}, inplace=True)

### Combining the Oracle & Jira datasets

In [9]:
df1 = pd.concat([oracle, jira])

In [10]:
df1.groupby('score').count()

Unnamed: 0_level_0,Text
score,Unnamed: 1_level_1
-1,760
0,702
1,1103


In [11]:
df1 = df1.sample(frac=1).reset_index(drop=True)

### Storing (80-20) train, test datsets for evaluating models

In [12]:
# Storing train, test separately
df_train = df1.sample(frac=0.8)
df_test = df1.loc[~df1.index.isin(df_train.index)]

In [13]:
df_test

Unnamed: 0,Text,score
3,Sorry it seems I forgot a few references to 2...,-1
7,Awesome speedup! Finally all this work shows g...,1
9,Applied patch into trunk and cxf-2.2.x branch ...,1
16,Added the following to getRollbackOnly(): if ...,0
27,@Ted Thanks for taking a look. Sure I will ma...,1
...,...,...
2544,Marco BTW I have only helped Santosh Malviya ...,1
2550,Thanks for the review jimmy. committed to trunk.,1
2551,bq. Here's a patch for the general case and i...,-1
2554,Thanks to both for these usability enhancements !,1


In [14]:
df_train.to_excel('mod_train.xlsx',index=False,header=None)
df_test.to_csv('mod_test.csv',index=False)

In [15]:
jira

Unnamed: 0,Text,score
0,Committed. Thanks Ning,1
1,Sorry I meant ZOOKEEPER-1239.,-1
2,Hi Ted Matteo Thanks for the review. The co...,1
3,Thanks to both of you and to Deepesh for the ...,1
4,I just committed this. thanks steven!,1
...,...,...
2560,Hi Rupert! Sorry it took so long but i though...,-1
2561,Spark patch applied at revision r440604 than...,1
2562,Thanks Tom!,1
2563,+1. Code looks good.,1
