# lecture 19 - sentiment analysis

In [36]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

In [37]:
# Build the VADER analyzer. On a fresh environment the 'vader_lexicon'
# resource may not be installed yet, in which case the constructor raises
# LookupError; download the lexicon once and retry so the notebook survives
# "Restart Kernel -> Run All".
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

In [38]:
# Sample sentence with mildly positive wording.
a = 'This is a good movie'

In [39]:
# polarity_scores returns a dict with 'neg', 'neu' and 'pos' scores plus an overall 'compound' score.
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [40]:
# Stronger praise written with capitals and '!!!'; VADER should also pick up on the exclamation marks.
a = 'This was the best, most awesome movie EVER MADE!!!'

In [41]:
# Score the emphatic positive sentence.
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [42]:
# Strongly negative sample sentence.
a = 'This was the WORST movie that has ever disgraced the screen'

In [43]:
# Score the negative sentence.
sid.polarity_scores(a)

{'neg': 0.465, 'neu': 0.535, 'pos': 0.0, 'compound': -0.8331}

In [44]:
# Load the tab-separated Amazon reviews file into a DataFrame.
df = pd.read_csv(
    './datasets_files/amazonreviews.tsv',
    delimiter='\t',
)

In [45]:
df.head()  # preview the first rows: a 'label' column and a 'review' text column

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [46]:
df['label'].value_counts()  # class counts: the two labels are roughly balanced

label
neg    5097
pos    4903
Name: count, dtype: int64

In [47]:
# Remove rows that contain missing values, modifying df in place.
df.dropna(inplace=True)

In [48]:
# Collect the index labels of blank reviews (empty or whitespace-only strings).
#
# Only the 'review' column is inspected, rather than unpacking every column of
# each row, so this cell can be re-run after more columns have been added to df.
# `not rv.strip()` also catches the empty string, which `str.isspace()` does not
# (''.isspace() is False). Non-string values are ignored.
blanks = [
    idx
    for idx, rv in zip(df.index, df['review'])
    if isinstance(rv, str) and not rv.strip()
]

In [49]:
blanks  # an empty list means no blank reviews were found

[]

In [50]:
# Sanity check: score the first review before scoring the whole column.
sid.polarity_scores(df['review'].iloc[0])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [51]:
# Run VADER on every review; each cell of 'scores' holds the full polarity dict.
df['scores'] = df['review'].apply(sid.polarity_scores)

In [52]:
df.head()  # check the new 'scores' column

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [53]:
# Pull the overall 'compound' value out of each polarity dict into its own column.
df['compound'] = df['scores'].map(lambda polarity: polarity['compound'])

In [54]:
df.head()  # check the new 'compound' column

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [55]:
# Turn the compound score into a predicted label.
# A score of exactly 0 (neutral) is counted as 'pos'; anything below 0 is 'neg'.
is_non_negative = df['compound'].ge(0)
df['compound_score'] = is_non_negative.map({True: 'pos', False: 'neg'})

In [56]:
df.head()  # check the new 'compound_score' column

Unnamed: 0,label,review,scores,compound,compound_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [57]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [58]:
# Overall accuracy of the VADER labels against the true labels:
# we are doing better than random guessing on this roughly balanced dataset.
accuracy_score(y_true=df['label'], y_pred=df['compound_score'])

0.7098

In [59]:
# Per-class precision / recall / F1.
# Recall is much lower for 'neg' than for 'pos': many negative reviews are
# predicted as positive. Possibly sarcasm or positive wording inside negative
# reviews, which a lexicon-based scorer finds difficult to detect.
print(classification_report(y_true=df['label'], y_pred=df['compound_score']))

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [60]:
# Confusion matrix: rows are the true labels, columns the predicted labels,
# both in the order neg, pos.
print(confusion_matrix(y_true=df['label'], y_pred=df['compound_score']))

[[2630 2467]
 [ 435 4468]]
