In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

<br>
<br>
<br>

### Data Collection

In [3]:
# loading csv data
data = pd.read_csv("../data/data.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# checking shape
data.shape

(50000, 2)

In [5]:
# dropping duplicate values
data = data.drop_duplicates().reset_index()
data.shape

(49582, 3)

<br>
<br>
<br>

### Data Preparation

In [6]:
# seperating reviews
X = data.review
X.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [7]:
# seperating target
y = data.sentiment
y.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

In [8]:
# removes html tags from the text

cleaner_regex = re.compile('<.*?>')

def remove_tags(text):
    cleantext = re.sub(cleaner_regex, '', text)
    return cleantext

In [9]:
# cleaning X
x_cleaned = X.apply(lambda x: remove_tags(x))
x_cleaned.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [10]:
# verifying shape
print(x_cleaned.shape)
print(y.shape)

(49582,)
(49582,)


<br>
<br>
<br>

### VADER

In [11]:
# sentiment intensity analyzer
sid = SentimentIntensityAnalyzer()

In [12]:
text = "It was a not that great movie."

# polarity score
sid.polarity_scores(text)

{'neg': 0.397, 'neu': 0.603, 'pos': 0.0, 'compound': -0.5096}

In [13]:
# review text
texts = []
# actual value
actual_sentiments = []
# predicted values
predicted_sentiments = []
# prediction score
scores = []

# making predictions and appending
for i, review in enumerate(x_cleaned):
    texts.append(review)
    actual_sentiments.append(y[i])
    score = sid.polarity_scores(review)['compound']
    predicted_sentiments.append('positive' if score > 0 else 'negative')
    scores.append(score)

In [14]:
# creating dictionary
sentiment_data = {
    'text': texts,
    'actual': actual_sentiments,
    'predicted': predicted_sentiments,
    'score': scores,
}

# converting to dataframe
sentiment_df = pd.DataFrame(sentiment_data)
sentiment_df.head()

Unnamed: 0,text,actual,predicted,score
0,One of the other reviewers has mentioned that ...,positive,negative,-0.9951
1,A wonderful little production. The filming tec...,positive,positive,0.9641
2,I thought this was a wonderful way to spend ti...,positive,positive,0.9605
3,Basically there's a family where a little boy ...,negative,negative,-0.9213
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,positive,0.9744


<br>

In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
# prediction accuracy
accuracy_score(sentiment_df.actual, sentiment_df.predicted)

0.6969464725101852

In [17]:
# confusion matrix
confusion_matrix(sentiment_df.actual, sentiment_df.predicted)

array([[13273, 11425],
       [ 3601, 21283]], dtype=int64)

In [18]:
# classification report
print(classification_report(sentiment_df.actual, sentiment_df.predicted))

              precision    recall  f1-score   support

    negative       0.79      0.54      0.64     24698
    positive       0.65      0.86      0.74     24884

    accuracy                           0.70     49582
   macro avg       0.72      0.70      0.69     49582
weighted avg       0.72      0.70      0.69     49582

