In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from textblob import TextBlob

<br>
<br>
<br>

### Data Collection

In [3]:
# loading csv data: read the review table from a relative path into a
# DataFrame and preview its first rows (columns: review, sentiment)
data = pd.read_csv("../data/data.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# checking shape: (rows, columns) of the frame as loaded, before de-duplication
data.shape

(50000, 2)

In [5]:
# removing duplicate values
# drop=True discards the old row labels instead of inserting them as an extra
# "index" column, so the frame keeps its two original columns and re-running
# this cell no longer adds a column each time
data = data.drop_duplicates().reset_index(drop=True)
data.shape

(49582, 2)

<br>
<br>
<br>

### Data Preparation

In [6]:
# separating reviews: the raw review text is the model input
X = data["review"]
X.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [7]:
# separating target: the labelled sentiment of each review
y = data["sentiment"]
y.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

In [8]:
# strips html-style tags out of a piece of text

cleaner_regex = re.compile('<.*?>')

def remove_tags(text):
    """Return `text` with every `<...>` span deleted.

    The match is non-greedy, so each tag ends at the nearest `>`. Because
    `.` does not match a newline in this pattern, a `<` whose closing `>`
    sits on a later line is left untouched.
    """
    return cleaner_regex.sub('', text)

In [9]:
# cleaning reviews: run the tag stripper over every review
x_cleaned = X.apply(remove_tags)
x_cleaned.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [10]:
# verifying shape: cleaned reviews and labels, one shape per output line
print(x_cleaned.shape, y.shape, sep="\n")

(49582,)
(49582,)


<br>
<br>
<br>

### TextBlob

In [11]:
# single negated sentence used as a quick sanity check of the scorer
text = "It was not that great movie."

# sentiment score
# NOTE(review): the recorded output gives polarity=0.8 for this sentence, so
# the negation ("not that great") is not reflected in the score
TextBlob(text).sentiment

Sentiment(polarity=0.8, subjectivity=0.75)

In [12]:
# making predictions and appending data
# to respective lists
#
# Reviews and labels are paired positionally with zip(). The previous `y[i]`
# lookup was label-based while `i` was a positional counter, so it was only
# correct as long as `y` carried a fresh 0..n-1 integer index.

texts = []
actual_sentiments = []
predicted_sentiments = []
scores = []

for review, actual in zip(x_cleaned, y):
    score = TextBlob(review).sentiment.polarity
    texts.append(review)
    actual_sentiments.append(actual)
    # a polarity of exactly 0 falls on the 'negative' side of the threshold
    predicted_sentiments.append('positive' if score > 0 else 'negative')
    scores.append(score)

In [13]:
# collect the per-review results, one list per column
sentiment_data = dict(
    text=texts,
    actual=actual_sentiments,
    predicted=predicted_sentiments,
    score=scores,
)

# tabulate the results and preview the first rows
sentiment_df = pd.DataFrame(data=sentiment_data)
sentiment_df.head()

Unnamed: 0,text,actual,predicted,score
0,One of the other reviewers has mentioned that ...,positive,positive,0.023433
1,A wonderful little production. The filming tec...,positive,positive,0.109722
2,I thought this was a wonderful way to spend ti...,positive,positive,0.354008
3,Basically there's a family where a little boy ...,negative,negative,-0.057813
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,positive,0.217952


<br>

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [15]:
# prediction accuracy: share of reviews whose predicted label equals the actual one
accuracy_score(y_true=sentiment_df["actual"], y_pred=sentiment_df["predicted"])

0.6897059416723811

In [16]:
# confusion matrix: rows are the actual labels, columns the predicted ones
confusion_matrix(y_true=sentiment_df["actual"], y_pred=sentiment_df["predicted"])

array([[10625, 14073],
       [ 1312, 23572]], dtype=int64)

In [17]:
# classification report: per-class precision, recall, f1-score and support
print(classification_report(y_true=sentiment_df["actual"], y_pred=sentiment_df["predicted"]))

              precision    recall  f1-score   support

    negative       0.89      0.43      0.58     24698
    positive       0.63      0.95      0.75     24884

    accuracy                           0.69     49582
   macro avg       0.76      0.69      0.67     49582
weighted avg       0.76      0.69      0.67     49582

