In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("../data/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df['review'].shape

(50000,)

In [3]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test=train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=52)

In [5]:
#TFIDF Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_features=5000)
x_train_tfidf=tfidf.fit_transform(x_train)
x_test_tfidf=tfidf.transform(x_test)

In [6]:
x_train_tfidf.shape

(40000, 5000)

In [9]:
# Train classifier
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression()
clf.fit(x_train_tfidf, y_train)

In [12]:
# Evaluate
y_pred=clf.predict(x_test_tfidf)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4962
    positive       0.88      0.91      0.89      5038

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [13]:
# the accuracy is only 89% here, so let's improve this by using stopwords removal and symbols removal from the reviews above

In [22]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import re


nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

def clean_review(text):
    if pd.isnull(text):
        return ""
    # remove html tags
    text=BeautifulSoup(text, "html.parser").get_text()

    # remove non alphanumeric characters and emojis
    text=re.sub(r'[^a-zA-Z\s]','',text)

    #lowercase
    text=text.lower()

    #remove stopwords
    # text=" ".join([word for word in text.split() if word not in stop_words])

    return text


df['cleaned_reviews']=df['review'].apply(clean_review)



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  text=BeautifulSoup(text, "html.parser").get_text()


In [23]:
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...


In [24]:
x_train, x_test, y_train,y_test=train_test_split(df['cleaned_reviews'], df['sentiment'], test_size=0.2, random_state=52)

In [25]:
tfidf=TfidfVectorizer(max_features=5000)
x_train_tfidf=tfidf.fit_transform(x_train)
x_test_tfidf=tfidf.transform(x_test)

In [26]:
clf=LogisticRegression()
clf.fit(x_train_tfidf, y_train)

In [27]:
# Evaluate
y_pred=clf.predict(x_test_tfidf)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4962
    positive       0.88      0.91      0.89      5038

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [29]:
# Exploring the vocab size

tfidf=TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned_reviews'])

# Vocabulary size
vocab_size = len(tfidf.vocabulary_)
print("Total unique tokens in the corpus:", vocab_size)

Total unique tokens in the corpus: 214597


In [30]:
# so the vocab size is huge, that's why let's increase vocab size to 20000 is tfidfvectorizer
tfidf=TfidfVectorizer(max_features=214597)
x_train_tfidf=tfidf.fit_transform(x_train)
x_test_tfidf=tfidf.transform(x_test)

clf=LogisticRegression()
clf.fit(x_train_tfidf, y_train)

# Evaluate
y_pred=clf.predict(x_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4962
    positive       0.88      0.91      0.90      5038

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

