In [1]:
import numpy as np
import pandas as pd


In [2]:
temp_df = pd.read_csv('https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/refs/heads/master/IMDB-Dataset.csv')
df = temp_df.iloc[:10000]
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df['review'][1]


'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [4]:
df['sentiment'].value_counts()


sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [5]:
df.isnull().sum()


review       0
sentiment    0
dtype: int64

In [6]:
df.duplicated().sum()


np.int64(17)

In [7]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


np.int64(0)

In [8]:
# Basic preprocessing
# Removal tags
# Lowercase
# Remove stopwords


In [9]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text


In [10]:
df['review'] = df['review'].apply(remove_tags)
df.sample(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_tags)


Unnamed: 0,review,sentiment
3919,This wasn't funny in 1972. It's not funny now....,negative
4628,"As a word of explanation, Disney's ""The Kid"" h...",positive
9611,During my childhood time I have seen the first...,negative
811,"This movie is among my favorite foreign films,...",positive
1735,Komodo vs. Cobra starts as 'One Planet' enviro...,negative


In [11]:
df['review'] = df['review'].apply(lambda x:x.lower())
df.sample(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x:x.lower())


Unnamed: 0,review,sentiment
7740,after seeing the film version of heart of dark...,negative
2925,the writer/director of this film obviously doe...,negative
4611,interesting concept that just doesn't make it....,negative
2006,this movie is my all time favorite!!! you real...,positive
6212,a collection of deleted scenes and alternative...,negative


In [13]:
# from nltk.corpus import stopwords
# sw_list = stopwords.words('english')
# df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x: " ".join(x))
# df.sample(5)


from nltk.corpus import stopwords
sw_list = stopwords.words('english')

# Remove stopwords in one go
df['review'] = df['review'].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in sw_list]))

df.sample(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in sw_list]))


Unnamed: 0,review,sentiment
2733,"""pitch black"" complete shock first saw back 20...",positive
1640,"film moments, disappointing eyes anyway. rewor...",negative
6163,pakeezah interesting history (which well docum...,positive
344,"sum, overlong filled subplots swiss cheese hol...",negative
5410,man directed 'the third man' also directed 'wh...,positive


In [14]:
X = df.iloc[:, 0:1]
y = df['sentiment']


In [15]:
X.sample(5)


Unnamed: 0,review
7435,hamlet far favorite shakespeare's works. brana...
370,"movies seventies, none captured truest essence..."
8976,"sort like primitive episode ""general hospital""..."
3611,"several years ago first watched ""grey gardens""..."
678,"adenoid hynkel, lowly soldier world war one, r..."


In [16]:
y.sample(5)


357     negative
3275    negative
3925    negative
3207    negative
3064    positive
Name: sentiment, dtype: object

In [18]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(y)
y


array([1, 1, 1, ..., 0, 0, 1], shape=(9983,))

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [20]:
X_train.shape


(7986, 1)

In [21]:
# applying BoW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [22]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()


In [23]:
X_train_bow.shape


(7986, 48282)

In [24]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow, y_train)


0,1,2
,priors,
,var_smoothing,1e-09


In [25]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, y_pred)


0.6324486730095142

In [26]:
confusion_matrix(y_test, y_pred)


array([[717, 235],
       [499, 546]])

In [27]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)



0.8507761642463696

In [28]:
cv = CountVectorizer(max_features=3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)


0.8407611417125689

In [30]:
cv = CountVectorizer(ngram_range=(1,3), max_features=5000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)


0.8492739108662994

#### Using TfIdf

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review'])


In [32]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred)


0.8532799198798198