In [1]:
import numpy as np
import pandas as pd


In [3]:
temp_df = pd.read_csv('https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/refs/heads/master/IMDB-Dataset.csv')
df = temp_df.iloc[:10000]
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df['review'][1]


'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [5]:
df['sentiment'].value_counts()


sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [6]:
df.isnull().sum()


review       0
sentiment    0
dtype: int64

In [7]:
df.duplicated().sum()


np.int64(17)

In [8]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


np.int64(0)

In [9]:
# Basic preprocessing
# Removal tags
# Lowercase
# Remove stopwords


In [10]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text


In [11]:
df['review'] = df['review'].apply(remove_tags)
df.sample(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_tags)


Unnamed: 0,review,sentiment
6816,some people think that the second series was w...,positive
3877,Chuck Jones's 'Hare Conditioned' is a fast pac...,positive
2344,I heard this movie was badThey even warned me...,negative
6621,Let's put political correctness aside and just...,negative
6220,This is a great movie. Some will disagree with...,positive


In [12]:
df['review'] = df['review'].apply(lambda x:x.lower())
df.sample(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x:x.lower())


Unnamed: 0,review,sentiment
9204,i couldn't' agree more than with the comment l...,positive
1893,if you can watch a bond film from 1983 that is...,positive
7595,cult of the cobra is now available on dvd in a...,negative
290,"i saw the movie ""hoot"" and then i immediately ...",positive
3794,in case you're a self-acclaimed connoisseur of...,negative


In [14]:
from nltk.corpus import stopwords
sw_list = stopwords.words('english')

# Remove stopwords in one go
df['review'] = df['review'].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in sw_list]))

df.sample(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in sw_list]))


Unnamed: 0,review,sentiment
6505,film made saskatchewan manitoba parks returned...,negative
6015,"odd thing galaxina supremely bad, although is....",negative
3858,"watch open mind, different, nothing's cutesy t...",positive
1766,"film absolutely brilliant, buzz, rush makes wa...",positive
9806,came across movie channel surfing one day; dec...,positive


In [21]:
import gensim

from nltk import sent_tokenize
from gensim.utils import simple_preprocess


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))


In [None]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)
model.build_vocab(story)


In [None]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)
len(model.wv.index_to_key)


In [None]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)


In [None]:
document_vector(df['review'].values[0])


In [None]:
from tqdm import tqdm


In [None]:
X= []

for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

X = np.array(X)
X.shape


In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])
y


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

mnb = GaussianNB()
mnb.fit(X_train, y_train)

y_pred = mnb.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
X = df.iloc[:, 0:1]
y = df['sentiment']


In [None]:
X.sample(5)


In [None]:
y.sample(5)


In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(y)
y


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [None]:
X_train.shape


In [None]:
# applying BoW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [None]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()


In [None]:
X_train_bow.shape


In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow, y_train)


In [None]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, y_pred)


In [None]:
confusion_matrix(y_test, y_pred)


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)



In [None]:
cv = CountVectorizer(max_features=3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)


In [None]:
cv = CountVectorizer(ngram_range=(1,3), max_features=5000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fir(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)


#### Using TfIdf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review'])


In [None]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred)
