In [35]:
import numpy as np
import pandas as pd

In [36]:
# Load the dataset from a CSV file in the working directory; later cells read
# its 'review' (text) and 'sentiment' (label) columns.
df = pd.read_csv('IMDB Dataset.csv')

In [37]:
# Work on the first 10,000 rows only. Take an explicit copy so that the column
# assignments in later cells operate on an independent frame rather than on a
# slice of the frame returned by read_csv.
df = df.iloc[:10000].copy()

In [38]:
# Preview the raw review text column.
df['review']

0       One of the other reviewers has mentioned that ...
1       A wonderful little production. <br /><br />The...
2       I thought this was a wonderful way to spend ti...
3       Basically there's a family where a little boy ...
4       Petter Mattei's "Love in the Time of Money" is...
                              ...                        
9995    Fun, entertaining movie about WWII German spy ...
9996    Give me a break. How can anyone say that this ...
9997    This movie is a bad movie. But after watching ...
9998    This is a movie that was probably made to ente...
9999    Smashing film about film-making. Shows the int...
Name: review, Length: 10000, dtype: object

In [39]:
# Count how many rows carry each sentiment label.
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [40]:
# Number of missing values per column.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [41]:
# Number of fully duplicated rows before de-duplication.
df.duplicated().sum()

17

In [42]:
# Drop fully duplicated rows; the surviving rows keep their original index labels.
df = df.drop_duplicates()

In [43]:
# Re-check: no duplicated rows should remain after the drop above.
df.duplicated().sum()

0

In [44]:
# Basic preprocessing
# 1. Remove HTML tags
# 2. Lowercase the text
# 3. Remove stopwords
# 4. Remove punctuation

In [45]:
import re
def remove_tags(raw_txt):
    """Strip HTML/XML-style tags from a string.

    Every non-greedy ``<...>`` match is replaced with a single space rather
    than the empty string, so the words on either side of a tag are not fused
    into one token (``"good<br />movie"`` becomes ``"good movie"``, not
    ``"goodmovie"``). Any extra whitespace this introduces is harmless to
    callers that split on whitespace.

    Parameters
    ----------
    raw_txt : str
        Text that may contain markup tags.

    Returns
    -------
    str
        ``raw_txt`` with each tag replaced by a space.
    """
    return re.sub(r'<.*?>', ' ', raw_txt)

In [None]:
# Strip markup from every review.
# NOTE(review): this cell has no execution count in the saved notebook, and the
# frames displayed further down still show '<br />' / 'br br' residue, so it
# appears not to have been run in the recorded session. Run it before the
# later preprocessing cells.
df['review'] = df['review'].map(remove_tags)

In [None]:
# Display the frame after tag removal (this cell has no execution count in the saved notebook).
df

In [46]:
# Lowercase every review using the vectorised string accessor.
df['review'] = df['review'].str.lower()

In [47]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
# Membership tests against a set are hash lookups; testing against the list
# re-scans it for every token of every review.
stopwords_set = set(stopwords_list)

# Drop stopword tokens and re-join the remainder with single spaces.
# NOTE(review): this runs before punctuation is stripped, so a stopword with
# trailing punctuation (e.g. "it.") is not filtered here — confirm whether the
# step order is intended.
df['review'] = df['review'].apply(
    lambda text: " ".join(tok for tok in text.split() if tok not in stopwords_set)
)

In [48]:
# Display the frame after lowercasing and stopword removal.
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. <br /><br />the f...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
9995,"fun, entertaining movie wwii german spy (julie...",positive
9996,"give break. anyone say ""good hockey movie""? kn...",negative
9997,movie bad movie. watching endless series bad h...,negative
9998,"movie probably made entertain middle school, e...",negative


In [49]:
import string

In [50]:
# Characters that remove_punc deletes from the text.
exclude = string.punctuation

In [51]:
def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` deleted."""
    # A deletion-only translation table removes all listed characters in one pass.
    deletion_table = str.maketrans('', '', exclude)
    return text.translate(deletion_table)

In [52]:
# Delete punctuation characters from every review.
df['review'] = df['review'].map(remove_punc)

In [53]:
# Display the frame after punctuation removal.
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production br br the filming ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
9995,fun entertaining movie wwii german spy julie a...,positive
9996,give break anyone say good hockey movie know m...,negative
9997,movie bad movie watching endless series bad ho...,negative
9998,movie probably made entertain middle school ea...,negative


In [54]:
# Features: a one-column DataFrame holding the review text. Select it by name
# rather than by position, so a change in column order cannot silently swap in
# the wrong column; later cells also access it as X_train['review'].
X = df[['review']]
# Target: the sentiment label for each review.
y = df['sentiment']

In [55]:
# Inspect the feature frame.
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production br br the filming ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
9995,fun entertaining movie wwii german spy julie a...
9996,give break anyone say good hockey movie know m...
9997,movie bad movie watching endless series bad ho...
9998,movie probably made entertain middle school ea...


In [56]:
# Inspect the target labels (still strings at this point).
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [57]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Encode from the source column instead of from `y` itself. With the
# self-referential form `y = encoder.fit_transform(y)`, re-running this cell
# fits the encoder on already-encoded integers, so `encoder.classes_` no longer
# holds the original label strings.
y = encoder.fit_transform(df['sentiment'])

In [58]:
# Inspect the integer-encoded labels.
y

array([1, 1, 1, ..., 0, 0, 1])

In [59]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [60]:
# Inspect the training reviews.
X_train

Unnamed: 0,review
6713,ive waiting superhero movie like long time mys...
1178,movie excellent acted excellent directed overa...
4707,movie makes want throw every time see it take ...
6772,first saw movie elementary school back 1960s f...
7461,show made persons iq lower 80 jokes show lame ...
...,...
2895,excellent episode movie ala pulp fiction 7 day...
7823,first off give idea taste moviesbr br 2007 com...
905,well begin story went movie tonight friends kn...
5195,lot horror fans seem love scarecrows popular s...


In [61]:
# Inspect the held-out reviews.
X_test

Unnamed: 0,review
5333,8 simple rules dating teenage daughter auspici...
4113,one imdb reviewer puts it imagine 2001 space o...
6853,although better first mulva which say much any...
3219,film worst film ranks high me slasher movie be...
7399,astounding film well showing actual footage ke...
...,...
8380,would like know john amos left show die show a...
3556,rented watched short 90 minutes work far best...
1041,mukhsin beautiful movie first love story every...
8913,ive seen bad things time half dead cow trying ...


In [62]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [63]:
# Fit the vocabulary on the training reviews only, then reuse it to encode the
# held-out reviews.
# .toarray() converts each result to a dense array; the shapes printed below are
# (7986, 66484) and (1997, 66484), so these arrays are large in memory.
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [64]:
# (n_training_reviews, vocabulary_size)
X_train_bow.shape

(7986, 66484)

In [65]:
# (n_test_reviews, vocabulary_size) — same column count as the training matrix.
X_test_bow.shape

(1997, 66484)

In [66]:
from sklearn.naive_bayes import GaussianNB
# Baseline classifier: Gaussian Naive Bayes on the dense bag-of-words counts.
# NOTE(review): the features are word counts; a count-oriented Naive Bayes
# variant may be a better fit than the Gaussian one — confirm.
gnb = GaussianNB()

gnb.fit(X_train_bow, y_train)

In [67]:
# Predict labels for the held-out bag-of-words matrix.
y_pred = gnb.predict(X_test_bow)

# Fraction of held-out reviews classified correctly.
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, y_pred)

0.6524787180771157

In [68]:
# Confusion matrix of true labels (y_test) against the GaussianNB predictions.
confusion_matrix(y_test, y_pred)

array([[704, 248],
       [446, 599]], dtype=int64)

In [69]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the full-vocabulary bag-of-words features.
# The seed is fixed (same value as the train/test split) so the reported
# accuracy can be reproduced on re-run; an unseeded forest gives a slightly
# different score each time.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8352528793189785

In [70]:
# Repeat the bag-of-words experiment with the vocabulary capped at 3000 terms.
cv = CountVectorizer(max_features=3000)

# Fit on the training reviews only; reuse the fitted vocabulary for the test set.
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Fixed seed so this accuracy is reproducible and comparable across the
# feature-extraction variants in this notebook.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8302453680520782

In [71]:
# N-gram implementation: unigrams and bigrams, vocabulary capped at 5000 terms.
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

# Fit on the training reviews only; reuse the fitted vocabulary for the test set.
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Fixed seed so this accuracy is reproducible and comparable across the
# feature-extraction variants in this notebook.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8442663995993991

# Using TF-IDF

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
# TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()

In [74]:
# Fit TF-IDF on the training reviews only, then encode the held-out reviews
# with the same vocabulary and weights.
# Both matrices are kept sparse: previously only the training matrix was
# densified, which was inconsistent with the test matrix and needlessly
# memory-hungry. The random forest fitted on them accepts sparse input.
X_train_tfidf = tfidf.fit_transform(X_train['review'])
X_test_tfidf = tfidf.transform(X_test['review'])

In [75]:
# Random forest on the TF-IDF features. The seed is fixed so the reported
# accuracy can be reproduced and compared with the other variants.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred)

0.8392588883324987

# Word2Vec

In [83]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [84]:
# Build the Word2Vec training corpus: one token list per sentence.
# NOTE(review): punctuation was stripped from df['review'] in an earlier cell,
# so sent_tokenize probably finds few sentence boundaries here — confirm.
story = []
for doc in df['review']:
    story.extend(simple_preprocess(sentence) for sentence in sent_tokenize(doc))

In [85]:
# Untrained Word2Vec model: 10-token context window, ignoring words seen fewer than 2 times.
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
)

In [86]:
# Build the vocabulary from the tokenised sentences before training.
model.build_vocab(story)

In [87]:
# Train on the tokenised sentences, using the corpus count and epoch setting
# already stored on the model.
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(5743566, 6216150)

In [88]:
# Number of words in the trained model's vocabulary.
len(model.wv.index_to_key)

35981

In [89]:
def document_vector(doc, wv=None):
    """Return the mean word embedding of the in-vocabulary tokens of ``doc``.

    Parameters
    ----------
    doc : str
        Whitespace-separated text.
    wv : optional
        Keyed-vector store exposing ``key_to_index``, ``vector_size`` and
        indexing by a list of words. Defaults to the notebook-level
        ``model.wv``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``. All zeros when no token of
        ``doc`` is in the vocabulary, so every document yields a fixed-length,
        finite feature row.
    """
    if wv is None:
        wv = model.wv
    # key_to_index is a mapping, so membership is a hash lookup; testing
    # `word in index_to_key` scans the whole vocabulary list for every token.
    vocab = wv.key_to_index
    known_words = [word for word in doc.split() if word in vocab]
    if not known_words:
        # Nothing to average: return a neutral vector instead of failing or
        # producing NaNs.
        return np.zeros(wv.vector_size, dtype=np.float32)
    return np.mean(wv[known_words], axis=0)

In [92]:
# Sanity check: embed the first review.
document_vector(df['review'].values[0])

array([-0.14084238,  0.22544211,  0.06800063,  0.21459667,  0.09503114,
       -0.6450723 ,  0.06180874,  0.7705588 , -0.21039595, -0.39398417,
       -0.23632626, -0.62248814, -0.04780537,  0.20186341,  0.02385439,
       -0.20673046,  0.01770677, -0.41424966, -0.06806085, -0.65587074,
        0.0965366 ,  0.13707645,  0.24433677, -0.26981416,  0.03816686,
       -0.03467187, -0.35821983, -0.14168276, -0.41725212,  0.0179071 ,
        0.42094636, -0.12871344,  0.13953413, -0.4739672 , -0.20563439,
        0.2877714 , -0.00975097, -0.3517653 , -0.382106  , -0.5811255 ,
        0.2403411 , -0.14124401, -0.25453374,  0.16399448,  0.23230319,
       -0.1569626 , -0.42859268, -0.09577966,  0.3035948 ,  0.2841447 ,
        0.19677188, -0.37238157,  0.00455716,  0.11904944, -0.250912  ,
        0.15798244,  0.0445173 , -0.01889776, -0.09858821,  0.2880864 ,
        0.15234247,  0.09001806, -0.0272735 ,  0.04646041, -0.43721566,
        0.4897885 ,  0.04418515,  0.34086585, -0.5654931 ,  0.55

In [93]:
from tqdm import tqdm

In [94]:
# Embed every review as its averaged word vector; tqdm reports progress.
X = [document_vector(review) for review in tqdm(df['review'].values)]

100%|██████████████████████████████████████████████████████████████████████████████| 9983/9983 [01:43<00:00, 96.84it/s]


In [95]:
# Convert the list of per-review vectors into a single NumPy array.
X = np.array(X)

In [96]:
# First row of the feature array: the embedding of the first review.
X[0]

array([-0.14084238,  0.22544211,  0.06800063,  0.21459667,  0.09503114,
       -0.6450723 ,  0.06180874,  0.7705588 , -0.21039595, -0.39398417,
       -0.23632626, -0.62248814, -0.04780537,  0.20186341,  0.02385439,
       -0.20673046,  0.01770677, -0.41424966, -0.06806085, -0.65587074,
        0.0965366 ,  0.13707645,  0.24433677, -0.26981416,  0.03816686,
       -0.03467187, -0.35821983, -0.14168276, -0.41725212,  0.0179071 ,
        0.42094636, -0.12871344,  0.13953413, -0.4739672 , -0.20563439,
        0.2877714 , -0.00975097, -0.3517653 , -0.382106  , -0.5811255 ,
        0.2403411 , -0.14124401, -0.25453374,  0.16399448,  0.23230319,
       -0.1569626 , -0.42859268, -0.09577966,  0.3035948 ,  0.2841447 ,
        0.19677188, -0.37238157,  0.00455716,  0.11904944, -0.250912  ,
        0.15798244,  0.0445173 , -0.01889776, -0.09858821,  0.2880864 ,
        0.15234247,  0.09001806, -0.0272735 ,  0.04646041, -0.43721566,
        0.4897885 ,  0.04418515,  0.34086585, -0.5654931 ,  0.55

In [97]:
from sklearn.preprocessing import LabelEncoder
# Re-encode the labels straight from the DataFrame so that `y` lines up
# row-for-row with the embedding array built from df['review'].
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [98]:
# Inspect the integer-encoded labels.
y

array([1, 1, 1, ..., 0, 0, 1])

In [99]:
from sklearn.model_selection import train_test_split
# Split the embedding features with the same test fraction and seed as the
# earlier text-based split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [100]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [101]:
# Random forest on the averaged Word2Vec features. The seed is fixed so the
# reported accuracy can be reproduced and compared with the earlier
# bag-of-words and TF-IDF variants.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7991987981972959