In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("IMDB Dataset.csv")
df = df.iloc[:10000]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [4]:
df.duplicated().sum()

17

In [5]:
df.drop_duplicates(inplace=True)

# Remove HTML Tags:-

In [6]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [7]:
df['review'] = df['review'].apply(remove_tags)

# Text Convert into Lowercase:-

In [8]:
df['review'] = df['review'].apply(lambda x:x.lower())

# Remove StopWords:-

In [9]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [12]:
import gensim

In [13]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [14]:
story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [15]:
model = gensim.models.Word2Vec(window=10,min_count=2)

In [16]:
model.build_vocab(story)

In [17]:
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

(5875985, 6212140)

In [18]:
model.wv.index_to_key

['movie',
 'film',
 'one',
 'like',
 'good',
 'it',
 'the',
 'would',
 'time',
 'even',
 'story',
 'see',
 'really',
 'well',
 'much',
 'get',
 'bad',
 'great',
 'people',
 'first',
 'also',
 'made',
 'make',
 'way',
 'movies',
 'could',
 'think',
 'characters',
 'watch',
 'character',
 'films',
 'that',
 'never',
 'little',
 'show',
 'seen',
 'many',
 'two',
 'love',
 'acting',
 'plot',
 'best',
 'know',
 'life',
 'this',
 'ever',
 'better',
 'man',
 'there',
 'still',
 'say',
 'scene',
 'end',
 'and',
 'scenes',
 'something',
 'go',
 'real',
 'back',
 'watching',
 'director',
 'actors',
 'years',
 'thing',
 'though',
 've',
 'work',
 'look',
 'funny',
 'actually',
 'old',
 'nothing',
 'going',
 'makes',
 'new',
 'lot',
 'another',
 'all',
 'every',
 'find',
 'pretty',
 'things',
 'part',
 'can',
 'he',
 'us',
 'world',
 'horror',
 'around',
 'want',
 'big',
 'quite',
 'cast',
 'long',
 'young',
 'enough',
 'in',
 'take',
 'seems',
 'got',
 'must',
 'however',
 'may',
 'thought',
 'fa

In [19]:
len(model.wv.index_to_key)

31845

In [20]:
def document_vector(doc):
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc],axis=0)

In [21]:
document_vector(df['review'].values[0])

array([-0.16961077,  0.4577306 ,  0.15333526,  0.23042832, -0.14526956,
       -0.6370377 ,  0.21487983,  0.9427101 , -0.32777065, -0.24602994,
       -0.29561925, -0.49889117,  0.13082133,  0.08400876,  0.16928372,
       -0.11607364,  0.01605165, -0.34554517, -0.08024585, -0.64193785,
        0.05871066,  0.23200561,  0.09224351, -0.2788152 , -0.3246314 ,
       -0.01078706, -0.30696884, -0.0049577 , -0.34963292,  0.01435835,
        0.36293158,  0.01074389,  0.2070808 , -0.28187332, -0.12882172,
        0.3869257 ,  0.12086947, -0.4177656 , -0.24626955, -0.76985496,
        0.09263428, -0.25871637,  0.08896044, -0.08282261,  0.47743848,
       -0.17503086, -0.2788647 , -0.01997631,  0.12175708,  0.38509312,
        0.10997362, -0.35939282, -0.41774952, -0.08950749, -0.13439927,
        0.23454767,  0.24838828,  0.0209443 , -0.29565525,  0.10720801,
        0.09068353,  0.09571907,  0.02908073, -0.06596141, -0.42153677,
        0.25989717,  0.06803051,  0.07965867, -0.30265322,  0.31

In [22]:
from tqdm import tqdm

In [23]:
X = []
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████████████████████████████████████████████████████████████████████████| 9983/9983 [12:27<00:00, 13.35it/s]


In [25]:
X = np.array(X)

In [26]:
X.shape

(9983, 100)

In [30]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(df['review'])
y

array([6502, 9710, 8836, ..., 5416, 5683, 8209])

In [35]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [36]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)
accuracy_score(y_test,y_pred)

0.0

In [37]:
confusion_matrix(y_test,y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

MemoryError: could not allocate 1046740992 bytes