In [5]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [6]:
temp_df = pd.read_csv('imdb.csv')
df = temp_df.iloc[:1000]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.drop_duplicates(inplace=True)

In [8]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text
df['review'] = df['review'].apply(remove_tags)

In [10]:
df['review'] = df['review'].apply(lambda x:x.lower())

In [11]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [13]:
import gensim

In [14]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [15]:
story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))
    

In [16]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [17]:
model.build_vocab(story)

In [18]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(557231, 623495)

In [19]:
len(model.wv.index_to_key)

9395

In [20]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [21]:
document_vector(df['review'].values[0])

array([-2.7137515e-01,  3.7848389e-01,  1.5153392e-02,  5.8045555e-03,
       -8.6782342e-03, -4.8016995e-01,  2.1696909e-01,  8.5855222e-01,
       -3.7416050e-01, -1.7431460e-01, -1.6742556e-01, -6.6576308e-01,
       -6.3816965e-02,  1.3379684e-01,  1.3412211e-02, -3.7427422e-01,
        1.7419366e-02, -6.1223054e-01,  8.4908828e-02, -7.8225458e-01,
        2.0347328e-01,  2.1074718e-01,  2.3626739e-01, -1.1706268e-01,
       -8.3594739e-02,  1.5966643e-02, -3.0384028e-01, -1.6134499e-01,
       -2.5672263e-01,  2.5262948e-02,  5.1697099e-01,  1.4523655e-01,
        1.1521604e-01, -2.0305443e-01, -1.0149473e-01,  2.5746194e-01,
        1.6986284e-01, -2.1381244e-01, -1.8322672e-01, -7.1336359e-01,
        3.6819901e-02, -4.3867850e-01, -8.7744735e-02,  5.8983937e-02,
        3.0324599e-01, -2.6688379e-01, -2.3823027e-01, -9.4745107e-02,
        2.9243731e-01,  3.6179048e-01,  3.4733865e-02, -3.6195010e-01,
       -2.6586413e-01, -1.9600119e-01, -2.1553731e-01,  5.3180445e-02,
      

In [22]:
from tqdm import tqdm

In [23]:
X = []
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████████████████████████████████| 1000/1000 [00:08<00:00, 120.46it/s]


In [24]:
X = np.array(X)

In [25]:
X[0]

array([-2.7137515e-01,  3.7848389e-01,  1.5153392e-02,  5.8045555e-03,
       -8.6782342e-03, -4.8016995e-01,  2.1696909e-01,  8.5855222e-01,
       -3.7416050e-01, -1.7431460e-01, -1.6742556e-01, -6.6576308e-01,
       -6.3816965e-02,  1.3379684e-01,  1.3412211e-02, -3.7427422e-01,
        1.7419366e-02, -6.1223054e-01,  8.4908828e-02, -7.8225458e-01,
        2.0347328e-01,  2.1074718e-01,  2.3626739e-01, -1.1706268e-01,
       -8.3594739e-02,  1.5966643e-02, -3.0384028e-01, -1.6134499e-01,
       -2.5672263e-01,  2.5262948e-02,  5.1697099e-01,  1.4523655e-01,
        1.1521604e-01, -2.0305443e-01, -1.0149473e-01,  2.5746194e-01,
        1.6986284e-01, -2.1381244e-01, -1.8322672e-01, -7.1336359e-01,
        3.6819901e-02, -4.3867850e-01, -8.7744735e-02,  5.8983937e-02,
        3.0324599e-01, -2.6688379e-01, -2.3823027e-01, -9.4745107e-02,
        2.9243731e-01,  3.6179048e-01,  3.4733865e-02, -3.6195010e-01,
       -2.6586413e-01, -1.9600119e-01, -2.1553731e-01,  5.3180445e-02,
      

In [26]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [27]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.615