# IMDB Data Vectorization

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
import gensim

In [22]:
# Load the cleaned, non-stemmed IMDb reviews.
# Only the two columns used later in this notebook are read, which also drops the
# stray 'Unnamed: 0' column (a saved row index) that the CSV carries. The frame
# keeps pandas' default 0..n-1 index.
# Stemmed variant of the input: 'cleaned_stemmed_IMDb_reviews.csv'
df = pd.read_csv(
    'cleaned_non_stemmed_IMDb_reviews.csv',
    usecols=['review', 'sentiment'],
)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode yo...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


#### 1. Tokenization of our reviews

In [23]:
# Break every cleaned review into its whitespace-separated word tokens
tokenized_review = df['review'].map(str.split)

#### 3. Training a Word2Vec model

In [24]:
# SETTING MODEL PARAMETERS
# The corpus is deliberately NOT passed to the constructor: when gensim's Word2Vec
# is given sentences it trains immediately (for its default number of epochs),
# which would stack an extra training run on top of the explicit train() call
# in the next cell.
model_w2v = gensim.models.Word2Vec(
    vector_size = 1000, # dimensionality of each word vector (number of features)
    window = 5, # context window
    min_count = 2, # ignore all words with total frequency lower than 2
    sg = 1, # 1 selects skip-gram (0 would be CBOW)
    hs = 0, # no hierarchical softmax; negative sampling is used instead
    negative = 10, # number of noise words drawn per positive example
    workers = 10, # training threads
    seed = 34 # NOTE: per the gensim docs, fully reproducible runs also need workers=1 and a fixed PYTHONHASHSEED
)

# Scan the corpus once to build the vocabulary; no training happens here
model_w2v.build_vocab(tokenized_review)

In [25]:
# TRAINING MODEL ON CORPUS
# Run 20 epochs over the tokenized reviews; total_examples is the number of reviews.
# The call is the cell's last expression, so its return value is displayed.
model_w2v.train(tokenized_review, epochs=20, total_examples=df['review'].size)

(110898761, 117768600)

#### 4. Transforming our tokenized features into vectors 

In [26]:
# FUNC: CREATES VECTOR FOR EACH REVIEW BY TAKING THE AVERAGE OF THE VECTORS OF WORDS PRESENT IN REVIEW
def word_vector(tokens, size, model=None):
    """Return the mean word embedding of one tokenized review.

    Parameters
    ----------
    tokens : iterable of str
        The words of a single review.
    size : int
        Length of each word vector; also the width of the returned array.
    model : object with a ``wv`` lookup, optional
        Anything whose ``wv[word]`` yields a length-``size`` vector and raises
        ``KeyError`` for unknown words. Defaults to the notebook-level
        ``model_w2v`` so existing two-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the average of the vectors of all
        tokens found in the vocabulary. All zeros when ``tokens`` is empty or
        none of its words are in the vocabulary.
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook
        model = model_w2v
    keyed_vectors = model.wv

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            vec += np.asarray(keyed_vectors[word]).reshape((1, size))
        except KeyError:    # case where token is not in vocab
            continue
        count += 1
    if count:
        # Divide only when at least one token was found, avoiding 0/0
        vec /= count
    return vec

In [27]:
# applying above method to each review
# Take the feature width from the trained model instead of repeating the literal,
# so this cell stays correct if vector_size is changed in the model cell.
n_features = model_w2v.wv.vector_size
wordvec_arrays = np.zeros((len(tokenized_review), n_features))
# enumerate() walks the reviews by position, so the loop does not depend on the
# Series carrying labels 0..n-1 (tokenized_review[i] is a label lookup).
for i, tokens in enumerate(tokenized_review):
    wordvec_arrays[i, :] = word_vector(tokens, n_features)

wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(49582, 1000)

#### 5. Rejoining our vectors with the target feature and saving them to a .csv file for later model development

In [28]:
# Attach the labels as the last column; pandas matches them to the feature rows by index
wordvec_df = wordvec_df.assign(sentiment=df['sentiment'])
wordvec_df.shape

(49582, 1001)

In [29]:
# Write the feature columns plus the sentiment label to disk, without the row index.
# Alternative output names kept from earlier runs (presumably other
# preprocessing / vector-size variants):
#   'IMDb_nonstemmed_w2v_500v_data.csv'
#   'IMDb_stemmed_w2v_data.csv'
#   'IMDb_nonstemmed_vectorized_data.csv'
wordvec_df.to_csv(path_or_buf='IMDb_nonstemmed_w2v_1000v_data.csv', index=False)