## Install & Load Gensim Library

In [0]:
!pip install gensim --quiet

In [0]:
import pandas as pd
import re, string
import gensim
import logging

### Load Movie Reviews Text Data

Download the data from Kaggle: https://www.kaggle.com/c/word2vec-nlp-tutorial/data

Filename: `unlabeledTrainData.tsv.zip` (place it in a `kaggle/` folder next to this notebook; the next cell reads it from there)

In [0]:
# Read the tab-separated reviews straight from the zip archive.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as literal text.
reviews_path = 'kaggle/unlabeledTrainData.tsv.zip'
df = pd.read_csv(reviews_path, sep="\t", header=0, quoting=3)

In [0]:
# Show the (rows, columns) dimensions, then preview the first five rows
print(df.shape)
df.head(n=5)

## Function to Clean up data

In [0]:
def clean_str(string):
  """
  String cleaning before vectorization.

  Removes URLs, replaces every character that is not an ASCII letter with a
  space, lower-cases the text and collapses runs of whitespace.

  Args:
    string: Raw review text. (The name shadows the imported `string` module
      inside this function; it is kept so keyword callers continue to work.)

  Returns:
    The cleaned words joined by single spaces, or "" when the input is not a
    str (for example None or NaN for a missing review).
  """
  # Map non-text input to "" explicitly. The previous bare `except:` did the
  # same but also swallowed unrelated errors such as KeyboardInterrupt.
  if not isinstance(string, str):
    return ""
  # Drop URLs wherever they occur. The previous pattern required a literal
  # "<>" right after "://" and was anchored to the start of a line, so it
  # never matched a real URL.
  string = re.sub(r"https?://\S+", " ", string)
  # Anything that is not an ASCII letter becomes a separator.
  string = re.sub(r"[^A-Za-z]", " ", string)
  # split() with no argument never yields empty tokens, so no length filter
  # is needed before re-joining.
  return " ".join(string.lower().split())

Clean the data using the routine above.

In [0]:
# Run clean_str over every raw review and store the result in a new column
df['clean_review'] = df['review'].map(clean_str)
df.head(n=5)

## Convert Each Review to a Word List
gensim's Word2Vec expects each document as a list of word tokens.

In [0]:
# Tokenise each cleaned review on whitespace. A bare split() returns [] for an
# empty review, whereas split(' ') returns [''] and would feed an empty-string
# "word" to Word2Vec.
documents = [doc.split() for doc in df['clean_review']]

## Build the Model

In [0]:
#Logging for training
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# gensim 4.0 renamed two Word2Vec arguments (size -> vector_size, iter -> epochs).
# The install cell does not pin a version, so pick the spelling that matches
# whichever gensim release is actually installed.
EMBEDDING_SIZE = 50  #Embedding size
N_EPOCHS = 10        #Number of iterations over the text corpus
if int(gensim.__version__.split('.')[0]) >= 4:
    size_and_epochs = {'vector_size': EMBEDDING_SIZE, 'epochs': N_EPOCHS}
else:
    size_and_epochs = {'size': EMBEDDING_SIZE, 'iter': N_EPOCHS}

#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this
                               workers=4, #Number of worker threads
                               window=5, #Maximum Distance between current and predicted word
                               **size_and_epochs #Embedding size and number of passes, named per gensim version
                              )

## Exploring the model

How many words are in the model, and how many features does each word embedding have?

In [0]:
# Shape of the embedding matrix held by the trained model
embedding_matrix = model.wv.vectors
embedding_matrix.shape

In [0]:
# gensim 4.0 replaced the `wv.vocab` dict with `wv.key_to_index`; use whichever
# the installed release provides so the vocabulary size is reported on both.
word_vectors = model.wv
len(word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab)

Get an embedding for a word

In [0]:
# Look up the learned vector for a single word
query_word = 'flower'
model.wv[query_word]

Saving the model

In [0]:
# Persist the trained model under this file name
saved_model_name = 'word2vec-movie-50'
model.save(saved_model_name)

Finding Words which have similar meaning

In [0]:
# Words whose vectors lie closest to the vector of the query word
query_word = 'great'
model.wv.most_similar(query_word)

Find the Word which is not like others

In [0]:
# Ask the model which of the candidate words does not belong with the rest
candidate_words = "man woman child kitchen".split()
model.wv.doesnt_match(candidate_words)

1. Solve the analogy king + man = queen + ?, i.e. find the words closest to king + man - queen.
2. The corpus may be too small for this analogy to produce a meaningful answer.

In [0]:
# Vector arithmetic: add the "positive" words, subtract the "negative" one,
# then list the words nearest to the resulting vector
added_words = ['king', 'man']
subtracted_words = ['queen']
model.wv.most_similar(positive=added_words, negative=subtracted_words)

Loading a saved model from disk

In [0]:
# Restore the model from the file written by the save cell
saved_model_name = 'word2vec-movie-50'
model = gensim.models.Word2Vec.load(saved_model_name)