# Load Gensim Library

In [1]:
import pandas as pd
import re, string
import gensim
import logging



In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load Text Data

In [3]:
df = pd.read_csv('unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)

In [4]:
df.shape

(50000, 2)

In [5]:
df.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


# Function to Clean up data

In [6]:
def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

Clean the Data using routine above

In [7]:
df['clean_review'] = df['review'].apply(clean_str)

In [8]:
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


# Convert Each Review to a Word List before feeding to Word2Vec

In [9]:
documents = []

In [10]:
for doc in df['clean_review']:
    documents.append(doc.split(' '))

In [13]:
documents[0]

['watching',
 'time',
 'chasers',
 'it',
 'obvious',
 'that',
 'it',
 'was',
 'made',
 'by',
 'a',
 'bunch',
 'of',
 'friends',
 'maybe',
 'they',
 'were',
 'sitting',
 'around',
 'one',
 'day',
 'in',
 'film',
 'school',
 'and',
 'said',
 'hey',
 'let',
 's',
 'pool',
 'our',
 'money',
 'together',
 'and',
 'make',
 'a',
 'really',
 'bad',
 'movie',
 'or',
 'something',
 'like',
 'that',
 'what',
 'ever',
 'they',
 'said',
 'they',
 'still',
 'ended',
 'up',
 'making',
 'a',
 'really',
 'bad',
 'movie',
 'dull',
 'story',
 'bad',
 'script',
 'lame',
 'acting',
 'poor',
 'cinematography',
 'bottom',
 'of',
 'the',
 'barrel',
 'stock',
 'music',
 'etc',
 'all',
 'corners',
 'were',
 'cut',
 'except',
 'the',
 'one',
 'that',
 'would',
 'have',
 'prevented',
 'this',
 'film',
 's',
 'release',
 'life',
 's',
 'like',
 'that']

In [13]:
len(documents)

50000

# Build the Model

In [14]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPUs
                               size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               iter=10   #Number of iterations over the text corpus
                              )  

2018-05-04 14:19:45,689 : INFO : collecting all words and their counts
2018-05-04 14:19:45,694 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-04 14:19:46,741 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2018-05-04 14:19:47,749 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2018-05-04 14:19:48,959 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2018-05-04 14:19:49,951 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2018-05-04 14:19:50,962 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2018-05-04 14:19:50,962 : INFO : Loading a fresh vocabulary
2018-05-04 14:19:51,817 : INFO : min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2018-05-04 14:19:51,822 : INFO : min_count=10 leaves 11910457 word corpus (98% of original

2018-05-04 14:20:46,210 : INFO : EPOCH 3 - PROGRESS: at 3.29% examples, 280629 words/s, in_qsize 8, out_qsize 0
2018-05-04 14:20:47,243 : INFO : EPOCH 3 - PROGRESS: at 7.27% examples, 306951 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:20:48,281 : INFO : EPOCH 3 - PROGRESS: at 11.06% examples, 310908 words/s, in_qsize 6, out_qsize 1
2018-05-04 14:20:49,294 : INFO : EPOCH 3 - PROGRESS: at 15.20% examples, 321482 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:20:50,327 : INFO : EPOCH 3 - PROGRESS: at 19.28% examples, 326348 words/s, in_qsize 8, out_qsize 1
2018-05-04 14:20:51,355 : INFO : EPOCH 3 - PROGRESS: at 23.29% examples, 328767 words/s, in_qsize 8, out_qsize 1
2018-05-04 14:20:52,376 : INFO : EPOCH 3 - PROGRESS: at 27.33% examples, 332036 words/s, in_qsize 6, out_qsize 1
2018-05-04 14:20:53,388 : INFO : EPOCH 3 - PROGRESS: at 31.36% examples, 334649 words/s, in_qsize 6, out_qsize 1
2018-05-04 14:20:54,400 : INFO : EPOCH 3 - PROGRESS: at 35.32% examples, 335968 words/s, in_qsize 

2018-05-04 14:21:52,608 : INFO : EPOCH 5 - PROGRESS: at 36.24% examples, 284616 words/s, in_qsize 8, out_qsize 1
2018-05-04 14:21:53,629 : INFO : EPOCH 5 - PROGRESS: at 37.94% examples, 273708 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:21:54,662 : INFO : EPOCH 5 - PROGRESS: at 39.75% examples, 264763 words/s, in_qsize 8, out_qsize 0
2018-05-04 14:21:55,758 : INFO : EPOCH 5 - PROGRESS: at 42.89% examples, 263996 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:21:56,818 : INFO : EPOCH 5 - PROGRESS: at 45.61% examples, 261608 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:21:57,862 : INFO : EPOCH 5 - PROGRESS: at 48.51% examples, 260697 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:21:58,907 : INFO : EPOCH 5 - PROGRESS: at 52.73% examples, 266084 words/s, in_qsize 8, out_qsize 0
2018-05-04 14:21:59,914 : INFO : EPOCH 5 - PROGRESS: at 56.74% examples, 271044 words/s, in_qsize 8, out_qsize 0
2018-05-04 14:22:00,934 : INFO : EPOCH 5 - PROGRESS: at 60.67% examples, 274968 words/s, in_qsiz

2018-05-04 14:22:59,046 : INFO : EPOCH 7 - PROGRESS: at 60.77% examples, 273681 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:23:00,054 : INFO : EPOCH 7 - PROGRESS: at 64.52% examples, 276580 words/s, in_qsize 8, out_qsize 1
2018-05-04 14:23:01,098 : INFO : EPOCH 7 - PROGRESS: at 68.62% examples, 279794 words/s, in_qsize 6, out_qsize 1
2018-05-04 14:23:02,110 : INFO : EPOCH 7 - PROGRESS: at 72.67% examples, 282819 words/s, in_qsize 8, out_qsize 1
2018-05-04 14:23:03,128 : INFO : EPOCH 7 - PROGRESS: at 76.81% examples, 286095 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:23:04,161 : INFO : EPOCH 7 - PROGRESS: at 81.11% examples, 289158 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:23:05,188 : INFO : EPOCH 7 - PROGRESS: at 85.16% examples, 291580 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:23:06,204 : INFO : EPOCH 7 - PROGRESS: at 89.39% examples, 294167 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:23:07,218 : INFO : EPOCH 7 - PROGRESS: at 93.52% examples, 296579 words/s, in_qsiz

2018-05-04 14:24:01,995 : INFO : EPOCH - 9 : training on 12084660 raw words (8817833 effective words) took 26.0s, 338568 effective words/s
2018-05-04 14:24:03,044 : INFO : EPOCH 10 - PROGRESS: at 3.68% examples, 320843 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:24:04,087 : INFO : EPOCH 10 - PROGRESS: at 7.59% examples, 322412 words/s, in_qsize 6, out_qsize 1
2018-05-04 14:24:05,091 : INFO : EPOCH 10 - PROGRESS: at 11.57% examples, 329454 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:24:06,108 : INFO : EPOCH 10 - PROGRESS: at 15.61% examples, 333478 words/s, in_qsize 8, out_qsize 0
2018-05-04 14:24:07,111 : INFO : EPOCH 10 - PROGRESS: at 19.43% examples, 333702 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:24:08,121 : INFO : EPOCH 10 - PROGRESS: at 23.38% examples, 334722 words/s, in_qsize 8, out_qsize 0
2018-05-04 14:24:09,142 : INFO : EPOCH 10 - PROGRESS: at 27.41% examples, 337134 words/s, in_qsize 7, out_qsize 0
2018-05-04 14:24:10,145 : INFO : EPOCH 10 - PROGRESS: at 31.12% e

# Exploring the model

How many words in the model and how many features

In [15]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(28322, 50)

In [16]:
model.wv.vocab

{'salesman': <gensim.models.keyedvectors.Vocab at 0x1dd3846bf98>,
 'groom': <gensim.models.keyedvectors.Vocab at 0x1dd368daba8>,
 'letting': <gensim.models.keyedvectors.Vocab at 0x1dd36b98128>,
 'unshaven': <gensim.models.keyedvectors.Vocab at 0x1dd36890d30>,
 'begun': <gensim.models.keyedvectors.Vocab at 0x1dd368da5c0>,
 'willis': <gensim.models.keyedvectors.Vocab at 0x1dd36b6c2e8>,
 'darnell': <gensim.models.keyedvectors.Vocab at 0x1dd36890cc0>,
 'geek': <gensim.models.keyedvectors.Vocab at 0x1dd368da438>,
 'overly': <gensim.models.keyedvectors.Vocab at 0x1dd36a838d0>,
 'benton': <gensim.models.keyedvectors.Vocab at 0x1dd36b98438>,
 'blissfully': <gensim.models.keyedvectors.Vocab at 0x1dd368da128>,
 'rally': <gensim.models.keyedvectors.Vocab at 0x1dd368e4400>,
 'covering': <gensim.models.keyedvectors.Vocab at 0x1dd36b98160>,
 'inspiring': <gensim.models.keyedvectors.Vocab at 0x1dd36b98550>,
 'ruin': <gensim.models.keyedvectors.Vocab at 0x1dd36890d68>,
 'boarding': <gensim.models.keye

Get an embedding for a word

In [17]:
model.wv['flower']

array([ 0.37915838,  0.09372403,  0.0262548 , -1.17967749,  0.39919809,
       -0.50871468,  0.46438596,  0.09218451, -0.03408242,  0.15108852,
        0.41516727,  1.12218118,  0.93676776,  0.22250989,  1.01879609,
        1.34593356,  0.2620495 ,  0.24139282,  0.99214309,  0.91421306,
       -0.23351651, -0.29771161,  0.74494773,  1.08225477, -0.04539989,
        0.27483591,  0.00826408, -0.55425483, -0.44391334, -0.92362022,
       -0.84691477,  1.1921947 , -0.67792529,  0.69149143, -0.3181636 ,
       -0.23127717,  0.37454739, -0.76166487,  0.36729097,  0.98324329,
       -1.54302824,  0.94953352, -2.00399041,  0.09720147,  0.02581038,
        0.49339962,  0.85171854, -0.50338542,  0.55421662, -0.53529894], dtype=float32)

Saving the model

In [18]:
model.save('word2vec-movie-50')

2018-05-04 14:25:24,511 : INFO : saving Word2Vec object under word2vec-movie-50, separately None
2018-05-04 14:25:24,519 : INFO : not storing attribute vectors_norm
2018-05-04 14:25:24,526 : INFO : not storing attribute cum_table
2018-05-04 14:25:25,144 : INFO : saved word2vec-movie-50


Finding Words which have similar meaning

In [19]:
model.wv.most_similar('fear')

2018-05-04 14:25:35,405 : INFO : precomputing L2-norms of word weight vectors


[('hatred', 0.7750701308250427),
 ('cruelty', 0.7367026805877686),
 ('pain', 0.7201778292655945),
 ('misery', 0.7148659825325012),
 ('humanity', 0.7008926868438721),
 ('confusion', 0.7002967000007629),
 ('anguish', 0.6962993741035461),
 ('loneliness', 0.6903893351554871),
 ('guilt', 0.6878209114074707),
 ('frustration', 0.6854299306869507)]

Find the Word which is not like others

In [20]:
model.doesnt_match("man king child queen".split())

  """Entry point for launching an IPython kernel.


'king'

1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [21]:
model.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.


[('scientist', 0.6305267810821533),
 ('nemesis', 0.5564342141151428),
 ('warlord', 0.5396575927734375),
 ('master', 0.5392469167709351),
 ('soldier', 0.5374979972839355),
 ('buio', 0.5320484638214111),
 ('son', 0.5183924436569214),
 ('golgo', 0.5137350559234619),
 ('toulon', 0.5130910277366638),
 ('maker', 0.509076714515686)]

Loading a model from Memory

In [24]:
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2018-05-04 14:26:04,036 : INFO : loading Word2Vec object from word2vec-movie-50
2018-05-04 14:26:04,423 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2018-05-04 14:26:04,427 : INFO : setting ignored attribute vectors_norm to None
2018-05-04 14:26:04,435 : INFO : loading vocabulary recursively from word2vec-movie-50.vocabulary.* with mmap=None
2018-05-04 14:26:04,455 : INFO : loading trainables recursively from word2vec-movie-50.trainables.* with mmap=None
2018-05-04 14:26:04,473 : INFO : setting ignored attribute cum_table to None
2018-05-04 14:26:04,491 : INFO : loaded word2vec-movie-50
