### Load Gensim Library

In [1]:
!pip install gensim



In [2]:
import gensim

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [0]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [4]:
!ls

sample_data


### Load Text Data

In [6]:
import pandas as pd
df = pd.read_csv('/gdrive/My Drive/AI-ML/unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)

print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


### Function to Clean up data

In [0]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

### Clean the Data using routine above

In [8]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


### Convert Review to a Word List

In [9]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


### Build the Model

In [10]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPUs
                               size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               iter=10   #Number of iterations over the text corpus
                              )  

2019-04-13 07:01:26,250 : INFO : collecting all words and their counts
2019-04-13 07:01:26,251 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-13 07:01:26,850 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2019-04-13 07:01:27,496 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2019-04-13 07:01:28,097 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2019-04-13 07:01:28,709 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2019-04-13 07:01:29,331 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2019-04-13 07:01:29,332 : INFO : Loading a fresh vocabulary
2019-04-13 07:01:29,835 : INFO : effective_min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2019-04-13 07:01:29,836 : INFO : effective_min_count=10 leaves 11910457 word cor

# Exploring the model

### How many words in the model

In [11]:
#Model size
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(28322, 50)

In [12]:
# Vocablury of the model
model.wv.vocab

{'watching': <gensim.models.keyedvectors.Vocab at 0x7fda01803550>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7fda018037b8>,
 'chasers': <gensim.models.keyedvectors.Vocab at 0x7fda017f4e80>,
 'it': <gensim.models.keyedvectors.Vocab at 0x7fda017f4c88>,
 'obvious': <gensim.models.keyedvectors.Vocab at 0x7fda017f4fd0>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9c18>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9c88>,
 'made': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9940>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9ac8>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9b38>,
 'bunch': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9ba8>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9b70>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9b00>,
 'maybe': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9cc0>,
 'they': <gensim.models.keyedvectors.Vocab at 0x7fd9da0a9cf8>,
 'were': <gensim.models.keyedvectors.Vocab at 0x7f

### Get an embedding for a word

In [15]:
model.wv['flower']

array([ 0.7568026 , -0.18180634, -0.17753159, -0.67494357, -1.5582796 ,
        0.08646239,  0.6238796 , -0.17049365,  1.2921526 , -0.87824315,
        0.95820993, -1.087397  ,  0.87916046, -0.57581604,  0.5137825 ,
       -0.11233454, -0.18431784, -1.3852475 ,  0.41678172, -0.0754873 ,
       -0.47281814,  0.8488433 , -0.10011446,  1.1238286 ,  0.7669508 ,
       -0.10805985,  0.54498583, -0.9058127 ,  0.4636573 ,  0.91139543,
       -0.8717332 ,  0.74897826,  0.37250862,  1.152637  ,  0.87119037,
       -0.02884753, -0.29983905, -0.2240433 , -0.24067906, -1.3407695 ,
       -0.3668513 , -0.89341813,  0.32513633, -0.35597268,  0.69412   ,
        0.24599057, -0.48554245,  0.61435413,  1.3205293 , -0.41895688],
      dtype=float32)

### Finding Words which have similar meaning

In [16]:
model.wv.most_similar('great')

2019-04-13 07:07:56,529 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('fantastic', 0.8930344581604004),
 ('terrific', 0.8748549222946167),
 ('wonderful', 0.8648861646652222),
 ('good', 0.8316959142684937),
 ('fine', 0.829056441783905),
 ('brilliant', 0.8062900304794312),
 ('superb', 0.7853832840919495),
 ('perfect', 0.7718095183372498),
 ('nice', 0.7465071678161621),
 ('amazing', 0.743908703327179)]

### Find the word which is not like others

In [17]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'kitchen'

### Saving the model

In [18]:
model.save('word2vec-movie-50')

2019-04-13 07:19:25,328 : INFO : saving Word2Vec object under word2vec-movie-50, separately None
2019-04-13 07:19:25,330 : INFO : not storing attribute vectors_norm
2019-04-13 07:19:25,333 : INFO : not storing attribute cum_table
2019-04-13 07:19:25,575 : INFO : saved word2vec-movie-50


In [0]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [19]:
model.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('soldier', 0.5443345308303833),
 ('enforcer', 0.5437442660331726),
 ('master', 0.5365206003189087),
 ('scientist', 0.5275171399116516),
 ('toulon', 0.5243131518363953),
 ('joker', 0.5217767357826233),
 ('buio', 0.5210888385772705),
 ('vet', 0.511581540107727),
 ('batman', 0.5082988739013672),
 ('yakuza', 0.5009059906005859)]