### Load Gensim Library

In [1]:
!pip install gensim



In [0]:
import gensim

In [0]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [4]:
!ls

sample_data


### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [5]:
#This is needed only if you have uploaded data to Google drive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [6]:
import pandas as pd

#change file path to point to where you have stored the zip file.
df = pd.read_csv('/content/drive/My Drive/classNotes/statisticalNLP/unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)

print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


### Function to Clean up data

In [0]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

### Clean the Data using routine above

In [13]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


In [14]:
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


### Convert Review to a Word List

In [15]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [16]:
len(documents[108])

121

### Build the Model

In [17]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPU Cores
                               size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               iter=10   #Number of iterations over the text corpus
                              )  

2019-12-07 00:46:02,767 : INFO : collecting all words and their counts
2019-12-07 00:46:02,770 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-12-07 00:46:03,299 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2019-12-07 00:46:03,825 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2019-12-07 00:46:04,385 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2019-12-07 00:46:04,930 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2019-12-07 00:46:05,478 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2019-12-07 00:46:05,479 : INFO : Loading a fresh vocabulary
2019-12-07 00:46:06,042 : INFO : effective_min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2019-12-07 00:46:06,047 : INFO : effective_min_count=10 leaves 11910457 word cor

# Exploring the model

### How many words in the model

In [18]:
#Model size
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(28322, 50)

In [19]:
# Vocablury of the model
model.wv.vocab

{'watching': <gensim.models.keyedvectors.Vocab at 0x7f1033fc3278>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7f1059e1d668>,
 'chasers': <gensim.models.keyedvectors.Vocab at 0x7f1059e25b70>,
 'it': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b278>,
 'obvious': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b198>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b3c8>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b2b0>,
 'made': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b400>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b240>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b160>,
 'bunch': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b1d0>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b438>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b470>,
 'maybe': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b4a8>,
 'they': <gensim.models.keyedvectors.Vocab at 0x7f100ca6b4e0>,
 'were': <gensim.models.keyedvectors.Vocab at 0x7f

### Get an embedding for a word

In [20]:
model.wv['flower']

array([-0.6986385 , -1.1521771 , -0.38489717, -0.20228364, -0.44553578,
       -0.00937   , -0.16896184, -0.57302314, -0.28248858, -0.37187326,
        1.7044677 , -0.26498455,  0.7375803 , -0.18606572, -0.46843514,
        0.15523411,  0.18438125,  0.986107  ,  0.11671095,  0.09187862,
       -0.7966294 , -0.46644554,  0.33725765, -2.2658787 ,  0.21272008,
        0.05462522,  1.0833925 , -1.0689386 ,  0.09475855,  0.6652133 ,
        0.45978713, -0.6511303 , -0.778628  ,  1.2860738 ,  0.28800553,
        0.39701656, -0.663345  ,  1.4186627 ,  0.9362171 ,  0.01667313,
       -0.6179613 ,  0.3351227 , -0.02626111, -0.7527745 ,  0.31824034,
        0.58760625,  0.3418974 , -0.49677905, -0.18857573, -1.2116163 ],
      dtype=float32)

### Finding Words which have similar meaning

In [21]:
model.wv.most_similar('great')

2019-12-07 00:50:19,402 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('fantastic', 0.8884958028793335),
 ('terrific', 0.8786453008651733),
 ('wonderful', 0.8739440441131592),
 ('fine', 0.8313859105110168),
 ('good', 0.8302212953567505),
 ('brilliant', 0.8095049858093262),
 ('superb', 0.7917463183403015),
 ('perfect', 0.7840341925621033),
 ('nice', 0.7647720575332642),
 ('fabulous', 0.7349982857704163)]

### Find the word which is not like others

In [27]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'kitchen'

### Saving the model

In [28]:
model.save('word2vec-movie-50')

2019-12-07 00:52:00,362 : INFO : saving Word2Vec object under word2vec-movie-50, separately None
2019-12-07 00:52:00,364 : INFO : not storing attribute vectors_norm
2019-12-07 00:52:00,368 : INFO : not storing attribute cum_table
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-12-07 00:52:00,593 : INFO : saved word2vec-movie-50


In [29]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2019-12-07 00:52:05,128 : INFO : loading Word2Vec object from word2vec-movie-50
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-12-07 00:52:05,285 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2019-12-07 00:52:05,287 : INFO : setting ignored attribute vectors_norm to None
2019-12-07 00:52:05,289 : INFO : loading vocabulary recursively from word2vec-movie-50.vocabulary.* with mmap=None
2019-12-07 00:52:05,291 : INFO : loading trainables recursively from word2vec-movie-50.trainables.* with mmap=None
2019-12-07 00:52:05,292 : INFO : setting ignored attribute cum_table to None
2019-12-07 00:52:05,293 : INFO : loaded word2vec-movie-50


1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [30]:
model.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.
2019-12-07 00:52:17,677 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('scientist', 0.5845720767974854),
 ('joker', 0.5635032057762146),
 ('mastermind', 0.5621315240859985),
 ('toulon', 0.5580616593360901),
 ('hector', 0.5526635050773621),
 ('puppet', 0.5492327213287354),
 ('master', 0.5439491271972656),
 ('warlord', 0.5411689281463623),
 ('shimura', 0.537550151348114),
 ('buio', 0.5331234335899353)]

In [31]:
model.wv['king'] + model.wv['man'] - model.wv['queen']

array([-4.0110874 ,  3.1203845 , -1.4800844 ,  0.96598756, -0.35236132,
        3.0539253 , -5.279897  ,  0.85881996, -0.4635051 , -0.09170306,
       -2.176845  ,  6.954863  ,  3.7289171 , -1.4919454 ,  1.7240916 ,
        2.4132953 , -0.11126602,  2.6064959 ,  0.49891257, -4.3038836 ,
       -0.8722282 ,  1.2754617 , -2.9492958 , -1.9411099 ,  1.5082257 ,
        0.9865351 ,  0.0898366 , -2.7046409 ,  1.2646043 ,  4.4898133 ,
        1.2258763 ,  0.96033   , -5.4216685 ,  3.1572387 ,  0.5975101 ,
        1.2290099 ,  1.6515203 , -2.2924511 , -3.153595  ,  3.18757   ,
        2.6550794 ,  2.1473522 , -3.9034522 , -2.3374574 , -3.5272598 ,
       -2.0754042 , -2.6888778 ,  0.65288734,  2.8974347 , -1.7383344 ],
      dtype=float32)