## Training a Gensim Word2Vec model on a small corpus to find similar words

In [1]:
import nltk
from gensim.models import Word2Vec

from nltk.corpus import stopwords

import re

In [2]:
# Raw training corpus: a five-paragraph English speech excerpt kept as one string.
# The text uses typographic apostrophes (’) and an em dash (—) rather than ASCII
# punctuation; after word tokenization these appear as stand-alone tokens.
para = """I’m honored to be with you today because, let’s face it, you accomplished something I never could. If I get through this speech, it’ll be the first time I actually finish something at Harvard. Class of 2017, congratulations!

I’m an unlikely speaker, not just because I dropped out, but because we’re technically in the same generation. We walked this yard less than a decade apart, studied the same ideas and slept through the same Ec10 lectures. We may have taken different paths to get here, especially if you came all the way from the Quad, but today I want to share what I’ve learned about our generation and the world we’re building together.

But first, the last couple of days have brought back a lot of good memories.

How many of you remember exactly what you were doing when you got that email telling you that you got into Harvard? I was playing Civilization and I ran downstairs, got my dad, and for some reason, his reaction was to video me opening the email. That could have been a really sad video. I swear getting into Harvard is still the thing my parents are most proud of me for.

What about your first lecture at Harvard? Mine was Computer Science 121 with the incredible Harry Lewis. I was late so I threw on a t-shirt and didn’t realize until afterwards it was inside out and backwards with my tag sticking out the front. I couldn’t figure out why no one would talk to me — except one guy, KX Jin, he just went with it. We ended up doing our problem sets together, and now he runs a big part of Facebook. And that, Class of 2017, is why you should be nice to people."""

In [3]:
# Preprocessing: lowercase the corpus, blank out every run of digits,
# then collapse each run of whitespace (including newlines) into one space.
text = re.sub(
    r"\s+",
    " ",
    re.sub(r"\d+", " ", para.lower()),
)

In [4]:
# Break the normalized text into a list of sentence strings and show them.
sentences = nltk.tokenize.sent_tokenize(text)
print(sentences)

['i’m honored to be with you today because, let’s face it, you accomplished something i never could.', 'if i get through this speech, it’ll be the first time i actually finish something at harvard.', 'class of , congratulations!', 'i’m an unlikely speaker, not just because i dropped out, but because we’re technically in the same generation.', 'we walked this yard less than a decade apart, studied the same ideas and slept through the same ec lectures.', 'we may have taken different paths to get here, especially if you came all the way from the quad, but today i want to share what i’ve learned about our generation and the world we’re building together.', 'but first, the last couple of days have brought back a lot of good memories.', 'how many of you remember exactly what you were doing when you got that email telling you that you got into harvard?', 'i was playing civilization and i ran downstairs, got my dad, and for some reason, his reaction was to video me opening the email.', 'that c

In [5]:
# Tokenize every sentence string into its word and punctuation tokens.
# NOTE: `sentences` is rebound here from a list of strings to a list of token
# lists, so this cell expects the sentence-splitting cell to have just run.
sentences = list(map(nltk.word_tokenize, sentences))
print(sentences)

[['i', '’', 'm', 'honored', 'to', 'be', 'with', 'you', 'today', 'because', ',', 'let', '’', 's', 'face', 'it', ',', 'you', 'accomplished', 'something', 'i', 'never', 'could', '.'], ['if', 'i', 'get', 'through', 'this', 'speech', ',', 'it', '’', 'll', 'be', 'the', 'first', 'time', 'i', 'actually', 'finish', 'something', 'at', 'harvard', '.'], ['class', 'of', ',', 'congratulations', '!'], ['i', '’', 'm', 'an', 'unlikely', 'speaker', ',', 'not', 'just', 'because', 'i', 'dropped', 'out', ',', 'but', 'because', 'we', '’', 're', 'technically', 'in', 'the', 'same', 'generation', '.'], ['we', 'walked', 'this', 'yard', 'less', 'than', 'a', 'decade', 'apart', ',', 'studied', 'the', 'same', 'ideas', 'and', 'slept', 'through', 'the', 'same', 'ec', 'lectures', '.'], ['we', 'may', 'have', 'taken', 'different', 'paths', 'to', 'get', 'here', ',', 'especially', 'if', 'you', 'came', 'all', 'the', 'way', 'from', 'the', 'quad', ',', 'but', 'today', 'i', 'want', 'to', 'share', 'what', 'i', '’', 've', 'lear

In [6]:
# Drop English stopwords from every tokenized sentence
# (each sentence is a list of word tokens at this point).
# The stopword list is loaded once and kept as a set, so each membership test
# is a constant-time lookup instead of reloading and scanning the list per token.
# Punctuation tokens such as "," or "’" are not stopwords and are kept.
english_stopwords = set(stopwords.words("english"))
sentences = [
    [word for word in sentence if word not in english_stopwords]
    for sentence in sentences
]

print(sentences)

[['’', 'honored', 'today', ',', 'let', '’', 'face', ',', 'accomplished', 'something', 'never', 'could', '.'], ['get', 'speech', ',', '’', 'first', 'time', 'actually', 'finish', 'something', 'harvard', '.'], ['class', ',', 'congratulations', '!'], ['’', 'unlikely', 'speaker', ',', 'dropped', ',', '’', 'technically', 'generation', '.'], ['walked', 'yard', 'less', 'decade', 'apart', ',', 'studied', 'ideas', 'slept', 'ec', 'lectures', '.'], ['may', 'taken', 'different', 'paths', 'get', ',', 'especially', 'came', 'way', 'quad', ',', 'today', 'want', 'share', '’', 'learned', 'generation', 'world', '’', 'building', 'together', '.'], ['first', ',', 'last', 'couple', 'days', 'brought', 'back', 'lot', 'good', 'memories', '.'], ['many', 'remember', 'exactly', 'got', 'email', 'telling', 'got', 'harvard', '?'], ['playing', 'civilization', 'ran', 'downstairs', ',', 'got', 'dad', ',', 'reason', ',', 'reaction', 'video', 'opening', 'email', '.'], ['could', 'really', 'sad', 'video', '.'], ['swear', 'ge

In [7]:
# Train a Word2Vec model on the tokenized, stopword-filtered sentences.
# min_count=2: tokens that occur fewer than two times are ignored.
# seed is pinned and workers=1 forces single-threaded training, because with
# several worker threads the update order depends on OS scheduling and the
# learned vectors differ from run to run.
# NOTE(review): per the gensim docs, fully deterministic runs on Python 3 also
# need PYTHONHASHSEED to be set before the kernel starts — confirm if required.
model = Word2Vec(sentences, min_count=2, seed=1, workers=1)

In [8]:
# Vocabulary learned by the model, as a mapping keyed by token.
# gensim >= 4.0 exposes it as `key_to_index` (accessing `vocab` there raises
# AttributeError); older releases only provide `vocab`. Try the new name first
# so the notebook runs on either version.
try:
    words = model.wv.key_to_index
except AttributeError:
    words = model.wv.vocab

In [9]:
# Fetch the embedding the model learned for the token "harvard" and print it.
vector = model.wv.get_vector("harvard")
print(vector)

[-0.00303428  0.00404894 -0.00244913 -0.00248026  0.003996    0.00236965
  0.00422836  0.00148972 -0.00062612  0.00204408  0.00439603 -0.00015394
  0.00068051  0.00092499  0.00035283 -0.00216949 -0.00144381  0.00324738
  0.00479151 -0.00085619  0.00079029  0.00318717  0.00440426  0.00076404
  0.00453461 -0.00461847  0.00090625  0.00193784 -0.00093581 -0.0036896
  0.00215838  0.00172147 -0.00269235  0.00214148 -0.00178107 -0.00107924
 -0.00264406  0.00189476  0.00256045  0.00371318 -0.00494314 -0.00342586
  0.00387853 -0.00180101 -0.00496756 -0.00333493 -0.00088091  0.00376276
 -0.00103155 -0.00334114 -0.00294978 -0.0020412   0.00363728  0.00117224
 -0.00323676  0.00232899  0.00240414 -0.00290166  0.00282261  0.00272644
  0.00230327  0.00086343  0.00496699 -0.00374963  0.00473863 -0.00480256
 -0.00139065 -0.0007083  -0.00330416  0.0012048  -0.00478155  0.00493229
 -0.00205918 -0.0043739  -0.00412051  0.00194477  0.00481531 -0.00483341
 -0.00376711  0.00272676  0.00465771 -0.00017847 -0.

In [10]:
# Ask the model for the tokens closest to "harvard"; the result is a list of
# (token, similarity score) pairs, which is printed as-is.
similar = model.wv.most_similar(positive=["harvard"])
print(similar)

[('generation', 0.1920119673013687), (',', 0.1262374222278595), ('get', 0.06421398371458054), ('’', 0.051325954496860504), ('got', 0.046337973326444626), ('?', 0.0068762339651584625), ('could', -0.008338935673236847), ('today', -0.02444247342646122), ('together', -0.030627883970737457), ('class', -0.0313822478055954)]
