## Semantic Review:

In [1]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models import Word2Vec

In [2]:
from nltk.corpus import brown, movie_reviews, treebank
br = Word2Vec(brown.sents())
mr = Word2Vec(movie_reviews.sents())
tb = Word2Vec(treebank.sents())

In [4]:
br.wv.most_similar('money', topn=5)

[('care', 0.9047278165817261),
 ('job', 0.9020060300827026),
 ('trouble', 0.8821300268173218),
 ('chance', 0.8704777359962463),
 ('work', 0.8690633177757263)]

In [5]:
mr.wv.most_similar('money', topn=5)

[('him', 0.7789386510848999),
 ('attention', 0.7512477040290833),
 ('trouble', 0.7276182174682617),
 ('eyes', 0.7269654870033264),
 ('ready', 0.7255377769470215)]

In [6]:
tb.wv.most_similar('money', topn=5)

[('when', 0.999602735042572),
 ('traders', 0.9995506405830383),
 ('only', 0.9995391964912415),
 ('into', 0.9995371699333191),
 ('managers', 0.9995243549346924)]

## Example with your own dataset

In [7]:
text = open('datasets/carl_sagan_quote.txt').read()
text

"What an astonishing thing a book is. It's a flat object made from a tree with flexible parts on which are imprinted lots of funny dark squiggles. But one glance at it and you're inside the mind of another person, maybe somebody dead for thousands of years. Across the millennia, an author is speaking clearly and silently inside your head, directly to you. Writing is perhaps the greatest of human inventions, binding together people who never knew each other, citizens of distant epochs. Books break the shackles of time. A book is proof that humans are capable of working magic."

In [8]:
def preprocessing(text):
    result = []
    sent = sent_tokenize(text)
    for sentence in sent:
        words = word_tokenize(sentence)
        tokens = [w for w in words if w.lower() not in string.punctuation]
        stopw = stopwords.words('english')
        tokens = [token for token in tokens if token not in stopw]
    # remove words less than three letters
        tokens = [word for word in tokens if len(word)>=3]
    # lemmatize
        lemma = WordNetLemmatizer()
        tokens = [lemma.lemmatize(word) for word in tokens]
        result += [tokens] 
    return result 

In [9]:
text_p = preprocessing(text)
text_p

[['What', 'astonishing', 'thing', 'book'],
 ['flat',
  'object',
  'made',
  'tree',
  'flexible',
  'part',
  'imprinted',
  'lot',
  'funny',
  'dark',
  'squiggle'],
 ['But',
  'one',
  'glance',
  "'re",
  'inside',
  'mind',
  'another',
  'person',
  'maybe',
  'somebody',
  'dead',
  'thousand',
  'year'],
 ['Across',
  'millennium',
  'author',
  'speaking',
  'clearly',
  'silently',
  'inside',
  'head',
  'directly'],
 ['Writing',
  'perhaps',
  'greatest',
  'human',
  'invention',
  'binding',
  'together',
  'people',
  'never',
  'knew',
  'citizen',
  'distant',
  'epoch'],
 ['Books', 'break', 'shackle', 'time'],
 ['book', 'proof', 'human', 'capable', 'working', 'magic']]

In [11]:
# size – Denotes the number of dimensions present in the vectorial forms.
# If you have read the document and have an idea of how many ‘topics’ it has, you can use that number
# sg = 0 for CBOW model and 1 for skip-gram model
# min_count: Ignore all words with total frequency lower than this
# window: the maximum distance between the current and predicted word within
# a sentence.
model = Word2Vec(text_p, min_count=1, sg=1, window =3)
model

<gensim.models.word2vec.Word2Vec at 0x269ef9d0b20>

In [12]:
model.wv.most_similar(positive=['millennium','human'], negative=['magic'], topn=1)

[('year', 0.21988043189048767)]

In [13]:
model.wv.most_similar_cosmul(positive=['millennium', 'human'], negative=['magic'], topn=1)

[('year', 0.7606932520866394)]

In [14]:
model.wv.most_similar(positive=['millennium','human','magic'], topn=1)

[('person', 0.26482051610946655)]

In [15]:
model.wv.doesnt_match("millennium human magic book".split())

'magic'

In [16]:
model.wv.similarity('book', 'invention')

-0.06843189

In [17]:
# The word vectors are stored in a KeyedVectors instance in model.wv. 
#This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec

In [18]:
# word vector: vectorial representation.
model.wv['book']

array([-5.3563481e-04,  2.3609448e-04,  5.1024966e-03,  9.0092830e-03,
       -9.3017444e-03, -7.1180812e-03,  6.4584552e-03,  8.9766849e-03,
       -5.0170152e-03, -3.7658883e-03,  7.3811240e-03, -1.5342073e-03,
       -4.5357402e-03,  6.5539419e-03, -4.8570656e-03, -1.8155428e-03,
        2.8765933e-03,  9.9167612e-04, -8.2854982e-03, -9.4494354e-03,
        7.3100310e-03,  5.0693210e-03,  6.7569013e-03,  7.6063070e-04,
        6.3500670e-03, -3.4060956e-03, -9.4749324e-04,  5.7659470e-03,
       -7.5205108e-03, -3.9364761e-03, -7.5128879e-03, -9.3162252e-04,
        9.5380517e-03, -7.3207384e-03, -2.3316094e-03, -1.9360896e-03,
        8.0759106e-03, -5.9306626e-03,  4.5434106e-05, -4.7544036e-03,
       -9.6032452e-03,  5.0068637e-03, -8.7601868e-03, -4.3897266e-03,
       -3.3673325e-05, -2.9875373e-04, -7.6621128e-03,  9.6147563e-03,
        4.9820123e-03,  9.2332605e-03, -8.1567885e-03,  4.4964016e-03,
       -4.1376371e-03,  8.2622562e-04,  8.4987478e-03, -4.4611846e-03,
      

In [22]:
vocab = list(model.wv.index_to_key)
vocab[:10]

['book',
 'inside',
 'human',
 'magic',
 'squiggle',
 'dead',
 'somebody',
 'maybe',
 'person',
 'another']

## Storing and Loading models:

In [24]:
model.save('datasets/word2vec_model')
new_model = Word2Vec.load('datasets/word2vec_model')

In [25]:
new_model.wv.most_similar(positive=['human','magic'], topn=3)

[('binding', 0.20213058590888977),
 ('break', 0.19651201367378235),
 ('person', 0.19567753374576569)]

## Metrics:

In [26]:
model_ = Word2Vec(text_p, min_count=1, sg=1, window =3, hs=1, negative=0)
model_.score(["The cosmos a space time odyssey".split()]) #Probability of a text under the model

array([0.], dtype=float32)

## Reference:

https://radimrehurek.com/gensim/models/word2vec.html

https://pypi.python.org/pypi/gensim

https://radimrehurek.com/gensim/

https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec