##Word2Vec Demo##
From https://github.com/nltk/nltk/blob/develop/nltk/test/gensim.doctest


In [14]:
# to get gensim, to to https://radimrehurek.com/gensim/
# OR run this on your command line: easy_install -U gensim 

import nltk
import numpy as np
import gensim
from gensim.models import Word2Vec
from nltk.data import find
import pandas as pd


In [2]:
# To get the model file needed, do the following one time only:
#one time only: Run download; view the UI that pops up; switch to the models tab, and download the word2vec_sample model
# nltk.download()

In [86]:

word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False)

In [83]:
for doc in word2vec_sample:
    words = filter(lambda x: x in model.vocab, doc.words)

AttributeError: 'str' object has no attribute 'words'

# OLD STUFF FOR REFERENCE

We pruned the model to only include the most common words (~44k words).


In [4]:
len(model.vocab)

43981

Each word is represented in the space of 300 dimensions:


In [5]:
len(model['university'])

300

Finding the top n words that are similar to a target word is simple. The result is the list of n words with the score.


In [6]:
model.most_similar(positive=['university'], topn = 10)

[('universities', 0.7003918290138245),
 ('faculty', 0.6780907511711121),
 ('undergraduate', 0.6587095260620117),
 ('campus', 0.6434987783432007),
 ('college', 0.6385269165039062),
 ('academic', 0.6317198276519775),
 ('professors', 0.6298646926879883),
 ('undergraduates', 0.6149813532829285),
 ('University', 0.6139305233955383),
 ('student', 0.6005401611328125)]

Finding a word that is not in a list is also supported in the API.


In [7]:
model.doesnt_match('breakfast cereal dinner lunch'.split())

'cereal'

Mikolov et al. (2013) figured out the following famous exampe:  word embedding captures much of syntactic and semantic regularities. For example,
the vector 'King - Man + Woman' is close to 'Queen' and 'Germany - Berlin + Paris' is close to 'France'.

In [8]:
model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)

[('queen', 0.7118192911148071)]

In [9]:
model.most_similar(positive=["face", "person", 'triumph', 'won'], topn = 10)

[('win', 0.6727384924888611),
 ('victory', 0.5993870496749878),
 ('wins', 0.5796988606452942),
 ('victories', 0.5518628358840942),
 ('winning', 0.5479387044906616),
 ('triumphs', 0.5340350866317749),
 ('clinch', 0.5326650142669678),
 ('victorious', 0.5290749073028564),
 ('faces', 0.5236936807632446),
 ('defeated', 0.5202665328979492)]

In [10]:
model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)

[('France', 0.7884091138839722)]

In [11]:
model.most_similar(positive=['president', 'university'], topn=30)

[('chancellor', 0.6200418472290039),
 ('dean', 0.6120452880859375),
 ('President', 0.591903805732727),
 ('faculty', 0.5726973414421082),
 ('rector', 0.5606599450111389),
 ('presidents', 0.5546602606773376),
 ('Provost', 0.5418164730072021),
 ('regents', 0.5399488210678101),
 ('professors', 0.5367733240127563),
 ('universities', 0.5157524347305298),
 ('campus', 0.5094808340072632),
 ('student', 0.5033937692642212),
 ('academic', 0.5031865835189819),
 ('institute', 0.5005171895027161),
 ('undergraduate', 0.48198601603507996),
 ('Professors', 0.47340402007102966),
 ('professor', 0.47276201844215393),
 ('Faculty', 0.47209471464157104),
 ('chairman', 0.4699815511703491),
 ('professorship', 0.467648446559906),
 ('presidency', 0.46344324946403503),
 ('University', 0.45916348695755005),
 ('campuses', 0.45756882429122925),
 ('college', 0.45753854513168335),
 ('trustees', 0.45137834548950195),
 ('Chancellor', 0.4487611949443817),
 ('undergraduates', 0.4440937042236328),
 ('institution', 0.437450

You can train your own models.  Here is an example using NLTK corpora.  This will be an exercise in seeing how different corpora yield different results.

In [12]:
from nltk.corpus import brown
brown_model = gensim.models.Word2Vec(brown.sents())

# It might take some time to train the model. So, after it is trained, it can be saved as follows:

brown_model.save('brown.embedding')
new_model = gensim.models.Word2Vec.load('brown.embedding')

In [13]:
brown_model.most_similar('president')

[('Cardinals', 0.9014449715614319),
 ('upheld', 0.9003393650054932),
 ('Larson', 0.8979992866516113),
 ('Corp.', 0.8945410251617432),
 ('commissioner', 0.8785369992256165),
 ('Hengesbach', 0.877533495426178),
 ('Kong', 0.875813901424408),
 ('resignation', 0.8756513595581055),
 ('Grant', 0.8734716773033142),
 ('Football', 0.8717625141143799)]

# START EMOJINEERING 

In [49]:
def convert_scraped_txt(txt):
    with open("emoji_webscraped.txt") as f_in:
        titles = []
        descriptions = []
        annotations = []
        for line in f_in:
            line = line.strip()
            temp = line.split(", ")
            titles.append(temp[0])
            descriptions.append(temp[1])
            annotations.append(temp[2:len(temp)])
        return titles, descriptions, annotations

titles, descriptions, annotations = convert_scraped_txt("emoji_webscraped.txt")

In [50]:
print(len(titles))
print(len(descriptions))
print(len(annotations))

1282
1282
1282


In [59]:
d = {'titles' : (titles),
     'annotations' : (annotations),
     'descriptions': (descriptions)}
df = pd.DataFrame(d)

In [60]:
df.head()

Unnamed: 0,annotations,descriptions,titles
0,"[face, grin, person]",grinning face,U+1F600
1,"[eye, face, grin, person, smile]",grinning face with smiling eyes,U+1F601
2,"[face, joy, person, tear]",face with tears of joy,U+1F602
3,"[face, mouth, open, person, smile]",smiling face with open mouth,U+1F603
4,"[eye, face, mouth, open, person, smile]",smiling face with open mouth and smiling eyes,U+1F604


count vectorizer on annoations descriptions
clustering with binary data, possibly asocaition rules
tfidf
feature vector
k_means on either full vector (or on lower dimensional space)

In [112]:
list_titles = [list(item) for item in list(df.annotations)]
index_face_person = [index for index,value in enumerate(list_titles) if 'face' in value or 'person' in value]
print(len(index_face_person))
df_face_person = df.iloc[index_face_person]
print(df_face_person.shape)
df_face_person.head()

253
(253, 3)


Unnamed: 0,annotations,descriptions,titles
0,"[face, grin, person]",grinning face,U+1F600
1,"[eye, face, grin, person, smile]",grinning face with smiling eyes,U+1F601
2,"[face, joy, person, tear]",face with tears of joy,U+1F602
3,"[face, mouth, open, person, smile]",smiling face with open mouth,U+1F603
4,"[eye, face, mouth, open, person, smile]",smiling face with open mouth and smiling eyes,U+1F604


In [89]:
list_words = [word for item in list(df.annotations) for word in item]

In [90]:
nltk.FreqDist(list_words).most_common(50)

[('object', 345),
 ('flag', 268),
 ('other', 258),
 ('symbol', 244),
 ('nature', 236),
 ('person', 214),
 ('place', 179),
 ('face', 121),
 ('travel', 95),
 ('office', 85),
 ('animal', 84),
 ('sign', 64),
 ('word', 62),
 ('food', 57),
 ('time', 56),
 ('entertainment', 54),
 ('weather', 49),
 ('vehicle', 47),
 ('activity', 44),
 ('plant', 40),
 ('arrow', 39),
 ('sound', 38),
 ('sport', 37),
 ('body', 33),
 ('emotion', 32),
 ('communication', 31),
 ('clock', 29),
 ('hand', 29),
 ('zodiac', 26),
 ('tool', 26),
 ('clothing', 25),
 ('building', 23),
 ('island', 23),
 ('geometric', 22),
 ('japanese', 22),
 ('celebration', 22),
 ('heart', 22),
 ('space', 22),
 ('game', 20),
 ('mark', 18),
 ('smile', 18),
 ('ball', 17),
 ('eye', 17),
 ('religion', 15),
 ('fairy tale', 15),
 ('sweet', 14),
 ('moon', 14),
 ('prohibited', 14),
 ('no', 14),
 ('not', 14)]

In [79]:
def list_word2vec(_list):
    single_words = [model.most_similar(positive=item, topn = 1)  for item in _list]
    return single_words
            


In [80]:
list_word2vec(df.annotations)

KeyError: "word 'savouring' not in vocabulary"