In [None]:
#word2vec tutorial: https://rare-technologies.com/word2vec-tutorial/

In [1]:
import gensim, logging

# download this file and add it in a folder named "resources"
# this folder has to be in the same folder where you have "Complete Code", "Empty Code", etc.
# link: https://drive.google.com/drive/folders/1AGq5h67_m8D6JNj9va1opdF4oz8AQvwq?usp=sharing

# this pre-trained model is organized like this: word = embeddings

# Load the pre-trained vectors from the word2vec *text* format (binary=False).
model = gensim.models.KeyedVectors.load_word2vec_format(
    "../resources/small-embeddings.txt",
    binary=False,
)


In [18]:
# Is the vocabulary lowercased? Look up a lowercase proper noun and print its vector.
print(model["sidney"])

[-0.27875   0.81053  -0.056025 -0.50963   0.38158   0.95391  -0.75521
 -0.08793  -0.28277   0.52398  -0.344     1.061    -0.45144  -0.20914
 -0.28174   0.13764   0.94775   0.036071 -1.0709    0.72811   0.80155
  0.75824   0.53197  -0.72235  -0.21943  -0.80348   0.54585  -0.21601
 -0.24227  -0.70974  -0.42725  -0.085839  0.27275  -0.94201   0.6457
  0.21807   0.22465   0.67688   0.74787  -0.25021   0.64739   0.73452
 -0.44746  -0.98194   0.39264   0.79881  -0.65376  -0.33434   0.026907
  0.57293 ]


In [None]:
#to see the embeddings of a word, you just do:



In [12]:
# Finding similar words.
# `model` is already a KeyedVectors object, so it is queried directly; the
# `.wv` indirection on KeyedVectors is deprecated in gensim 3.x and no longer
# exists in gensim 4.x.
model.most_similar(positive=["york"])

[('chicago', 0.9128427505493164),
 ('boston', 0.8992941975593567),
 ('angeles', 0.8561200499534607),
 ('d.c.', 0.8524180054664612),
 ('philadelphia', 0.8405068516731262),
 ('new', 0.8292818069458008),
 ('manhattan', 0.8184645175933838),
 ('washington', 0.8131545782089233),
 ('seattle', 0.7988473773002625),
 ('houston', 0.7823652625083923)]

In [13]:
# Get relatedness: similarity score between the vectors of the two words.
# `model` is already a KeyedVectors object, so it is queried directly; the
# `.wv` indirection on KeyedVectors is deprecated in gensim 3.x and no longer
# exists in gensim 4.x.
print(model.similarity("obama", "clinton"))

0.9606045352860738


In [14]:
# you can represent the meaning of an article by the average of its word embeddings
# let's compute the embeddings for an article
import codecs

# Read the whole article as UTF-8 text. The `with` block closes the file handle
# as soon as the text has been read, and `article` only ever holds the text
# (never the file object).
with codecs.open("../datasets/CleanedArticles/15.txt", "r", "utf-8") as article_file:
    article = article_file.read()
print(article)

Advertisement By JACEY FORTINDEC. 31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage. (Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.) In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo. In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight. Big pots of tea were prepared for New Year’s Eve celebrations in Beijing. The country will also celebrate the Lunar New Year, in February. It was raining in Singapore, but New Year’s Eve celebrants sheltered under umbrellas and raincoats as fireworks sparkled overhead. Tourists donned party hats to watch fireworks in front of the famous Petronas Twin Towers in Kuala Lumpur, Malaysia. Hundreds of couples got married at a mass wedding in Jakarta on New Year’s Eve. We’re interested

In [19]:
import codecs, nltk, string
from nltk.corpus import stopwords

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string - we need a simple pipeline for preparing the text that should be matched with the word embedding vocabulary
def nlp_simple_pipeline(text):
    """Turn a raw string into a list of tokens ready for embedding lookup.

    The text is lowercased and tokenised with ``nltk.word_tokenize``. A token
    is kept only if it is not in ``exclude``, is purely alphabetic, and is not
    in ``stop_word_list``.

    Parameters
    ----------
    text : str
        The raw text to prepare.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Whether to lowercase depends on whether the embedding vocabulary is lowercased.
    tokens = nltk.word_tokenize(text.lower())

    # As usual, be careful with stop-word removal.
    return [
        token
        for token in tokens
        if token not in exclude
        and token.isalpha()
        and token not in stop_word_list
    ]

In [None]:
# why not stemming or lemmatization?

In [20]:
# Run the article through the preprocessing pipeline and show the resulting tokens.
cleaned_article = nlp_simple_pipeline(article)
print(cleaned_article)

['advertisement', 'jacey', 'fortindec', 'sydney', 'rainbow', 'fireworks', 'sparkled', 'harbour', 'bridge', 'celebration', 'australia', 'recent', 'legalization', 'gay', 'marriage', 'sydney', 'among', 'first', 'major', 'cities', 'celebrate', 'fireworks', 'stroke', 'midnight', 'japan', 'people', 'paraded', 'fox', 'masks', 'attend', 'first', 'prayer', 'year', 'shinto', 'shrine', 'tokyo', 'philippines', 'revelers', 'gathered', 'phones', 'hand', 'eastwood', 'mall', 'manila', 'watch', 'balloons', 'confetti', 'rain', 'midnight', 'big', 'pots', 'tea', 'prepared', 'new', 'year', 'eve', 'celebrations', 'beijing', 'country', 'also', 'celebrate', 'lunar', 'new', 'year', 'february', 'raining', 'singapore', 'new', 'year', 'eve', 'celebrants', 'sheltered', 'umbrellas', 'raincoats', 'fireworks', 'sparkled', 'overhead', 'tourists', 'donned', 'party', 'hats', 'watch', 'fireworks', 'front', 'famous', 'petronas', 'twin', 'towers', 'kuala', 'lumpur', 'malaysia', 'hundreds', 'couples', 'got', 'married', 'mas

In [22]:
# Look up the embedding of every token, in order. There is no error handling
# here, so the loop stops with a KeyError at the first token that is missing
# from the model's vocabulary.
for word in cleaned_article:
    print(word)
    embed_word = model[word]
    print("ok, I have it!")

advertisement
ok, I have it!
jacey
ok, I have it!
fortindec


KeyError: "word 'fortindec' not in vocabulary"

In [None]:
# handling exceptions
for word in cleaned_article:
    try:
        embed_word = model[word]
    except KeyError:
        # The word is not in the model's vocabulary: skip it and carry on
        # with the next one instead of aborting the whole loop.
        continue


In [None]:
# Collect the embedding of every article word the model knows about.
article_embedd = []

for word in cleaned_article:
    try:
        embed_word = model[word]
    except KeyError:
        # Out-of-vocabulary word: nothing to collect for it.
        pass
    else:
        article_embedd.append(embed_word)

# Number of words for which an embedding was found.
print(len(article_embedd))

In [None]:
# Average the vectors of all words: zip(*...) regroups the collected word
# vectors position by position, and each position is averaged over the words.
doc_emb = [float(sum(values)) / len(values) for values in zip(*article_embedd)]

In [None]:
# homework: try to write a function that does the same
def create_doc_embedding(cleaned_article, embeddings=None):
    """Average the word embeddings of a tokenised article into one vector.

    Parameters
    ----------
    cleaned_article : iterable of str
        The tokens of the article, e.g. the output of ``nlp_simple_pipeline``.
    embeddings : mapping, optional
        Any object that supports ``embeddings[token]`` and raises ``KeyError``
        for unknown tokens. Defaults to the notebook-level ``model``.

    Returns
    -------
    list of float
        The position-wise mean of the vectors of all tokens that have an
        embedding. An empty list is returned when no token has one.
    """
    if embeddings is None:
        # Fall back to the embeddings loaded at the top of the notebook.
        embeddings = model

    vectors = []
    for word in cleaned_article:
        try:
            vectors.append(embeddings[word])
        except KeyError:
            # Word without an embedding: it does not contribute to the average.
            continue

    # zip(*vectors) regroups the vectors position by position; with no vectors
    # it yields nothing, so the result is simply an empty list.
    doc_emb = [float(sum(col)) / len(col) for col in zip(*vectors)]

    return doc_emb