# Word Representations

## *"I know words. I have the best words!"*
    - Noam Chomsky

# Dense Distributed Representations

In [None]:
import pandas as pd
# Read the first 100,000 rows of the tab-separated review file.
df = pd.read_csv(
    '../data/reviews.full.tsv',
    sep='\t',
    nrows=100000,
)

# Keep the raw texts as a plain Python list, in row order.
documents = df['text'].tolist()

# Peek at the first two documents.
print(documents[:2])

## Word embeddings with `Word2vec`

In [None]:
import os

from gensim.models import Word2Vec
from gensim.models.word2vec import FAST_VERSION

# Whitespace-tokenize every document: Word2Vec consumes a list of token lists.
corpus = [document.split() for document in documents]

# gensim starts exactly `workers` training threads. A non-positive value such
# as -1 does NOT mean "use all cores": it starts no threads at all, so train()
# returns immediately and the vectors keep their random initialisation.
n_workers = os.cpu_count() or 1

# initialize model
# NOTE: `size` and `iter` are the gensim 3.x keyword names (gensim 4 renamed
# them to `vector_size` and `epochs`); they are kept because the rest of this
# notebook relies on the gensim 3.x API.
w2v_model = Word2Vec(size=100,           # dimensionality of the word vectors
                     window=15,          # max distance between target and context word
                     sample=0.0001,      # down-sampling threshold for frequent words
                     iter=200,           # number of passes over the corpus
                     negative=5,         # number of negative samples per positive example
                     min_count=100,      # ignore words seen fewer than 100 times
                     workers=n_workers,  # training threads (must be >= 1)
                     hs=0                # 0 = negative sampling, no hierarchical softmax
)

# Scan the corpus once to collect the vocabulary and word frequencies.
w2v_model.build_vocab(corpus)

# Run the actual training using the epoch count configured above.
w2v_model.train(corpus,
                total_examples=w2v_model.corpus_count,
                epochs=w2v_model.epochs)


Now, we can use the embeddings of the model

In [None]:
# Look up the learned embedding vector for the token 'delivery'.
w2v_model.wv['delivery']

In [None]:
# Nearest neighbours of 'delivery' in the embedding space.
w2v_model.wv.most_similar(positive=['delivery'])

In [None]:
# Analogy query: dog - cat + husband => cat:dog as husband:?
# Words in `positive` are added to the query vector, words in `negative` are subtracted.
w2v_model.wv.most_similar(positive=['dog', 'husband'], negative=['cat'], topn=3)

In [None]:
# The two tokens to compare. The corpus was split without lower-casing,
# so the capitalisation here has to match the tokens as they appear in the text.
word1, word2 = "Cheapest", "friendly"
word_vectors = w2v_model.wv

# Uncomment to inspect the raw embedding of word1:
# print(word_vectors[word1])

# Similarity score between the two words.
pair_similarity = word_vectors.similarity(word1, word2)
print(pair_similarity)

# The three nearest neighbours of word1.
nearest_neighbours = word_vectors.most_similar(word1, topn=3)
print(nearest_neighbours)


In [None]:
# Show the token list of the second document in the corpus.
corpus[1]


### Exercise
Use `spacy` to restrict the words in the reviews to *content words*, i.e., nouns, verbs, and adjectives. Transform the words to lower case and append the POS tag with an underscore. E.g.:

`love_VERB old-fashioneds_NOUN`

This also allows us to distinguish between homographs, i.e., words that are written the same, but belong to different word classes, e.g., *love* in "I **love** old-fashioneds" vs. "He felt so sick, it must have been **love**".


Make sure to exclude sentences that contain none of the above.

Write the resulting corpus to a variable called `word_corpus`.

In [None]:
# Your code here


Rerun the `Word2vec` model from above on the new data set and try out some words.

In [None]:
# Your code here

## Exercise

Train 4 more `Word2vec` models and average the resulting embedding matrices.

In [None]:
# Your code here

## Document embeddings with `Doc2Vec`

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
import os

from gensim.models import Doc2Vec
from gensim.models.doc2vec import FAST_VERSION
from gensim.models.doc2vec import TaggedDocument

# One TaggedDocument per row: the whitespace-split text is the word list and
# the stringified score is the only tag, so every document with the same
# score shares a single document vector (one vector per score category).
# Zipping the two columns avoids building a Series per row as iterrows() does.
# NOTE: this rebinds `corpus`, which previously held the plain token lists
# used for Word2Vec.
corpus = [
    TaggedDocument(words=text.split(), tags=[str(label)])
    for label, text in zip(df.score, df.text)
]

print('done')

# gensim starts exactly `workers` training threads. A non-positive value such
# as -1 does NOT mean "use all cores": it starts no threads at all, so train()
# returns immediately and the vectors keep their random initialisation.
n_workers = os.cpu_count() or 1

d2v_model = Doc2Vec(vector_size=100,    # dimensionality of word and document vectors
                    window=15,          # max distance between target and context word
                    hs=0,               # 0 = negative sampling, no hierarchical softmax
                    sample=0.000001,    # down-sampling threshold for frequent words
                    negative=5,         # number of negative samples per positive example
                    min_count=100,      # ignore words seen fewer than 100 times
                    workers=n_workers,  # training threads (must be >= 1)
                    epochs=500,         # number of passes over the corpus
                    dm=0,               # 0 = distributed bag of words (PV-DBOW)
                    dbow_words=1)       # also train word vectors alongside the doc vectors

# Scan the corpus once to collect the vocabulary and the document tags.
d2v_model.build_vocab(corpus)

# Run the actual training using the epoch count configured above.
d2v_model.train(corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

We can now look at the elements

In [None]:
# Show the document tags the model holds vectors for (the stringified scores used as tags above).
d2v_model.docvecs.doctags

In [None]:
# Tag of the reference document vector; tags are the stringified scores.
target_doc = '1'

# Rank the other tags by similarity to the reference tag, keeping at most five.
similar_docs = d2v_model.docvecs.most_similar(positive=[target_doc], topn=5)
print(similar_docs)

## Exercise

What are the 10 most similar ***words*** to each category?

In [None]:
# your code here
# For every category tag, take its document vector and query the *word*
# vectors for the 10 nearest neighbours, as the exercise asks. The result is
# a dict mapping each tag to its list of (word, similarity) pairs.
{
    tag: d2v_model.wv.most_similar([d2v_model.docvecs[tag]], topn=10)
    for tag in sorted(d2v_model.docvecs.doctags)
}