# Word Representations

## *"I know words. I have the best words!"*
    - Noam Chomsky

## Discrete Sparse Representations

In [14]:
import pandas as pd
df = pd.read_csv('../data/reviews.full.tsv', sep='\t')
documents = df.text.tolist()
print(documents[:2])

["Prices change daily and if you want to really research the price continually at many different sites , I have found cheaper cars elsewhere . However , if you don ' t have a lot of time to research the price , this site has always been among the top three ( e . g ., cheapest ) of the ten sites I use to reserve a car .", 'and the fact that they will match other companies is awesome !!']


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

sentences_2 = documents[:1]

small_vectorizer = CountVectorizer()

X1 = small_vectorizer.fit_transform(sentences_2)

The result is a *sparse count matrix*:

In [5]:
# indexed representation
print(X1)

# dense representation
print(X1.todense())

  (0, 5)	1
  (0, 27)	1
  (0, 37)	1
  (0, 30)	1
  (0, 9)	1
  (0, 33)	1
  (0, 36)	1
  (0, 1)	1
  (0, 4)	1
  (0, 0)	1
  (0, 16)	1
  (0, 28)	1
  (0, 32)	1
  (0, 34)	1
  (0, 22)	2
  (0, 20)	1
  (0, 13)	1
  (0, 18)	1
  (0, 14)	1
  (0, 6)	1
  (0, 8)	1
  (0, 15)	1
  (0, 17)	2
  (0, 29)	2
  (0, 12)	1
  (0, 21)	1
  (0, 3)	1
  (0, 10)	1
  (0, 23)	2
  (0, 31)	4
  (0, 26)	2
  (0, 25)	1
  (0, 35)	3
  (0, 38)	1
  (0, 39)	2
  (0, 19)	2
  (0, 2)	1
  (0, 11)	1
  (0, 7)	1
  (0, 24)	1
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 2 2 1 1 2 1 1 2 1 4 1 1 1 3
  1 1 1 2]]


We can access the mapping from vector position to feature names via `get_feature_names()`:

In [11]:
print(small_vectorizer.get_feature_names())

['always', 'among', 'and', 'at', 'been', 'car', 'cars', 'change', 'cheaper', 'cheapest', 'continually', 'daily', 'different', 'don', 'elsewhere', 'found', 'has', 'have', 'however', 'if', 'lot', 'many', 'of', 'price', 'prices', 'really', 'research', 'reserve', 'site', 'sites', 'ten', 'the', 'this', 'three', 'time', 'to', 'top', 'use', 'want', 'you']


The inverse (the mapping from feature names to vector positions) is encoded as a list in `vocabulary_`:

In [12]:
print(small_vectorizer.vocabulary_)

{'prices': 24, 'change': 7, 'daily': 11, 'and': 2, 'if': 19, 'you': 39, 'want': 38, 'to': 35, 'really': 25, 'research': 26, 'the': 31, 'price': 23, 'continually': 10, 'at': 3, 'many': 21, 'different': 12, 'sites': 29, 'have': 17, 'found': 15, 'cheaper': 8, 'cars': 6, 'elsewhere': 14, 'however': 18, 'don': 13, 'lot': 20, 'of': 22, 'time': 34, 'this': 32, 'site': 28, 'has': 16, 'always': 0, 'been': 4, 'among': 1, 'top': 36, 'three': 33, 'cheapest': 9, 'ten': 30, 'use': 37, 'reserve': 27, 'car': 5}


## Terminology 

![](../../material/pics/matrix.pdf)

Let's redo this for the entire corpus:

In [13]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.001, max_df=0.75, stop_words='english')

X = vectorizer.fit_transform(documents)

print(X.shape)

(117075, 3673)


## Exercise

Use vector operations to find out 
- what the 5 most frequent words are in `X`
- in how many different documents the word `delivery` occurs
- what percentage of the overall corpus that number corresponds to

In [35]:
# your code here
most_freq_words = X.sum(axis=0).argsort()[0, -5:].tolist()[0]
[vectorizer.get_feature_names()[w] for w in most_freq_words]

['order', 'time', 'delivery', '00', 'service']

## Character $n$-grams

We can also use characters to analyze text:

In [36]:
char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=1, max_df=0.75)

C = char_vectorizer.fit_transform(documents[:10])
C

<10x8054 sparse matrix of type '<class 'numpy.int64'>'
	with 10806 stored elements in Compressed Sparse Row format>

In [37]:
print(char_vectorizer.vocabulary_)

{'pr': 5953, 'ic': 4050, 'ce': 2121, 'ch': 2155, 'ng': 5194, 'ge': 3612, ' d': 382, 'da': 2407, 'ai': 1609, 'il': 4153, 'ly': 4732, 'if': 4118, 'f ': 3378, ' y': 1264, 'yo': 8014, 'ou': 5723, 'u ': 7367, 'wa': 7716, 'nt': 5298, 'ea': 2786, 'ar': 1819, 'rc': 6141, 'ti': 7183, 'nu': 5338, 'ua': 7387, 'at': 1904, 'ny': 5348, 'di': 2471, 'ff': 3452, 'fe': 3434, 'en': 3045, 'si': 6696, 'it': 4361, 'te': 7022, ' ,': 51, ', ': 1337, 'i ': 3975, 'av': 1959, 'fo': 3495, 'un': 7438, 'ap': 1805, 'pe': 5880, 'ca': 2106, 'rs': 6380, ' e': 430, 'el': 2985, 'ls': 4709, 'ew': 3324, 'wh': 7773, '. ': 1395, 'ho': 3918, 'ow': 5794, 'we': 7736, 'ev': 3306, 'do': 2505, " '": 17, "' ": 1295, 'a ': 1501, ' l': 690, 'lo': 4679, 'ot': 5699, ' o': 785, 'of': 5474, 'im': 4177, 'hi': 3896, 'as': 1867, 'lw': 4727, 'ay': 1987, 'ys': 8040, ' b': 296, 'be': 2021, 'ee': 2919, 'am': 1680, 'mo': 4891, 'g ': 3554, 'op': 5653, 'p ': 5817, 'hr': 3943, ' (': 34, '( ': 1318, ' g': 524, '.,': 1449, ' )': 42, ') ': 1327, ' u':

## Syntactic $n$-grams

In [39]:
import spacy
nlp = spacy.load('en')
features = [' '.join(["{}_{}".format(c.lemma_, c.head.lemma_) 
                      for c in nlp(sentence)])
            for sentence in documents[:100]]

syntax_vectorizer = CountVectorizer()
X = syntax_vectorizer.fit_transform(features)

In [40]:
print(syntax_vectorizer.vocabulary_)

{'price_change': 2968, 'change_change': 1220, 'daily_change': 1358, 'and_change': 699, 'if_want': 2154, 'pron': 3034, '_want': 400, 'want_find': 4167, 'to_research': 3946, 'really_research': 3088, 'research_want': 3153, 'the_price': 3737, 'price_research': 2980, 'continually_research': 1316, 'at_research': 902, 'many_site': 2432, 'different_site': 1442, 'site_at': 3376, '_find': 188, 'have_find': 2030, 'find_change': 1709, 'cheap_car': 1231, 'car_find': 1199, 'elsewhere_find': 1548, 'however_have': 2112, '_have': 212, 'if_don': 2148, '_don': 157, 'don_have': 1500, 't_have': 3550, 'have_be': 2022, 'a_lot': 461, 'lot_have': 2383, 'of_lot': 2659, 'time_of': 3860, 'research_lot': 3152, '_be': 93, 'this_site': 3827, 'site_be': 3377, 'always_be': 647, 'be_be': 954, 'among_be': 663, 'the_three': 3778, 'top_three': 3993, 'three_among': 3839, '_e': 160, 'e_g': 1517, '_g': 199, 'g_g': 1879, 'cheap_g': 1233, '_cheap': 116, 'of_cheap': 2643, 'the_site': 3762, 'ten_site': 3586, 'site_of': 3380, '_u

# Dense Distributed Representations

## Word embeddings with `Word2vec`

In [41]:
from gensim.models import Word2Vec
from gensim.models.word2vec import FAST_VERSION

corpus = [document.split() for document in documents]
# initialize model
w2v_model = Word2Vec(size=100, 
                     window=15, 
                     hs=0,
                     sample=0.000001,
                     negative=5, 
                     min_count=100,
                     workers=-1, 
                     iter=100
)

w2v_model.build_vocab(corpus)

w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)


(0, 0)

Now, we can use the embeddings of the model

In [47]:
w2v_model.wv['delivery']

array([-3.8216405e-03, -9.6471416e-04,  1.8576628e-03, -8.9886301e-04,
       -1.4340776e-03,  3.3679456e-03,  3.1864131e-03, -3.5410854e-03,
       -4.3218071e-03,  3.8888969e-03,  3.1374877e-03,  8.3776255e-04,
       -4.2235651e-03, -8.7175716e-04, -2.3794929e-03,  2.5215098e-03,
       -4.8746793e-03, -1.4411252e-03, -2.2951297e-03, -4.5871311e-03,
       -4.1621746e-04, -4.2136229e-04,  1.5828571e-03, -4.9697338e-03,
       -1.7614230e-03,  3.1707555e-03, -5.1339852e-05, -6.3822244e-04,
       -2.5855285e-05,  4.5799133e-03, -3.7941795e-03, -1.2655525e-03,
        4.2757452e-03,  4.4429721e-03,  1.4648896e-04,  1.3269613e-03,
       -2.9917513e-03, -4.6256827e-03, -2.9057837e-03,  3.3810834e-04,
       -3.2651729e-03,  4.9557034e-03, -3.3677784e-03,  2.5566041e-03,
        1.3534208e-03, -3.6331678e-03, -1.1628962e-03, -2.8475393e-03,
        7.3542923e-04,  4.4942978e-03,  2.3295432e-03, -2.4741509e-03,
       -1.4409486e-03,  3.6317206e-03,  2.6548465e-03,  4.1404516e-03,
      

In [52]:
# birthday - present + husband => birthday:present as husband:?
w2v_model.wv.most_similar(positive=['birthday', 'husband'], negative=['present'], topn=3)

[('describe', 0.35559749603271484),
 ('0pm', 0.31954193115234375),
 ('afraid', 0.3050075173377991)]

In [56]:
word1 = "Cheapest"
word2 = "friendly"

# retrieve the actual vector
# print(w2v_model.wv[word1])

# compare
print(w2v_model.wv.similarity(word1, word2))

# get the 3 most similar words
print(w2v_model.wv.most_similar(word1, topn=3))


-0.015650862232188242
[('wont', 0.33121049404144287), ('IN', 0.3292524218559265), ('ever', 0.3286404013633728)]



### Exercise
Use `spacy` to restrict the words in the tweets to *content words*, i.e., nouns, verbs, and adjectives. Transform the words to lower case and add the POS with an underderscore. E.g.:

`love_VERB old-fashioneds_NOUN`

This also allows us to distinguish between homographs, i.e., words that are written the same, but belong to different word classes, e.g., *love* in "I **love** old-fashioneds" vs. "He felt so sick, it must have been **love**".


Make sure to exclude sentences that contain none of the above.

Write the resulting corpus to a variable called `word_corpus`.

In [None]:
# Your code here


Rerun the `Word2vec` model from above on the new data set and test the words out

In [None]:
# Your code here

## Exercise

Train 4 more `Word2vec` models and average the resulting embedding matrices.

In [None]:
# Your code here

## Document embeddings with `Doc2Vec`

In [57]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import FAST_VERSION
from gensim.models.doc2vec import TaggedDocument

corpus = []
# for docid, document in enumerate(documents):
#     corpus.append(TaggedDocument(document.split(), tags=["{0:0>4}".format(docid)]))
for row in df.iterrows():
    label = row[1].score
    text = row[1].text
    corpus.append(TaggedDocument(text.split(), tags=[str(label)]))

print('done')
d2v_model = Doc2Vec(vector_size=100, 
                    window=15,
                    hs=0,
                    sample=0.000001,
                    negative=5,
                    min_count=100,
                    workers=-1,
                    epochs=500,
                    dm=0, 
                    dbow_words=1)

d2v_model.build_vocab(corpus)

d2v_model.train(corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

done


We can now look at the elements

In [58]:
d2v_model.docvecs.doctags

{'5': Doctag(offset=0, word_count=4937143, doc_count=92300),
 '4': Doctag(offset=1, word_count=694898, doc_count=10542),
 '1': Doctag(offset=2, word_count=1435268, doc_count=8694),
 '2': Doctag(offset=3, word_count=357466, doc_count=2617),
 '3': Doctag(offset=4, word_count=298913, doc_count=2922)}

In [59]:
target_doc = '1'

similar_docs = d2v_model.docvecs.most_similar(target_doc, topn=5)
print(similar_docs)

[('4', 0.04224696010351181), ('3', 0.011510297656059265), ('5', -0.07985585927963257), ('2', -0.08542491495609283)]


## Exercise

What are the 10 most similar ***words*** to each category?

In [60]:
# your code here
d2v_model.wv.most_similar([d2v_model.docvecs['1']])

[('argos', 0.46029937267303467),
 ('Definitely', 0.32537201046943665),
 ('lock', 0.3130253553390503),
 ('buyers', 0.3006698489189148),
 ('losing', 0.299980103969574),
 ('own', 0.2994605600833893),
 ('good', 0.2925117611885071),
 ('works', 0.29202544689178467),
 ('owned', 0.28484654426574707),
 ('giving', 0.2829483449459076)]