# Exercise 2

This notebook illustrates how we can use embeddings in Machine Learning tasks.

As always, we first import neccesary modules. We also get our data.

In [1]:
#!pip install embeddingvectorizer    # you need to install this module

In [2]:
# Supervised text classification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn import metrics
import joblib
import eli5
from nltk.sentiment import vader

from embeddingvectorizer import EmbeddingCountVectorizer, EmbeddingTfidfVectorizer
import embeddingvectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier


# general
import numpy as np
import re
# word embedding stuff
import gensim
import gensim.downloader as api
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.corpora import Dictionary
# from gensim.models import WordEmbeddingSimilarityIndex

# data
from courseutils import get_review_data

# lets get more output
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [3]:
gensim.__version__

'4.0.1'

In [4]:
# get data
reviews_train, reviews_test, y_train, y_test = get_review_data()

reviews_train, y_train = shuffle(reviews_train, y_train, random_state=42)
reviews_test, y_test = shuffle(reviews_test, y_test, random_state=42)

# get word embedding model

# pretrained:
# wv = api.load('word2vec-google-news-300')
# wv = api.load("glove-wiki-gigaword-300")

# or our own:
wv = gensim.models.Word2Vec.load("mymodel").wv

Using cached file reviewdata.pickle.bz2


2021-04-29 15:37:38,573 : INFO : loading Word2Vec object from mymodel
2021-04-29 15:37:38,589 : INFO : loading wv recursively from mymodel.wv.* with mmap=None
2021-04-29 15:37:38,590 : INFO : loading vectors from mymodel.wv.vectors.npy with mmap=None
2021-04-29 15:37:38,615 : INFO : loading syn1neg from mymodel.syn1neg.npy with mmap=None
2021-04-29 15:37:38,634 : INFO : setting ignored attribute cum_table to None
2021-04-29 15:37:39,186 : INFO : Word2Vec lifecycle event {'fname': 'mymodel', 'datetime': '2021-04-29T15:37:39.177763', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'loaded'}


In [5]:
# explore data here

# Task 1: Document similarities

In [6]:
termsim_index = WordEmbeddingSimilarityIndex(wv)
documents = [re.split(r"\W",e.lower()) for e in reviews_train[:100]]

In [7]:
id2word = Dictionary(documents)
bow_corpus = [id2word.doc2bow(document) for document in documents]
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, id2word)  # construct similarity matrix
docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)

2021-04-29 15:37:39,216 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-29 15:37:39,236 : INFO : built Dictionary(4674 unique tokens: ['', 'a', 'amigos', 'an', 'any']...) from 100 documents (total 27348 corpus positions)
2021-04-29 15:37:39,238 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(4674 unique tokens: ['', 'a', 'amigos', 'an', 'any']...) from 100 documents (total 27348 corpus positions)", 'datetime': '2021-04-29T15:37:39.237959', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'created'}
2021-04-29 15:37:39,255 : INFO : constructing a sparse term similarity matrix using WordEmbeddingSimilarityIndex(keyedvectors=<gensim.models.keyedvectors.KeyedVectors object at 0x7ff0baa94ca0>, threshold=0.0, exponent=2.0, kwargs={})
2021-04-29 15:37:39,256 : INFO : iterating over columns in dictionary order
100%|██████████| 4674/4674 [00:33<00:00, 139

In [8]:
query = re.split("\W",'''Pulp Fiction may be the single best film ever made, and quite appropriately it is by one of the most 
creative directors of all time, Quentin Tarantino. This movie is amazing from the beginning definition of pulp to
the end credits and boasts one of the best casts ever assembled with the likes of Bruce Willis, Samuel L. Jackson, 
John Travolta, Uma Thurman, Harvey Keitel, Tim Roth and Christopher Walken. The dialog is surprisingly humorous for
this type of film, and I think that's what has made it so successful. Wrongfully denied the many Oscars it was 
nominated for, Pulp Fiction is by far the best film of the 90s and no Tarantino film has surpassed the quality of
this movie (although Kill Bill came close). As far as I'm concerned this is the top film of all-time and definitely 
deserves a watch if you haven't seen it.
'''.lower())
sims = docsim_index[id2word.doc2bow(query)]                                                                

In [9]:
# or let's take a  the first, second, or whatever docuemnt itself

docindex = 2

sims = docsim_index[id2word.doc2bow(documents[docindex])]      

In [10]:
# check wether everything's ok
" ".join(documents[docindex]), reviews_train[docindex]

('after watching this movie i was honestly disappointed   not because of the actors  story or directing   i was disappointed by this film advertisements  br    br   the trailers were suggesting that the battalion  have chosen the third way out  other than surrender or die  polish infos were even misguiding that they had the choice between being killed by own artillery or german guns  they even translated the title wrong as  misplaced battalion    this have tickled the right spot and i bought the movie  br    br   the disappointment started when i realized that the third way is to just sit down and count dead bodies followed by sitting down and counting dead bodies    then i began to think  hey  this story can t be that simple    i bet this clever officer will find some cunning way to save what left of his troops   well  he didn t  they were just sitting and waiting for something to happen  and so was i  br    br   the story was based on real events of world war i  so the writers couldn

In [11]:
for index, similarity in sims:
    print(f"This review has a similarity of {similarity} with our query:")
    print(reviews_train[index][:1000])
    print("\n*************************************************************\n")

This review has a similarity of 1.0 with our query:
After watching this movie I was honestly disappointed - not because of the actors, story or directing - I was disappointed by this film advertisements.<br /><br />The trailers were suggesting that the battalion "have chosen the third way out" other than surrender or die (Polish infos were even misguiding that they had the choice between being killed by own artillery or German guns, they even translated the title wrong as "misplaced battalion"). This have tickled the right spot and I bought the movie.<br /><br />The disappointment started when I realized that the third way is to just sit down and count dead bodies followed by sitting down and counting dead bodies... Then I began to think "hey, this story can't be that simple... I bet this clever officer will find some cunning way to save what left of his troops". Well, he didn't, they were just sitting and waiting for something to happen. And so was I.<br /><br />The story was based on

## Alternative: Word mover's distance

We can also get the WMD instead of the cosine similarity. However, you'd need to think of a efficient way to make use of it:

In [76]:
for i in range(10):
    print(wv.wmdistance(query, reviews_train[i].split()))

2021-04-30 14:05:27,766 : INFO : Removed 22 and 45 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:27,774 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:27,779 : INFO : built Dictionary(133 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 222 corpus positions)
2021-04-30 14:05:27,780 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(133 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 222 corpus positions)", 'datetime': '2021-04-30T14:05:27.780605', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'created'}
2021-04-30 14:05:27,923 : INFO : Removed 22 and 47 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:27,924 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:27,925 : INFO : built Dictionary(153 uniqu

0.8370043332301433
0.6982933622099716


2021-04-30 14:05:28,348 : INFO : Removed 22 and 39 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:28,349 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:28,352 : INFO : built Dictionary(148 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 262 corpus positions)
2021-04-30 14:05:28,353 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(148 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 262 corpus positions)", 'datetime': '2021-04-30T14:05:28.353353', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'created'}
2021-04-30 14:05:28,507 : INFO : Removed 22 and 65 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:28,508 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:28,509 : INFO : built Dictionary(225 uniqu

0.7184615715184782
0.7830858695967454


2021-04-30 14:05:29,163 : INFO : Removed 22 and 44 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:29,164 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:29,165 : INFO : built Dictionary(161 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 291 corpus positions)
2021-04-30 14:05:29,166 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(161 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 291 corpus positions)", 'datetime': '2021-04-30T14:05:29.166293', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'created'}


0.6588292123721998


2021-04-30 14:05:29,366 : INFO : Removed 22 and 169 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:29,367 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:29,369 : INFO : built Dictionary(328 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 614 corpus positions)
2021-04-30 14:05:29,375 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(328 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 614 corpus positions)", 'datetime': '2021-04-30T14:05:29.375497', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'created'}


0.6995978434767701


2021-04-30 14:05:30,203 : INFO : Removed 22 and 33 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:30,204 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:30,205 : INFO : built Dictionary(142 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 248 corpus positions)
2021-04-30 14:05:30,205 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(142 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 248 corpus positions)", 'datetime': '2021-04-30T14:05:30.205893', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'created'}
2021-04-30 14:05:30,345 : INFO : Removed 22 and 56 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:30,346 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:30,347 : INFO : built Dictionary(161 uniqu

0.7086313638713535
0.6600067038261627


2021-04-30 14:05:30,558 : INFO : Removed 22 and 45 OOV words from document 1 and 2 (respectively).
2021-04-30 14:05:30,559 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-30 14:05:30,560 : INFO : built Dictionary(174 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 285 corpus positions)
2021-04-30 14:05:30,561 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(174 unique tokens: ['90s', 'a', 'all', 'although', 'amazing']...) from 2 documents (total 285 corpus positions)", 'datetime': '2021-04-30T14:05:30.561458', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-72-generic-x86_64-with-glibc2.29', 'event': 'created'}


0.6443568344997984
0.7236031977134516


# Task 2: Supervised Machine Learning

## A classical model

In [12]:
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.85      0.87      0.86     12500
         pos       0.87      0.85      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



### Let's discuss

- what happened here under the hood?
- How many features do we have?
- How does X_train "look" like?

**write your conclusions here**

In [13]:
X_train

<25000x74538 sparse matrix of type '<class 'numpy.int64'>'
	with 2241793 stored elements in Compressed Sparse Row format>

Let's rewrite this into a pipeline (for easier use), and let's use a TfIDF vectorizer instead. This is probably as good as it can get.

In [14]:
traditionalpipe = Pipeline([('vectorizer', CountVectorizer(stop_words='english')),
                    ('logreg',LogisticRegression(solver='liblinear'))])

traditionalpipe.fit(reviews_train, y_train)
y_pred = traditionalpipe.predict(reviews_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.85      0.87      0.86     12500
         pos       0.87      0.85      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



**It's not the topic of today, but once we have such a pipeline, we can use a so-called gridsearch to find the optimal settings. For more info, see https://github.com/damian0604/bdaca/blob/master/12ec/week10/lecture10.pdf**

## Let's use embeddings as input instead

In [15]:
# MAKE SURE THAT YOU KNOW WHICH MODEL YOU ARE WORKING ON - can use either self-trained or pre-trained model

# we need to convert `wv` to a slightliy different format:
w2vmodel = dict(zip(wv.index_to_key, wv.vectors))

In [42]:
mypipe = Pipeline([('vectorizer', embeddingvectorizer.EmbeddingCountVectorizer(w2vmodel, operator='mean')),
                    ('svm', 
                     SGDClassifier(loss='hinge', penalty='l2', tol=1e-4, alpha=1e-6, max_iter=1000, random_state=42))])

# Generate BOW representation of word counts
mypipe.fit(reviews_train, y_train)
y_pred = mypipe.predict(reviews_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.88      0.66      0.75     12500
         pos       0.73      0.91      0.81     12500

    accuracy                           0.78     25000
   macro avg       0.80      0.78      0.78     25000
weighted avg       0.80      0.78      0.78     25000



In [40]:
mypipe = Pipeline([('vectorizer', embeddingvectorizer.EmbeddingTfidfVectorizer(w2vmodel, operator='sum')),
                    ('logreg', LogisticRegression(solver='liblinear'))])

# Generate BOW representation of word counts
mypipe.fit(reviews_train, y_train)
y_pred = mypipe.predict(reviews_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.82      0.82      0.82     12500
         pos       0.82      0.82      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [18]:
mypipe = Pipeline([('vectorizer', embeddingvectorizer.EmbeddingCountVectorizer(w2vmodel, operator='sum')),
                    ('logreg', LogisticRegression(solver='liblinear'))])

# Generate BOW representation of word counts
mypipe.fit(reviews_train, y_train)
y_pred = mypipe.predict(reviews_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.83      0.83      0.83     12500
         pos       0.83      0.83      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



### Let's discuss

- what happened here under the hood?
- How many features do we have?
- How does the input matrix "look" like?

**write your conclusions here**

In [63]:
# some illustration
fittedvec = EmbeddingCountVectorizer(w2vmodel, operator='sum').fit(reviews_train)
fittedvec.transform(["This is a test.", "And another one"]).shape

(2, 300)