# Beyond counting words: Working with word embeddings

Workshop by Damian Trilling

This notebook illustrates how we can use embeddings in Machine Learning tasks.

As always, we first import neccesary modules. We also get our data.

In [1]:
# getting data
from pathlib import Path
import tarfile
import bz2
import urllib.request
import re
import pickle
import requests
from nltk.tokenize import TreebankWordTokenizer

# Supervised text classification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn import metrics
import joblib
import eli5
from nltk.sentiment import vader

# general
import numpy as np

# word embedding stuff
import gensim.downloader as api



In [2]:
# function I wrote for you to easily get example dataset

def get_review_data(filename = "reviewdata.pickle.bz2", url = "http://cssbook.net/d/aclImdb_v1.tar.gz"):
    '''
    Checks whether review dataset has already been downloaded.
    If not, downloads it.
    
    Parameters
    ----------
    filename : string
        name of cached file
    url : string
        url of IMDB dataset
    
    Returns
    -------
    tuple of lists of strings
        reviews_train, reviews_test, label_train, label_test
    '''

    if Path(filename).exists():
        print(f"Using cached file {filename}")
        with bz2.BZ2File(filename, 'r') as f:
            reviews_train, reviews_test, label_train, label_test = pickle.load(f)
    else:
        print(f"Downloading from {url}")
        fn, _headers = urllib.request.urlretrieve(url, filename=None)
        t = tarfile.open(fn, mode="r:gz")
        reviews_train, reviews_test, label_train, label_test = [], [], [], []
        for file in t.getmembers():
            try:
                _imdb, dataset, label, _fn = Path(file.name).parts
            except ValueError:
                # if the Path cannot be parsed, e.g. because it does not consist of exactly four parts, then it is not a part of the dataset but for instance a folder name. Let's skip it then
                continue
            if dataset == "train" and (label=='pos' or label=='neg'):
                reviews_train.append(t.extractfile(file).read().decode("utf-8"))
                label_train.append(label)
            elif dataset == "test" and (label=='pos' or label=='neg'):
                reviews_test.append(t.extractfile(file).read().decode("utf-8"))
                label_test.append(label)
        print(f"Saving {len(label_train)} training and {len(label_test)} test cases to {filename}")
        with bz2.BZ2File(filename, 'w') as f:
            pickle.dump((reviews_train, reviews_test, label_train, label_test), f)
    return reviews_train, reviews_test, label_train, label_test



In [17]:
# get data
reviews_train, reviews_test, y_train, y_test = get_review_data()

reviews_train, y_train = shuffle(reviews_train, y_train, random_state=42)
reviews_test, y_test = shuffle(reviews_test, y_test, random_state=42)



# get word embedding model

wv = api.load('word2vec-google-news-300')
#wv = api.load("glove-wiki-gigaword-300")

Using cached file reviewdata.pickle.bz2


In [4]:
# explore data here

In [5]:
# explore data here

## A classical model

In [6]:
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.85      0.87      0.86     12500
         pos       0.87      0.85      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



### Let's discuss

- what happened here under the hood?
- How many features do we have?
- How does X_train "look" like?

**write your conclusions here**

## Let's use embeddings as input instead

In [7]:
def vectorize_with_embeddings(model, texts, aggfunc=np.mean):
    '''Takes a word2vec model and a list or generator of texts as input. Yields the mean embedding for the text'''
    for text in texts:
        vectors = []
        for w in text.split():  # point for improvement: better tokenization here
            try:
                vectors.append(model[w])
            except KeyError:
                pass
        vectors = np.array(vectors)
        yield aggfunc(vectors, axis=0)
    

In [8]:
X_train = list(vectorize_with_embeddings(wv, reviews_train))
X_test = list(vectorize_with_embeddings(wv, reviews_test))

logreg2 = LogisticRegression(solver='liblinear')
logreg2.fit(X_train, y_train)

y_pred = logreg2.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.82      0.84      0.83     12500
         pos       0.83      0.82      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



### Let's discuss

- what happened here under the hood?
- How many features do we have?
- How does X_train "look" like?

**write your conclusions here**

## There's a reason why the classical approach worked so good that the embedding approach couldn't add anything.

- can you see what?

In [9]:
reviews_train_short = reviews_train[:200]
reviews_test_short = reviews_test[:200] 
y_train_short = y_train[:200] 
y_test_short = y_test[:200] 

In [10]:
vectorizer = CountVectorizer(stop_words='english')
X_train_short = vectorizer.fit_transform(reviews_train_short)
X_test_short = vectorizer.transform(reviews_test_short)

logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train_short, y_train_short)

y_pred_short = logreg.predict(X_test_short)

print(metrics.classification_report(y_test_short, y_pred_short))

              precision    recall  f1-score   support

         neg       0.72      0.72      0.72       105
         pos       0.69      0.69      0.69        95

    accuracy                           0.71       200
   macro avg       0.71      0.71      0.71       200
weighted avg       0.71      0.71      0.71       200



In [11]:
X_train_short = list(vectorize_with_embeddings(wv, reviews_train_short, np.sum))
X_test_short = list(vectorize_with_embeddings(wv, reviews_test_short, np.sum))

logreg2 = LogisticRegression(solver='liblinear')
logreg2.fit(X_train_short, y_train_short)

y_pred_short = logreg2.predict(X_test_short)

print(metrics.classification_report(y_test_short, y_pred_short))

              precision    recall  f1-score   support

         neg       0.78      0.73      0.75       105
         pos       0.72      0.77      0.74        95

    accuracy                           0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.75      0.75       200



**write your conclusions here**

-- slides in between --

# Keras

## A simple neural network

In [55]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, LSTM, GlobalMaxPooling1D
from keras.layers import Dense
from keras.metrics import Precision, Recall
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

In [13]:
VALIDATION_SIZE = 2500
np.random.seed(666)



In [14]:
def encodeY(Y):
    '''create one-hot (dummies) for output, see also https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
    encode class values as integers
    '''
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    dummy_y = np_utils.to_categorical(encoded_Y)
    return dummy_y

In [15]:
encodeY(['aa','bb','aa','cc','aa','cc'])

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

In [61]:
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
X_test.sort_indices()
X_train.sort_indices()

input_dim = X_train.shape[1]  # Number of features

y_train_int = encodeY(y_train)[:,0]
y_test_int = encodeY(y_test)[:,0]

numberoflabels = 1

In [18]:
model = Sequential()
model.add(Dense(300, input_dim=input_dim, activation='relu'))
#model.add(layers.Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
           optimizer='adam', 
            metrics=['accuracy', Precision(), Recall()])
model.summary()

history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE],
                     epochs=5,
                     verbose=True,
                     validation_data=(X_test[VALIDATION_SIZE:], y_test_int[VALIDATION_SIZE:]))

_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               22361700  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 22,362,001
Trainable params: 22,362,001
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.85, Precision: 0.84, Recall: 0.86


In [20]:
model = Sequential()
model.add(Dense(300, input_dim=input_dim, activation='relu'))
model.add(Dense(300, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
           optimizer='adam', 
            metrics=['accuracy', Precision(), Recall()])
model.summary()

history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE],
                     epochs=5,
                     verbose=True,
                     validation_data=(X_test[VALIDATION_SIZE:], y_test_int[VALIDATION_SIZE:]))

_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 300)               22361700  
_________________________________________________________________
dense_6 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 301       
Total params: 22,452,301
Trainable params: 22,452,301
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.85, Precision: 0.83, Recall: 0.88


## Convolutional Network

# STH WRONG FOR EVERYTHING BELOW - ACCURACY= .5 HENCE EFFECTIVELY RANDOM GUESSING GIVEN TWO EQUAL CLASSES

In [20]:
vectorizer = CountVectorizer(min_df=5, max_df=.9)
vectorizer.fit(reviews_train)
word2idx = {word: idx for idx, word in enumerate(vectorizer.get_feature_names())}
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()
 
def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes

X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in reviews_train]

In [21]:
# Compute the max lenght of a text
MAX_SEQ_LENGTH = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGTH=", MAX_SEQ_LENGTH)
 
N_FEATURES = len(vectorizer.get_feature_names())
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES)
print(X_train_sequences[0])

MAX_SEQ_LENGTH= 1863
[27267 27267 27267 ... 13044  3477 15378]


In [33]:
model = Sequential()
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    300,  # Embedding size
                    input_length=MAX_SEQ_LENGTH))
model.add(Conv1D(300, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(units=300, activation='relu'))
model.add(Dense(units=numberoflabels, activation='sigmoid'))
 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',  Precision(), Recall()])
print(model.summary())


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1863, 64)          1745152   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1859, 64)          20544     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 371, 64)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 23744)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                1519680   
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 3,285,441
Trainable params: 3,285,441
Non-trainable params: 0
____________________________________________

In [34]:
history = model.fit(X_train_sequences[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE], 
          epochs=5, verbose=True,
          validation_data=(X_train_sequences[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff6c3f28fa0>

In [9]:
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in reviews_test]
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES)


In [40]:
_, acc, precision, recall = model.evaluate(X_test_sequences, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Accuracy: 0.50, Precision: 0.00, Recall: 0.00


## LSTM

In [None]:
model = Sequential()
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGHT))
model.add(LSTM(64))
model.add(Dense(units=numberoflabels, activation='sigmoid'))
 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())



In [None]:
model.fit(X_train_sequences[:-VALIDATION_SIZE], y_train[:-VALIDATION_SIZE], 
          epochs=2, batch_size=128, verbose=1, 
          validation_data=(X_train_sequences[-VALIDATION_SIZE:], y_train[-VALIDATION_SIZE:]))

In [None]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1])

## Pretrained embeddings

In [50]:
embedding_layer = wv.get_keras_embedding(train_embeddings=False)
input_dim = (len(X_train_sequences[:-VALIDATION_SIZE]), 300)

In [62]:
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(300, 3, padding='valid',activation='relu',strides=2))
model.add(Conv1D(150, 3, padding='valid',activation='relu',strides=2))
model.add(Conv1D(75, 3, padding='valid',activation='relu',strides=2))
model.add(GlobalMaxPooling1D())
model.add(Dense(150,activation='sigmoid'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',  Precision(), Recall()])
print(model.summary())


Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 300)         120000000 
_________________________________________________________________
conv1d_29 (Conv1D)           (None, None, 300)         270300    
_________________________________________________________________
conv1d_30 (Conv1D)           (None, None, 150)         135150    
_________________________________________________________________
conv1d_31 (Conv1D)           (None, None, 75)          33825     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 75)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 150)               11400     
_________________________________________________________________
dense_24 (Dense)             (None, 1)               

In [63]:
history = model.fit(X_train_sequences[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE], 
          epochs=5, verbose=True,
          validation_data=(X_train_sequences[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [None]:
_, acc, precision, recall = model.evaluate(X_test_sequences, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")