# Beyond counting words: Working with word embeddings

Workshop by Damian Trilling

This notebook illustrates how we can use embeddings in Machine Learning tasks.

As always, we first import neccesary modules. We also get our data.

In [1]:
#!pip install embeddingvectorizer    # you need to install this module

In [2]:
# Supervised text classification

from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, LSTM, GlobalMaxPooling1D
from keras.metrics import Precision, Recall
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

# general
import numpy as np
import re
# word embedding stuff
import gensim
import gensim.downloader as api
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.corpora import Dictionary

# data
from courseutils import get_review_data

# lets get more output
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# get data
reviews_train, reviews_test, y_train, y_test = get_review_data()

reviews_train, y_train = shuffle(reviews_train, y_train, random_state=42)
reviews_test, y_test = shuffle(reviews_test, y_test, random_state=42)

# get word embedding model

# pretrained:
# wv = api.load('word2vec-google-news-300')
wv = api.load("glove-wiki-gigaword-300")

# or our own:
#wv = gensim.models.Word2Vec.load("mymodel").wv

Using cached file reviewdata.pickle.bz2


2021-04-13 13:18:40,570 : INFO : loading projection weights from /home/damian/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2021-04-13 13:21:54,742 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /home/damian/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2021-04-13T13:21:54.729984', 'gensim': '4.0.1', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-70-generic-x86_64-with-glibc2.29', 'event': 'load_word2vec_format'}


In [9]:
api.info(name_only=True)['models']

['fasttext-wiki-news-subwords-300',
 'conceptnet-numberbatch-17-06-300',
 'word2vec-ruscorpora-300',
 'word2vec-google-news-300',
 'glove-wiki-gigaword-50',
 'glove-wiki-gigaword-100',
 'glove-wiki-gigaword-200',
 'glove-wiki-gigaword-300',
 'glove-twitter-25',
 'glove-twitter-50',
 'glove-twitter-100',
 'glove-twitter-200',
 '__testing_word2vec-matrix-synopsis']

In [12]:
len(reviews_train) == len(y_train)

True

In [13]:
type(reviews_train)

list

In [14]:
reviews_train[:3]

['Dumb is as dumb does, in this thoroughly uninteresting, supposed black comedy. Essentially what starts out as Chris Klein trying to maintain a low profile, eventually morphs into an uninspired version of "The Three Amigos", only without any laughs. In order for black comedy to work, it must be outrageous, which "Play Dead" is not. In order for black comedy to work, it cannot be mean spirited, which "Play Dead" is. What "Play Dead" really is, is a town full of nut jobs. Fred Dunst does however do a pretty fair imitation of Billy Bob Thornton\'s character from "A Simple Plan", while Jake Busey does a pretty fair imitation of, well, Jake Busey. - MERK',
 "I dug out from my garage some old musicals and this is another one of my favorites. It was written by Jay Alan Lerner and directed by Vincent Minelli. It won two Academy Awards for Best Picture of 1951 and Best Screenplay. The story of an American painter in Paris who tries to make it big. Nina Foch is a sophisticated lady of means and

In [16]:
y_train[:20]

['neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg']

# Keras

## A simple neural network

In [19]:
VALIDATION_SIZE = 2500
np.random.seed(666)



In [18]:
def encodeY(Y):
    '''create one-hot (dummies) for output, see also https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
    encode class values as integers
    '''
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    dummy_y = np_utils.to_categorical(encoded_Y)
    return dummy_y

In [7]:
encodeY(['aa','bb','aa','cc','aa','cc'])

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

In [20]:
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
X_test.sort_indices()
X_train.sort_indices()

input_dim = X_train.shape[1]  # Number of features

y_train_int = encodeY(y_train)[:,0]
y_test_int = encodeY(y_test)[:,0]

numberoflabels = 1

In [22]:
input_dim

74538

In [23]:
model = Sequential()
model.add(Dense(300, input_dim=input_dim, activation='relu'))
#model.add(layers.Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
           optimizer='adam', 
            metrics=['accuracy', Precision(), Recall()])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               22361700  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 22,362,001
Trainable params: 22,362,001
Non-trainable params: 0
_________________________________________________________________


In [24]:
history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE],
                     epochs=5,
                     verbose=True,
                     validation_data=(X_train[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.85, Precision: 0.85, Recall: 0.86


## A model with a second layer

In [25]:
model = Sequential()
model.add(Dense(300, input_dim=input_dim, activation='relu'))
model.add(Dense(300, activation='relu'))
model.add(Dense(300, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
           optimizer='adam', 
            metrics=['accuracy', Precision(), Recall()])
model.summary()

history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE],
                     epochs=5,
                     verbose=True,
                     validation_data=(X_train[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 300)               22361700  
_________________________________________________________________
dense_3 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_4 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 301       
Total params: 22,542,601
Trainable params: 22,542,601
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.85, Precision: 0.83, Recall: 0.87


In [26]:
model = Sequential()
model.add(Dense(300, input_dim=input_dim, activation='relu'))
model.add(Dense(300, activation='relu'))
model.add(Dense(300, activation='relu'))
model.add(Dense(300, activation='relu'))

model.add(Dense(300, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
           optimizer='adam', 
            metrics=['accuracy', Precision(), Recall()])
model.summary()

history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE],
                     epochs=5,
                     verbose=True,
                     validation_data=(X_train[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 300)               22361700  
_________________________________________________________________
dense_7 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_8 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_9 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_10 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 301       
Total params: 22,723,201
Trainable params: 22,723,201
Non-trainable params: 0
__________________________________________

## Convolutional Network

In [27]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

embedding_dim = 300

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews_train)
X_train = tokenizer.texts_to_sequences(reviews_train)
X_test = tokenizer.texts_to_sequences(reviews_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
maxlen = len(max(X_train, key=len)) # never truncate -- alternatively, set max length to lower value 
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [28]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(Conv1D(embedding_dim, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(300, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',  Precision(), Recall()])
print(model.summary())


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1973, 300)         26574900  
_________________________________________________________________
conv1d (Conv1D)              (None, 1969, 300)         450300    
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 301       
Total params: 27,115,801
Trainable params: 27,115,801
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE], 
          epochs=3, verbose=True,
          validation_data=(X_train[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

## Pretrained embeddings

In [20]:
embedding_layer = wv.get_keras_embedding(train_embeddings=False)
input_dim = (len(X_train[:-VALIDATION_SIZE]), 300)

In [21]:
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(embedding_dim, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(300, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',  Precision(), Recall()])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 300)         120000000 
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 300)         450300    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 300)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 301       
Total params: 120,540,901
Trainable params: 540,901
Non-trainable params: 120,000,000
_________________________________________________________________
None


In [22]:
history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE], 
          epochs=5, verbose=True,
          validation_data=(X_train[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Accuracy: 0.82, Precision: 0.91, Recall: 0.72


In [None]:
# too much memory requirements below

In [None]:
'''
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(35))
#model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',  Precision(), Recall()])
print(model.summary())
'''

## LSTM

More examples: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [None]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(LSTM(100))
model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Recall(), Precision()])
print(model.summary())



In [None]:
model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE], 
          epochs=3, batch_size=128, verbose=1, 
          validation_data=(X_train[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))

In [None]:
_, acc, precision, recall = model.evaluate(X_test, y_test_int)
print(f"Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

In [None]:
'''
history = model.fit(X_train[:-VALIDATION_SIZE], y_train_int[:-VALIDATION_SIZE], 
          epochs=5, verbose=True,
          validation_data=(X_train[-VALIDATION_SIZE:], y_train_int[-VALIDATION_SIZE:]))
'''