In [1]:
import nltk
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.svm import LinearSVC
from gensim.models import KeyedVectors
from tqdm import tqdm

In [3]:
sentiment_data = pd.read_csv("./train.csv")

In [4]:
DATA_TO_USE = 250000

texts = sentiment_data['message'].tolist()[:DATA_TO_USE]
labels = np.array(sentiment_data['sentiment'])[:DATA_TO_USE]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, random_state=21)

## Exercise 2: word vectors meet bag of words

In this exercise you use your newly trained word vectors and a simple Bag of Words models to approach the sentiment analysis task

In [2]:
# We will use a convinient wrapper for our word2vec model provided by gensim

In [5]:
w2v_model = KeyedVectors.load_word2vec_format("./simple_cbow.w2v")

In [5]:
# you can get the vector for a word in a simple way
w2v_model['word']

array([ 0.0688487 , -0.169258  , -0.0235873 ,  0.0768677 , -0.0560077 ,
       -0.20571201,  0.0722242 , -0.0238068 ,  0.0942261 ,  0.0445618 ,
       -0.0930745 , -0.132294  , -0.0380145 , -0.0885391 ,  0.0289901 ,
        0.0770231 , -0.122311  , -0.0118321 ,  0.13177601, -0.0227252 ,
        0.16474199, -0.0572786 ,  0.0745815 , -0.0760634 ,  0.0295188 ,
       -0.0441017 , -0.0895143 , -0.0566907 ,  0.0281097 , -0.00958456,
       -0.110452  ,  0.0415194 , -0.0153895 , -0.0745323 ,  0.00762951,
       -0.0319146 , -0.0557301 , -0.0886996 ,  0.0806743 , -0.0471586 ,
       -0.0859856 , -0.20486601, -0.118638  ,  0.111954  ,  0.0344212 ,
       -0.0233198 ,  0.0737764 ,  0.0346345 , -0.0558539 , -0.211096  ,
       -0.0435312 ,  0.115727  , -0.0596598 , -0.0322688 , -0.0538118 ,
        0.0722077 ,  0.0208389 , -0.15472899,  0.0143741 , -0.126031  ,
        0.0622076 ,  0.032778  ,  0.0305527 ,  0.0810955 ,  0.010537  ,
        0.0163742 , -0.0827605 , -0.0131961 ,  0.0289909 , -0.06

In [6]:
# you can easily query the model for word most similar to a give word 
w2v_model.most_similar('funny')

[('cool', 0.5530734062194824),
 ('good', 0.4626672565937042),
 ('nice', 0.44066357612609863),
 ('awesome', 0.438425213098526),
 ('exciting', 0.43373095989227295),
 ('weird', 0.43178021907806396),
 ('cute', 0.40261897444725037),
 ('interesting', 0.3848278522491455),
 ('awesome!', 0.3723982572555542),
 ('amazing', 0.36828142404556274)]

### 2.1

In this exercise you learn how to encode sentences with word2vec using a bag of words approach

In [6]:
# implement a tokenizer that you will use throughout the exercise
# I would recommend a regexp tokenizer for speed, but it's completely up to you
def my_tokenizer(text):
    return nltk.regexp_tokenize(text, '\w+')

In [7]:
def bow_encoder(wmodel, tokenizer, text):
    """
    This function encodes text into a vector.
    
    First, it tokenizes input text using the provided tokenizer function.
    Then it uses the provided word2vec model to get the vectors corresponding to text's tokens.
    Finally, it computes an average of all token's vectors and returns it.
    
    If the function failed to find and encode any words, it should at least return a vector of zeros.
    """
    tokens = tokenizer(text)
    
    zero_vector = np.zeros(w2v_model.vector_size)
    word_vectors = []
    
    for token in tokens:
        if token in wmodel:
            word_vectors.append(wmodel[token]*tfidfmodel[token])
            
    if len(word_vectors):
        sent_vector = np.mean(word_vectors, axis=0)
    else:
        sent_vector = zero_vector
    # your code goes here
    return sent_vector

Now use your new encoder to encode both train_texts and test_texts into matrices.

The number of rows in a matrix should be equal to the number of texts encoded.

The number of columns should be equal to the word2vec space dimansionality (currently = 128)

Just write a little loop.

In [11]:
def encode_sentences_with_bow_encoder(sentences, w2v_model, tokenizer):
    return np.array([bow_encoder(w2v_model, my_tokenizer, t) 
                     for t in tqdm(sentences)] )

In [12]:
train_encoded = encode_sentences_with_bow_encoder(train_texts, w2v_model, my_tokenizer)
test_encoded = encode_sentences_with_bow_encoder(test_texts, w2v_model, my_tokenizer)


  0%|          | 0/187500 [00:00<?, ?it/s][A
  0%|          | 818/187500 [00:00<00:22, 8175.66it/s][A
  1%|          | 1589/187500 [00:00<00:23, 8028.30it/s][A
  1%|▏         | 2366/187500 [00:00<00:23, 7946.98it/s][A
  2%|▏         | 3181/187500 [00:00<00:23, 8005.82it/s][A
  2%|▏         | 4052/187500 [00:00<00:22, 8203.64it/s][A
  3%|▎         | 4894/187500 [00:00<00:22, 8266.62it/s][A
  3%|▎         | 5626/187500 [00:00<00:28, 6291.58it/s][A
  3%|▎         | 6260/187500 [00:00<00:29, 6127.84it/s][A
  4%|▎         | 6963/187500 [00:00<00:28, 6290.14it/s][A
  4%|▍         | 7710/187500 [00:01<00:27, 6602.11it/s][A
  4%|▍         | 8393/187500 [00:01<00:26, 6667.18it/s][A
  5%|▍         | 9065/187500 [00:01<00:28, 6316.46it/s][A
  5%|▌         | 9768/187500 [00:01<00:27, 6514.24it/s][A
  6%|▌         | 10427/187500 [00:01<00:28, 6300.35it/s][A
  6%|▌         | 11091/187500 [00:01<00:27, 6397.93it/s][A
  6%|▋         | 11910/187500 [00:01<00:25, 6846.92it/s][A
  7%|▋ 

 55%|█████▍    | 102527/187500 [00:14<00:11, 7590.61it/s][A
 55%|█████▌    | 103313/187500 [00:14<00:10, 7667.61it/s][A
 56%|█████▌    | 104122/187500 [00:14<00:10, 7789.55it/s][A
 56%|█████▌    | 104974/187500 [00:15<00:10, 7994.18it/s][A
 56%|█████▋    | 105782/187500 [00:15<00:10, 7705.36it/s][A
 57%|█████▋    | 106561/187500 [00:15<00:10, 7382.19it/s][A
 57%|█████▋    | 107350/187500 [00:15<00:10, 7525.57it/s][A
 58%|█████▊    | 108181/187500 [00:15<00:10, 7744.87it/s][A
 58%|█████▊    | 109044/187500 [00:15<00:09, 7989.99it/s][A
 59%|█████▊    | 109875/187500 [00:15<00:09, 8081.91it/s][A
 59%|█████▉    | 110692/187500 [00:15<00:09, 8106.52it/s][A
 59%|█████▉    | 111538/187500 [00:15<00:09, 8207.53it/s][A
 60%|█████▉    | 112406/187500 [00:15<00:09, 8342.58it/s][A
 60%|██████    | 113258/187500 [00:16<00:08, 8394.78it/s][A
 61%|██████    | 114100/187500 [00:16<00:08, 8200.00it/s][A
 61%|██████▏   | 114923/187500 [00:16<00:11, 6253.00it/s][A
 62%|██████▏   | 115620/

In [13]:
assert isinstance(train_encoded, np.ndarray)
assert isinstance(test_encoded, np.ndarray)

assert train_encoded.shape[0] == len(train_texts)
assert train_encoded.shape[1] == w2v_model.vector_size

assert test_encoded.shape[0] == len(test_texts)
assert test_encoded.shape[1] == w2v_model.vector_size
print('done')

done


In [14]:
clf = LinearSVC()
clf.fit(train_encoded, train_labels)
preds = clf.predict(test_encoded)

print(classification_report(test_labels, preds))
print("AUC = {}".format(roc_auc_score(test_labels, preds)))

             precision    recall  f1-score   support

          0       0.64      0.66      0.65     31237
          1       0.65      0.62      0.64     31263

avg / total       0.64      0.64      0.64     62500

AUC = 0.6429359320639207


### Not too impressive!

### 2.2

In this exercise you attempt to improve your encoder by filtering out stop words.

In [15]:
def bow_encoder_with_stopwords(wmodel, tokenizer, stopwords, text):
    
    zero_vector = np.zeros(w2v_model.vector_size)
    
    tokens = tokenizer(text)
    
    zero_vector = np.zeros(w2v_model.vector_size)
    word_vectors = []
    
    for token in tokens:
        if token in wmodel and token.lower() not in stopwords:
            word_vectors.append(wmodel[token])
    if len(word_vectors):
        sent_vector = np.mean(word_vectors, axis=0)
    else:
        sent_vector = zero_vector
        
    return sent_vector

In [16]:
stops = set(nltk.corpus.stopwords.words('english'))

In [15]:
train_encoded = 
test_encoded = 

100%|██████████| 187500/187500 [00:16<00:00, 11398.73it/s]
100%|██████████| 62500/62500 [00:05<00:00, 12188.47it/s]


In [None]:
clf = LinearSVC()
clf.fit(train_encoded, train_labels)
preds = clf.predict(test_encoded)

print(classification_report(test_labels, preds))
print("AUC = {}".format(roc_auc_score(test_labels, preds)))

## Looks like the BoW model is not too good for the job!

![architecture](pics/we_need_to_go_deeper.jpg)

## Introducing: Keras

Keras is a cool library built on top of the computational backend provided by Tensorflow. It provides a layer of abstraction between you and complicated tensor algebra, allowing for rapid prototyping of deep neural networks.

### 3.1: Data preparation

Before we start crunching word vectors with convolutional neural networks, we need to prepare our data.

In [20]:
import keras

Using TensorFlow backend.


In [22]:
# load the vocabulary we created earlier
voc, rvoc = pickle.load(open("../dict_rdict.pkl","rb"))

In [28]:
# we are going to use the whole dataset this time around
texts = sentiment_data['message'].tolist()
labels = np.array(sentiment_data['sentiment'])

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, random_state=21)

Recall the function that turns tokens into their ids

In [25]:
# hint: you may want to use the function you've built during seminar 2
MAX_LEN = 32

def vectorize_tokens(sentence, tokenizer, token_to_id, max_len):
   
    tokens = tokenizer(sentence)
    ids = []    
    for token in tokens:
        ids.append(token_to_id.get(token, token_to_id["UNKN"]))
    if len(ids) < max_len:
        ids += (max_len-len(ids))*[token_to_id["NULL"]]
    else:
        ids = ids[:max_len]

    return ids

Now apply the vectorization function to every sentence from train and test datasets. In the end you should end up with a matrix of shape [len(data), MAX_LEN].

Just write a little loop

In [26]:
def vectorize_sentences(sentences, tokenizer, token_to_id, max_len):
    sentence_ids = []
    
    for sentence in tqdm(sentences):
        sentence_ids.append(vectorize_tokens(sentence, tokenizer, token_to_id, max_len))
        
    return np.array(sentence_ids)

In [29]:
train_vectorized = vectorize_sentences(train_texts, my_tokenizer, voc, MAX_LEN)
test_vectorized = vectorize_sentences(test_texts, my_tokenizer, voc, MAX_LEN)


  0%|          | 0/937500 [00:00<?, ?it/s][A
  0%|          | 139/937500 [00:00<18:13, 856.98it/s][A
  0%|          | 3783/937500 [00:00<12:50, 1212.03it/s][A
  1%|          | 6567/937500 [00:00<09:07, 1699.75it/s][A
  1%|          | 10158/937500 [00:00<06:29, 2379.94it/s][A
  2%|▏         | 14089/937500 [00:00<04:38, 3313.91it/s][A
  2%|▏         | 17287/937500 [00:00<03:23, 4532.83it/s][A
  2%|▏         | 20927/937500 [00:00<02:29, 6147.15it/s][A
  3%|▎         | 24025/937500 [00:00<01:54, 7955.00it/s][A
  3%|▎         | 26954/937500 [00:01<01:34, 9644.76it/s][A
  3%|▎         | 29578/937500 [00:01<01:16, 11896.31it/s][A
  3%|▎         | 32635/937500 [00:01<01:02, 14565.00it/s][A
  4%|▍         | 35387/937500 [00:01<00:53, 16765.90it/s][A
  4%|▍         | 38566/937500 [00:01<00:46, 19535.56it/s][A
  4%|▍         | 41634/937500 [00:01<00:40, 21924.32it/s][A
  5%|▍         | 45361/937500 [00:01<00:35, 25012.29it/s][A
  5%|▌         | 48518/937500 [00:01<00:37, 23949.29

In [30]:
assert isinstance(train_vectorized, np.ndarray)
assert isinstance(test_vectorized, np.ndarray)

assert train_vectorized.shape == (len(train_vectorized), MAX_LEN)
assert test_vectorized.shape == (len(test_vectorized), MAX_LEN)

print('done')

done


### 3.2 Building a deep NN

In [33]:
import keras

Using TensorFlow backend.


In [34]:
embeddings_matrix = w2v_model.syn0

In [35]:
# keras Input layer is basically the same thing as tf.placeholder
# it defines a node where the network will be expecting to recieve input data
input_layer = keras.layers.Input(shape=(MAX_LEN,))

In [36]:
# keras Embedding layer is a container for dense vectors
# it recieves a list of token identifiers of shape [MAX_LEN] 
# and turns it into a matrix of shape [MAX_LEN, EMBEDDING_DIM]

embedding_layer = keras.layers.Embedding(embeddings_matrix.shape[0], embeddings_matrix.shape[1], 
                                         input_length=MAX_LEN, weights=[embeddings_matrix],
                                         trainable=False)(input_layer)
# notice how the input_layer is plugged into the embedding_layer

In [37]:
# keras Convolutional layer implements a set of learnable filters
# that extract local patterns from input data
convolution_layer = keras.layers.Convolution1D(128, 3)(embedding_layer)

In [38]:
# keras GlobalMaxPooling layer applies a max filter to the input feature representation
# only the strongest responses from the previous layer are kept, everything else is discarded
subsampling_layer = keras.layers.GlobalMaxPooling1D()(convolution_layer)

In [39]:
# keras Linear layers apply a simple linear transformation to input data, 
# which is optionally followed by a non-linear activation function
# very useful for building Multi-Layer Perceptrons
linear_layer_1 = keras.layers.Dense(64, activation='relu')(subsampling_layer)
linear_layer_2 = keras.layers.Dense(1, activation='sigmoid')(linear_layer_1)

In [40]:
# this compiles the computational graph we've just created, applies a loss function
# and pre-computes the gradients for back propagation

deep_model = keras.models.Model(inputs=[input_layer], outputs=[linear_layer_2])
deep_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [41]:
deep_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 128)           6400000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 30, 128)           49280     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 6,457,601
Trainable params: 57,601
Non-trainable params: 6,400,000
_____________________________________________________________

In [None]:
deep_model.fit(x=train_vectorized, y=train_labels, batch_size=64, epochs=1, 
               validation_data=[test_vectorized, test_labels])

Train on 937500 samples, validate on 312500 samples
Epoch 1/1

In [78]:
preds = deep_model.predict(test_vectorized)

In [79]:
print("AUC = {}".format(roc_auc_score(test_labels, preds)))

AUC = 0.865941180432684


### Thats more like it! Keep in mind that we only trained a tiny (57k parameters) model because of the limitations of CPU computing power. Using a deeper model with more trainable filters in the Convolution layer would likely result in even stronger predictive power. Stay tuned! 