In [1]:
'''
Notebook created by: Gabriele Sottocornola
for the M.Sc. class of Data & Text Mining
Università degli studi di Milano-bicocca
'''
import numpy as np
import tensorflow as tf
import pandas as pd

This is an advanced Python notebook that introduces word embeddings as a feature constructor for classification. The task consists of the sentiment classification of a set of movie reviews written by IMDb users. To do that, we exploit the pre-trained 100d GloVe model (download it from the nlp.stanford website).

After some pre-processing, we pad (or truncate) every review to the same SEQUENCE_LENGHT words and build a 3D tensor holding, for each word, its 100d representation in the GloVe embedding.
Finally, we inject the resulting embedding lookup into a CNN (very similar to the MNIST one) and train its parameters for the classification task.

In [2]:
### PARAMETERS ###

# Forward slashes keep these relative paths portable: Python's file APIs accept them on
# Windows as well as on Linux/macOS, whereas back-slashed paths only resolve on Windows.
GLOVE_PATH = './data/glove.6B.100d.txt' # 100d GloVe embedding --> http://nlp.stanford.edu/data/glove.6B.zip
DATA_PATH = './data/IMDb_reviews_sentiment_classification.csv'
MAX_NB_WORDS = 50000 # vocabulary size handed to the tokenizer; also caps the embedding matrix rows
EMBEDDING_DIM = 100 # width of each word vector (must match the GloVe file above)
SEQUENCE_LENGHT = 1000 # every review is padded/truncated to this many tokens (spelling kept: the name is used throughout)
TEST_SIZE = 2000 # number of samples held out for testing

In [3]:
# import data

# Load and shuffle in one step so re-running the cell always starts from the file.
# The fixed random_state makes the shuffle (and therefore any split taken by position
# afterwards) identical on every run.
review_df = (
    pd.read_csv(DATA_PATH, header=0, sep=',', encoding='latin')
    .sample(frac=1, random_state=42)
)

text_list = review_df['review'].tolist() # review texts, in shuffled order
# one-hot encode the labels: class 1 -> [0, 1], any other value -> [1, 0]
label_list = [[0, 1] if el == 1 else [1, 0] for el in review_df['class'].tolist()]

In [4]:
# turn every review into a sequence of integer word ids

# the tokenizer is limited to MAX_NB_WORDS through its num_words argument
tokenizer = tf.contrib.keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text_list) # build the vocabulary from the reviews
word_index = tokenizer.word_index # full word -> id dictionary
sequences = tokenizer.texts_to_sequences(text_list) # one list of ids per review

print('Found {} unique tokens'.format(len(word_index)))
max_len = max(map(len, sequences)) # token count of the longest review
print('The length of the longest sequence is {} tokens'.format(max_len))

Found 123681 unique tokens
The length of the longest sequence is 2442 tokens


In [5]:
def pad_sequences(sequence_list, max_seq_length):
    """Bring every sequence of token ids to exactly ``max_seq_length`` entries.

    Sequences shorter than ``max_seq_length`` are right-padded with zeros; longer
    ones are truncated by dropping the trailing elements.

    Args:
        sequence_list: iterable of sequences (lists or 1-D arrays) of integer ids.
        max_seq_length: non-negative target length of every returned sequence.

    Returns:
        A list with one 1-D ``np.int64`` array of length ``max_seq_length`` per
        input sequence, in the same order. Every element has the same type and
        dtype, including truncated and empty sequences.

    Raises:
        ValueError: if ``max_seq_length`` is negative.
    """
    if max_seq_length < 0:
        raise ValueError('max_seq_length must be non-negative, got {}'.format(max_seq_length))

    padded_seq = list()
    for seq in sequence_list:
        kept = seq[:max_seq_length] # drops the overflow; leaves short sequences untouched
        missing = max_seq_length - len(kept)
        # astype() pins the dtype: np.pad alone would return float zeros for an empty sequence
        fixed = np.pad(kept, (0, missing), 'constant', constant_values=0).astype(np.int64)
        padded_seq.append(fixed)

    return padded_seq

In [6]:
padded_seq = pad_sequences(sequences, SEQUENCE_LENGHT) # pad or truncate every token-id sequence to exactly SEQUENCE_LENGHT entries

In [7]:
# load the GloVe file into a dictionary: word -> embedding vector (float32 array)

embeddings_index = dict()
with open(GLOVE_PATH, 'r', encoding='utf-8') as glove_f:
    # each line holds a word followed by the whitespace-separated components of its vector
    for raw_line in glove_f:
        fields = raw_line.split()
        embeddings_index[str(fields[0])] = np.asarray(fields[1:], dtype='float32')

print('Found {} word vectors of dimension {}'.format(len(embeddings_index), len(embeddings_index['the'])))

Found 400000 word vectors of dimension 100


In [8]:
# prepare embedding matrix

# Token ids start at 1 (row 0 belongs to the zero padding id), so a vocabulary of
# len(word_index) words needs len(word_index) + 1 rows. Without the "+ 1", a vocabulary
# smaller than MAX_NB_WORDS would make the highest id index past the last row.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM)) # one row per word id, one column per embedding dimension
for word, i in word_index.items():
    if i >= num_words:
        # id outside the retained vocabulary: it has no row in the matrix
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words missing from the GloVe index keep their all-zeros row
        embedding_matrix[i] = embedding_vector

In [9]:
# display the GloVe vector stored for the word 'the'
embeddings_index['the']

array([-0.038194  , -0.24487001,  0.72812003, -0.39961001,  0.083172  ,
        0.043953  , -0.39140999,  0.3344    , -0.57545   ,  0.087459  ,
        0.28786999, -0.06731   ,  0.30906001, -0.26383999, -0.13231   ,
       -0.20757   ,  0.33395001, -0.33848   , -0.31742999, -0.48335999,
        0.1464    , -0.37303999,  0.34577   ,  0.052041  ,  0.44946   ,
       -0.46970999,  0.02628   , -0.54154998, -0.15518001, -0.14106999,
       -0.039722  ,  0.28277001,  0.14393   ,  0.23464   , -0.31020999,
        0.086173  ,  0.20397   ,  0.52623999,  0.17163999, -0.082378  ,
       -0.71787   , -0.41531   ,  0.20334999, -0.12763   ,  0.41367   ,
        0.55186999,  0.57907999, -0.33476999, -0.36559001, -0.54856998,
       -0.062892  ,  0.26583999,  0.30204999,  0.99774998, -0.80480999,
       -3.0243001 ,  0.01254   , -0.36941999,  2.21670008,  0.72201002,
       -0.24978   ,  0.92136002,  0.034514  ,  0.46744999,  1.10790002,
       -0.19358   , -0.074575  ,  0.23353   , -0.052062  , -0.22

In [10]:
# hold out the last TEST_SIZE samples for testing; everything before them is training data
train_seq, test_seq = padded_seq[:-TEST_SIZE], padded_seq[-TEST_SIZE:]
train_labels, test_labels = label_list[:-TEST_SIZE], label_list[-TEST_SIZE:]

print('Length of train data: {}'.format(len(train_seq)))
print('Length of test data: {}'.format(len(test_seq)))

Length of train data: 48000
Length of test data: 2000


In [11]:
def weight_variable(shape):
    """Return a tf.Variable of the given shape, initialised from a truncated normal with stddev 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv1d(x, W):
    """Apply a 1-D convolution of x with the filters W, using stride 1 and 'SAME' padding."""
    conv_out = tf.nn.conv1d(x, W, stride=1, padding='SAME')
    return conv_out

def max_pool1d(x, stride):
    """Max-pool x along its third dimension with 'VALID' padding.

    ``stride`` is used both as the pooling window size and as the step between
    windows, so consecutive windows do not overlap.
    """
    window = [1, 1, stride, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='VALID')

In [13]:
# MODEL CONSTRUCTION (COMPUTATION GRAPH)
# Every layer size is derived from SEQUENCE_LENGHT / EMBEDDING_DIM, so changing those
# parameters no longer requires editing literal shapes in this cell.
kernel_width = 5   # number of consecutive words covered by each convolution filter
num_filters = 128  # feature maps produced by every convolution layer
pool_width = 5     # window (and step) of every max-pooling layer
num_classes = 2    # width of the one-hot target

# sequence length left after each non-overlapping 'VALID' pooling step
pooled_len1 = SEQUENCE_LENGHT // pool_width
pooled_len2 = pooled_len1 // pool_width
pooled_len3 = pooled_len2 // pool_width
flat_dim = pooled_len3 * num_filters # 8 * 128 = 1024 when SEQUENCE_LENGHT is 1000

seq_input = tf.placeholder(tf.int64, [None, SEQUENCE_LENGHT])  #input text ids sequence
# The embedding table is a non-trainable variable; it is filled once by running emb_init
# with emb_matrix fed from the numpy embedding matrix.
emb_W = tf.Variable(tf.zeros(shape=[num_words, EMBEDDING_DIM]), trainable=False)
emb_matrix = tf.placeholder(tf.float32, [num_words, EMBEDDING_DIM]) #inject embedding matrix into tf computational graph
emb_init = emb_W.assign(emb_matrix)
emb_layer = tf.nn.embedding_lookup(emb_W, seq_input) #map each input id to its embedding vector

# --- convolution + pooling, block 1 ---
W_conv1 = weight_variable([kernel_width, EMBEDDING_DIM, num_filters])
b_conv1 = bias_variable([num_filters])
h_conv1 = tf.nn.relu(conv1d(emb_layer, W_conv1) + b_conv1)

# max_pool1d works on 4-D input: add a unit dimension before pooling, drop it afterwards
h_conv1 = tf.reshape(h_conv1, [-1, 1, SEQUENCE_LENGHT, num_filters])
h_pool1 = max_pool1d(h_conv1, pool_width)
h_pool1 = tf.reshape(h_pool1, [-1, pooled_len1, num_filters])

# --- convolution + pooling, block 2 ---
W_conv2 = weight_variable([kernel_width, num_filters, num_filters])
b_conv2 = bias_variable([num_filters])
h_conv2 = tf.nn.relu(conv1d(h_pool1, W_conv2) + b_conv2)

h_conv2 = tf.reshape(h_conv2, [-1, 1, pooled_len1, num_filters])
h_pool2 = max_pool1d(h_conv2, pool_width)
h_pool2 = tf.reshape(h_pool2, [-1, pooled_len2, num_filters])

# --- convolution + pooling, block 3 ---
W_conv3 = weight_variable([kernel_width, num_filters, num_filters])
b_conv3 = bias_variable([num_filters])
h_conv3 = tf.nn.relu(conv1d(h_pool2, W_conv3) + b_conv3)

h_conv3 = tf.reshape(h_conv3, [-1, 1, pooled_len2, num_filters])
h_pool3 = max_pool1d(h_conv3, pool_width)

# --- fully connected output layer ---
W_fc1 = weight_variable([flat_dim, num_classes])
b_fc1 = bias_variable([num_classes])

h_flat = tf.reshape(h_pool3, [-1, flat_dim])
pred_fc = tf.matmul(h_flat, W_fc1) + b_fc1 #predicted vector (unnormalised class scores)

y_val = tf.placeholder(tf.int64, [None, num_classes]) #target class (one-hot)

In [14]:
# LOSS FUNCTION AND TRAINING STEP
# total loss = mean softmax cross entropy over the batch + L2 regularisation of the output weights
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=pred_fc, labels=y_val)
l2_penalty = tf.nn.l2_loss(W_fc1)
cross_entropy = tf.reduce_mean(per_example_loss) + l2_penalty
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# EVALUATION
prediction = tf.argmax(pred_fc, 1) #index of the highest-scoring class
true_class = tf.argmax(y_val, 1) #index of the 1 in the one-hot target
correct_prediction = tf.equal(prediction, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) #fraction of correct predictions

In [17]:
# COMPUTATION PHASE
num_iter = 5000
batch_size = 512
eval_every = 100 # report and record the accuracies every eval_every steps

# accuracy history, one entry per evaluation step
train_acc_list = list()
test_acc_list = list()
step_list = list()

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# Copy the pre-trained embeddings into emb_W once. emb_matrix is only an input of
# emb_init, so no later run needs to feed it again.
sess.run(emb_init, feed_dict={emb_matrix: embedding_matrix})

test_feed = {seq_input: test_seq, y_val: test_labels}

for i in range(1, num_iter + 1):
    # draw a random mini-batch without repeating samples inside the batch
    batch_indices = np.random.choice(len(train_seq), batch_size, replace=False)
    batch_xs = [train_seq[idx] for idx in batch_indices]
    batch_ys = [train_labels[idx] for idx in batch_indices]
    batch_feed = {seq_input: batch_xs, y_val: batch_ys}

    if i%eval_every == 0:
        # the training accuracy is measured on the current batch before it is trained on
        train_accuracy = accuracy.eval(feed_dict=batch_feed)
        test_accuracy = accuracy.eval(feed_dict=test_feed)
        step_list.append(i)
        train_acc_list.append(train_accuracy)
        test_acc_list.append(test_accuracy)
        print("step %d, training accuracy %g"%(i, train_accuracy))
        print("test accuracy %g"%(test_accuracy))
    train_step.run(feed_dict=batch_feed)

print("FINAL TEST ACCURACY %g"%accuracy.eval(feed_dict=test_feed))

step 100, training accuracy 0.578125
test accuracy 0.5495
step 200, training accuracy 0.617188
test accuracy 0.5755
step 300, training accuracy 0.626953
test accuracy 0.6045
step 400, training accuracy 0.65625
test accuracy 0.6355
step 500, training accuracy 0.708984
test accuracy 0.669
step 600, training accuracy 0.705078
test accuracy 0.6815
step 700, training accuracy 0.744141
test accuracy 0.703
step 800, training accuracy 0.753906
test accuracy 0.7155
step 900, training accuracy 0.761719
test accuracy 0.7295
step 1000, training accuracy 0.785156
test accuracy 0.732
step 1100, training accuracy 0.833984
test accuracy 0.7445
step 1200, training accuracy 0.818359
test accuracy 0.749
step 1300, training accuracy 0.826172
test accuracy 0.7535
step 1400, training accuracy 0.816406
test accuracy 0.7615
step 1500, training accuracy 0.806641
test accuracy 0.7655
step 1600, training accuracy 0.851562
test accuracy 0.77
step 1700, training accuracy 0.826172
test accuracy 0.7725
step 1800, tr

In [1]:
###############################################################################################################################

## Take-aways

+ Deep Learning can also achieve excellent performance on NLP tasks, e.g. sentiment analysis

+ CNNs are also used to model text, thanks to their ability to find local patterns in data

+ Dealing with text can be very tricky (due to the sparsity of data and the required computational power)

+ Importing pre-trained model is often a good choice (e.g. word2vec, GloVe)

+ Pre-trained models are also useful to initialize and fine-tune new embedding models