# DL in NLP

## Task 2: Classifying TED talks

Sergei Volodin, senior undergraduate student at MIPT

In [21]:
%matplotlib inline
import tensorflow as tf
import json
import numpy as np
import pandas as pd
import collections
from __future__ import print_function
import re
import random
import math
from sklearn.manifold import TSNE
from matplotlib import pylab
from transliterate import translit
from six.moves import range
import sklearn.metrics
import matplotlib.pyplot as plt
from six.moves import cPickle as pickle
from tqdm import tqdm

In [2]:
def print_unicode(ent):
    """Print ``repr(ent)`` with non-ASCII characters shown readably.

    When ``repr`` returns a byte string (Python 2), its ``\\uXXXX`` escape
    sequences are decoded so that e.g. Cyrillic words are displayed as text.
    When ``repr`` already returns text, it is printed unchanged.

    Args:
        ent: any object.
    """
    text = repr(ent)
    # Only byte strings have escapes to decode; text strings lack .decode().
    if isinstance(text, bytes):
        text = text.decode("unicode-escape")
    print(text)

In [3]:
# Common prefix of the dataset files; '.test' / '.train' are appended below.
filename = 'ted_ru-20160408.json'


def _load_json(path):
    """Read the file at ``path`` and parse its content as JSON.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read. The second positional argument of
    ``json.loads`` is passed through exactly as before.
    """
    with open(path, 'r') as handle:
        return json.loads(handle.read(), 'unicode-escape')


data_test = _load_json(filename + '.test')
data_train = _load_json(filename + '.train')

In [4]:
class_to_vector = {
    1: [0,0,0],
    2: [1,0,0],
    3: [0,1,0],
    4: [0,0,1],
    5: [1,1,0],
    6: [1,0,1],
    7: [0,1,1],
    8: [1,1,1]
}

In [13]:
class_to_vector = {
    1: [-1,-1,-1],
    2: [+1,-1,-1],
    3: [-1,+1,-1],
    4: [-1,-1,+1],
    5: [+1,+1,-1],
    6: [+1,-1,+1],
    7: [-1,+1,+1],
    8: [+1,+1,+1]
}

Plan:

1. Removing non-letters
2. Obtaining words, training word2vec CBOW model
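3. Running a neural network over the document (originally planned as an RNN; the code below uses convolutional filters with max-pooling instead)
4. Classifying based on the resulting document representation (the pooled feature vector rather than a final hidden state)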

In [14]:
# Concatenate the 'content' field of every training item into one
# space-separated string.
all_train_text = ' '.join(talk['content'] for talk in data_train)

In [15]:
# Matches every character that is NOT a Cyrillic or Latin letter. 'ё'/'Ё' are
# listed explicitly because they lie outside the 'а-я' / 'А-Я' ranges.
# The pattern has no backslashes, so a plain u'' literal compiles to the same
# regex as the Python-2-only ur'' prefix.
words_regex = re.compile(u'[^а-яА-ЯёЁa-zA-Z]')


def str_to_words(s):
    """Split text into lower-cased words made only of letters.

    Every non-letter character is replaced by a space, the result is
    lower-cased and then split on whitespace.

    Args:
        s: input text.

    Returns:
        List of words; empty if ``s`` contains no letters.
    """
    return words_regex.sub(' ', s).lower().split()
# Tokenise the whole training corpus into one flat list of lower-cased words.
train_words = str_to_words(all_train_text)

In [16]:
# Number of vocabulary entries, including the 'UNK' entry at index 0.
vocabulary_size = 50000

def build_dataset(words, vocabulary_size):
    """Encode a word sequence as integer ids over a frequency-ranked vocabulary.

    Args:
        words: re-iterable sequence of words (it is traversed more than once).
        vocabulary_size: total number of vocabulary entries, counting 'UNK';
            the ``vocabulary_size - 1`` most frequent words are kept.

    Returns:
        Tuple ``(data, count, dictionary, reverse_dictionary)``:
        data -- list of ids, one per input word (0 for out-of-vocabulary);
        count -- ``['UNK', n_unknown]`` followed by ``(word, frequency)``
            tuples in decreasing frequency;
        dictionary -- word -> id;
        reverse_dictionary -- id -> word.
    """
    unk_entry = ['UNK', -1]  # the -1 is replaced by the real count below
    count = [unk_entry]
    count += collections.Counter(words).most_common(vocabulary_size - 1)

    # Ids are assigned in order of appearance in `count`, so 'UNK' gets 0.
    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        index = dictionary.get(token)
        if index is None:
            # Out-of-vocabulary word: map it to the 'UNK' id.
            index = 0
            unk_count += 1
        data.append(index)

    unk_entry[1] = unk_count
    reverse_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

# Encode the training corpus. `data` is the id sequence later read by
# generate_batch; `dictionary` is later used by str_to_idx.
data, count, dictionary, reverse_dictionary = build_dataset(train_words, vocabulary_size)
# Preview: the first five vocabulary entries with their counts, and the first
# ten encoded tokens.
print_unicode(count[:5])
print_unicode(data[:10])

[['UNK', 139493], (u'и', 92090), (u'в', 83510), (u'что', 57304), (u'я', 47139)]
[4, 49, 42333, 12866, 7686, 21, 4461, 740, 105, 31181]


In [17]:
# Position in `data` of the next target word; advanced by generate_batch.
data_index = 0

def to_range(n):
    """Wrap an arbitrary integer position into ``[0, len(data))``.

    Uses Python's modulo, so negative positions wrap around to the end of
    the module-level ``data`` list.
    """
    corpus_length = len(data)
    return n % corpus_length

# CBOW model
def generate_batch(batch_size, context_size):
    """Produce the next CBOW mini-batch from the module-level ``data``.

    For each of ``batch_size`` consecutive target positions (starting at the
    global ``data_index``) the ``context_size`` words on each side are
    collected. Positions wrap around the ends of the corpus via ``to_range``.
    The global ``data_index`` is advanced by ``batch_size`` (modulo corpus
    length).

    Args:
        batch_size: number of target words in the batch.
        context_size: number of context words taken on each side.

    Returns:
        Tuple ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size, 2 * context_size)`` whose columns alternate
        left/right neighbours at increasing distance; ``labels`` is an int32
        array of shape ``(batch_size, 1)`` holding the target word ids.
    """
    global data_index

    cursor = to_range(data_index)

    contexts = np.ndarray(shape=(batch_size, 2 * context_size), dtype=np.int32)
    targets = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    for row in range(batch_size):
        targets[row] = data[cursor]
        for offset in range(1, context_size + 1):
            column = 2 * (offset - 1)
            # Even column: word `offset` to the left; odd: to the right.
            contexts[row][column] = data[to_range(cursor - offset)]
            contexts[row][column + 1] = data[to_range(cursor + offset)]
        cursor = to_range(cursor + 1)

    data_index = cursor
    return contexts, targets

In [167]:
# Hyper-parameters of the CBOW embedding model.
batch_size = 128  # examples per training step (fixed in the placeholder shapes)
embedding_size = 300 # Dimension of the embedding vector.
context_size = 3  # context words taken on each side of the target
num_sampled = 64  # passed as `num_sampled` to sampled_softmax_loss
graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Inputs: context word ids and the id of the word to predict.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, context_size * 2], name = "train_dataset")
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1], name = "train_labels")

    # Variables.
    # One embedding row per vocabulary entry, plus the weights and biases of
    # the output softmax layer.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), name = "embeddings")
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)), name = "SM_weights")
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]), name = "SM_biases")

    # CBOW: average the embeddings of the context words (axis 1) to get one
    # vector per example, then score it against the target word.
    embed = tf.reduce_mean(tf.nn.embedding_lookup(embeddings, train_dataset), [1], name = "emb_result")
    loss = tf.reduce_mean(
      tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                               labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size), name = "loss")


    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Divide every embedding row by its L2 norm.
    # NOTE(review): a row of exact zeros would produce a division by zero here.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm

    # Scalar summary of the loss; merged for writing during training.
    tf.summary.scalar("Embedding_loss", loss)
    summary_emb = tf.summary.merge_all()

In [193]:
# Directory for summary event files (hard-coded absolute path).
logs_path = '/tmp/nlp2/11'
# Summary writer for the word2vec graph; used by the embedding training loop.
writer = tf.summary.FileWriter(logs_path, graph=graph)

In [169]:
# Train the CBOW embeddings. Despite its name, `epochs` counts mini-batch
# steps: every iteration consumes exactly one batch from generate_batch.
epochs = 20000
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    for epoch in tqdm(range(epochs)):
        batch_data, batch_labels = generate_batch(batch_size, context_size)
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        # One optimisation step; the merged summary is logged for this step.
        _, l, summary = session.run([optimizer, loss, summary_emb], feed_dict = feed_dict)
        writer.add_summary(summary, epoch)
    # The writer is never closed in this notebook, so push the buffered
    # summaries to disk explicitly once training is done.
    writer.flush()
    # Row-normalised embedding matrix, evaluated while the session is open.
    final_embeddings = normalized_embeddings.eval()

100%|██████████| 20000/20000 [04:17<00:00, 77.69it/s]


In [170]:
# Cache the trained embeddings on disk so the training cell can be skipped.
emb_fn = "emb1.pkl"
# Pickle data is binary: open in 'wb', and use `with` so the file is closed
# (and fully written) as soon as the dump finishes.
with open(emb_fn, 'wb') as emb_file:
    pickle.dump(final_embeddings, emb_file)
# To reload instead of retraining (only unpickle files you created yourself --
# unpickling untrusted data can execute arbitrary code):
#with open(emb_fn, 'rb') as emb_file:
#    final_embeddings = pickle.load(emb_file)

In [171]:
# Id used to pad documents. It is one past the last vocabulary id, so it
# selects the extra all-zero row appended to the embeddings in the classifier.
pad_word = vocabulary_size
# Fixed document length (in words) that get_X pads to and the classifier
# graph reshapes to.
max_words = 5600

In [172]:
def str_to_idx(s):
    """Convert raw text into a list of vocabulary ids.

    The text is tokenised with ``str_to_words``; every word is looked up in
    the module-level ``dictionary`` and words that are not present map to 0.

    Args:
        s: input text.

    Returns:
        List of integer ids, one per token.
    """
    return [dictionary.get(token, 0) for token in str_to_words(s)]
def get_Y(data):
    """Build the target matrix for a list of items.

    Each item's ``'class'`` value is looked up in the module-level
    ``class_to_vector`` mapping.

    Args:
        data: sequence of dicts with a ``'class'`` key.

    Returns:
        ``np.ndarray`` with one row per item of ``data``.
    """
    # Iterate over the `data` argument itself: the previous version read the
    # global `data_train` here and therefore ignored its parameter.
    res = [class_to_vector[item['class']] for item in data]
    return np.array(res)
def get_X(data):
    """Encode documents as a fixed-width matrix of vocabulary ids.

    Each item's ``'content'`` is converted with ``str_to_idx``, cut to at most
    ``max_words`` ids and right-padded with ``pad_word`` up to ``max_words``.

    Args:
        data: sequence of dicts with a ``'content'`` key.

    Returns:
        Tuple ``(X, L)``: ``X`` has one row of ``max_words`` ids per document;
        ``L`` holds the number of real (non-padding) ids in each row.
    """
    rows = []
    lengths = []
    for item in data:
        # Truncate over-long documents. Without this, `max_words - len(...)`
        # is negative, no padding is added, and the rows end up with unequal
        # lengths that cannot form a rectangular array.
        indices = str_to_idx(item['content'])[:max_words]
        lengths.append(len(indices))
        rows.append(indices + [pad_word] * (max_words - len(indices)))
    return np.array(rows), np.array(lengths)

In [173]:
# Padded id matrix and per-document lengths for the training set.
X, L = get_X(data_train)

In [174]:
# Target vectors (one row per training document) taken from class_to_vector.
Y = get_Y(data_train)

In [194]:
# Classifier graph: embedding lookup -> parallel convolutions over word
# windows -> max-pooling over the document -> two dense layers -> 3 outputs.
c_graph = tf.Graph()
with c_graph.as_default(), tf.device('/cpu:0'):
    # Padded word ids, document lengths and target vectors.
    # NOTE: text_length is fed by the callers but no op below consumes it
    # (its only use is in the commented-out mean-embedding line).
    text_input = tf.placeholder(tf.int32, shape=[None, None])
    text_length = tf.placeholder(tf.float32, shape=[None])
    ans_input = tf.placeholder(tf.float32, shape=[None, 3])

    # Dropout keep-probabilities. The defaults are the values that used to be
    # hard-coded, so training is unchanged when nothing is fed. Feed 1.0 for
    # both to switch dropout off (e.g. when predicting); with constants this
    # was impossible and predictions were always computed with dropout on.
    keep_prob_pooled = tf.placeholder_with_default(0.5, shape=[], name="keep_prob_pooled")
    keep_prob_dense = tf.placeholder_with_default(0.9, shape=[], name="keep_prob_dense")

    # Trainable copy of the pretrained embeddings plus one frozen all-zero row
    # for the padding id (pad_word == vocabulary_size, i.e. the last row).
    changeable_embeddings = tf.Variable(final_embeddings, name = "embeddings", trainable = True, dtype = tf.float32)
    non_changeable_embeddings = tf.Variable(1. * np.zeros((1, embedding_size)), trainable = False, dtype = tf.float32)
    embeddings = tf.concat([changeable_embeddings, non_changeable_embeddings], 0)

    # Shape [batch, max_words, embedding_size, 1]: a one-channel "image" per
    # document, as expected by conv2d.
    text_embeddings = tf.reshape(tf.nn.embedding_lookup(embeddings, text_input), [-1, max_words, embedding_size, 1])

    #text_mean = tf.reduce_sum(text_embeddings, [1]) / tf.reshape(text_length, [-1, 1])

    num_filters = 128
    pooled_outputs = []
    filter_sizes = [2, 3, 5]
    for filter_size in filter_sizes:
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            # Each filter spans `filter_size` consecutive words and the full
            # embedding width.
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(
                text_embeddings,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Max-pooling over the outputs
            # The window covers every valid position, leaving one value per
            # filter.
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, max_words - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs.append(pooled)

    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    h_drop = tf.nn.dropout(h_pool_flat, keep_prob_pooled)

    dense1 = tf.layers.dense(inputs = h_drop, units = 20, activation=tf.nn.relu)

    dense1d = tf.nn.dropout(dense1, keep_prob_dense)

    # Three tanh outputs, one per component of the class vector.
    dense = tf.layers.dense(inputs = dense1d, units = 3, activation=tf.nn.tanh)

    # squared hinge-style loss (disabled)
    #c_loss = tf.reduce_mean(tf.square(1 - tf.multiply(ans_input, dense)))

    # square loss
    c_loss =  tf.reduce_mean(tf.squared_difference(ans_input, dense))

    # Fraction of output components whose sign equals the target component.
    accuracy = tf.reduce_mean(tf.contrib.metrics.accuracy(tf.to_int32(ans_input), tf.to_int32(tf.sign(dense))))

    c_optimizer = tf.train.AdagradOptimizer(1).minimize(c_loss)

    tf.summary.scalar("Classification_loss", c_loss)
    tf.summary.scalar("Classification_accuracy", accuracy)
    c_summary = tf.summary.merge_all()

    c_initializer = tf.global_variables_initializer()
    # Log THIS graph. The previous code passed `graph` (the word2vec graph),
    # so the classifier graph was never written to the event file.
    c_writer = tf.summary.FileWriter(logs_path, c_graph)

In [195]:
# Session bound to the classifier graph; it stays open so that later cells
# (training, inspection, prediction) can reuse the same variables.
c_sess = tf.Session(graph=c_graph)
c_sess.run(c_initializer)
# Summary step counter; kept outside the training cell so that re-running
# that cell continues the numbering instead of restarting at 0.
rolling_epoch = 0

In [196]:
# Train the classifier. Despite the name, each "epoch" below is a single
# mini-batch step on c_batch_size randomly chosen training documents.
epochs = 100
c_batch_size = 60
for epoch in range(epochs):
    # Draw the batch without replacement from the training rows.
    idx = random.sample(range(len(X)), c_batch_size)
    feed_dict = {text_input: X[idx], ans_input: Y[idx], text_length: L[idx]}
    # One optimisation step; also fetch loss, summaries and batch accuracy.
    _, l, summary, a = c_sess.run([c_optimizer, c_loss, c_summary, accuracy], feed_dict = feed_dict)
    c_writer.add_summary(summary, rolling_epoch)
    rolling_epoch += 1
    # Printed columns: step, batch loss, batch accuracy.
    print(epoch, l, a)

0 2.08081 0.333333
1 0.777778 0.805556
2 0.866667 0.783333
3 0.955556 0.761111
4 0.866667 0.783333
5 0.977778 0.755556
6 0.688889 0.827778
7 0.688889 0.827778
8 0.822222 0.794444
9 0.688889 0.827778
10 0.777778 0.805556
11 0.955556 0.761111
12 0.822222 0.794444
13 0.822222 0.794444
14 0.688889 0.827778
15 0.8 0.8
16 0.8 0.8
17 0.777778 0.805556
18 1.02222 0.744444
19 0.711111 0.822222
20 0.755556 0.811111
21 0.777778 0.805556
22 0.666667 0.833333
23 0.733333 0.816667
24 0.8 0.8
25 0.911111 0.772222
26 0.8 0.8
27 0.866667 0.783333
28 0.733333 0.816667
29 0.711111 0.822222
30 0.844444 0.788889
31 0.755556 0.811111
32 0.755556 0.811111
33 0.6 0.85
34 0.688889 0.827778
35 0.977778 0.755556
36 0.711111 0.822222
37 0.844444 0.788889
38 0.911111 0.772222
39 0.844444 0.788889
40 0.888889 0.777778
41 0.6 0.85
42 0.777778 0.805556
43 0.666667 0.833333
44 0.777778 0.805556
45 0.888889 0.777778
46 0.622222 0.844444


KeyboardInterrupt: 

In [149]:
# Inspect the current embedding matrix (trainable rows plus the zero padding
# row). `embeddings` is built from variables only, so the fed placeholder
# values do not influence the result.
c_sess.run(embeddings, feed_dict = {text_input: X[1:10], ans_input: Y[1:10], text_length: L[1:10]})

array([[ 0.06508286, -0.05798488, -0.03311485, ...,  0.09455594,
        -0.0562804 , -0.1043473 ],
       [-0.0309836 ,  0.00573219,  0.07778752, ...,  0.07446673,
         0.07328863, -0.033524  ],
       [ 0.07694534,  0.035741  ,  0.11145366, ...,  0.02199232,
         0.02609545,  0.02331406],
       ..., 
       [ 0.00734618, -0.09801039,  0.08511987, ..., -0.09779865,
        -0.09917582,  0.01881039],
       [-0.00272191,  0.08712737, -0.13532345, ...,  0.04006055,
        -0.07651526,  0.09913132],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [179]:
# Norm of the difference between the current embeddings (without the padding
# row) and the pretrained ones, i.e. how far training has moved them.
np.linalg.norm(c_sess.run(embeddings)[:vocabulary_size]-final_embeddings)

0.06506066

In [197]:
# 200 random row indices. NOTE: despite the name they index X, the TRAINING
# matrix, so anything computed on them is not a held-out evaluation.
test_idx = random.sample(range(len(X)), 200)

In [198]:
# Raw network outputs for the sampled rows.
# NOTE: predict_proba is defined in a cell further down (the "Test" section);
# that cell has to be executed before this line can run.
Yp = predict_proba(X, L, test_idx)
def plot_roc(Yt, Yp):
    """Plot the ROC curve of one binary output component.

    Args:
        Yt: true labels for one component (passed to
            ``sklearn.metrics.roc_curve`` as ``y_true``).
        Yp: predicted scores for the same component.
    """
    fpr, tpr, _ = sklearn.metrics.roc_curve(Yt, Yp)
    roc_auc = sklearn.metrics.auc(fpr, tpr)
    # Both lines carry a label: plt.legend() below has nothing to display
    # when no artist is labelled.
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--', label='Chance')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

KeyboardInterrupt: 

In [None]:
# One ROC curve per output component (column of Yp), compared against the
# matching column of the targets for the sampled rows.
for i in range(Yp.shape[1]):
    plot_roc(Y[test_idx, i], Yp[:, i])

# Test

In [154]:
# Padded id matrix and per-document lengths for the test set.
X_test, L_test = get_X(data_test)

In [83]:
def predict_proba_small(X, L, idx):
    """Run the classifier on the rows ``idx`` in a single session call.

    Args:
        X: padded id matrix (indexed as ``X[idx]``).
        L: per-document lengths (indexed as ``L[idx]``).
        idx: row indices forming one batch.

    Returns:
        Value of the ``dense`` output tensor for the selected rows.

    NOTE(review): the dropout layers of the classifier graph stay active in
    this call unless their keep probability is overridden, so repeated calls
    on the same rows may return different values.
    """
    feed = {text_input: X[idx], text_length: L[idx]}
    return c_sess.run(dense, feed_dict=feed)
def predict_proba(X, L, idx, batch_sz = 10):
    """Run the classifier on the rows ``idx`` in small batches.

    Args:
        X: padded id matrix.
        L: per-document lengths.
        idx: iterable of row indices to predict.
        batch_sz: number of rows per ``predict_proba_small`` call
            (default 10, the value that used to be hard-coded).

    Returns:
        The per-batch outputs stacked vertically, in the order of ``idx``.

    Raises:
        ValueError: if ``batch_sz`` is smaller than 1.
    """
    if batch_sz < 1:
        raise ValueError("batch_sz must be a positive integer")
    idx = list(idx)
    # The first chunk is always evaluated, even for an empty `idx`, exactly
    # as before.
    chunks = [predict_proba_small(X, L, idx[0:batch_sz])]
    for start in range(batch_sz, len(idx), batch_sz):
        chunks.append(predict_proba_small(X, L, idx[start:start + batch_sz]))
    if len(chunks) == 1:
        return chunks[0]
    # Stack once at the end: stacking inside the loop copies the growing
    # result on every iteration.
    return np.vstack(chunks)
def cut(val, a = 0):
    """Threshold raw outputs into signs.

    Args:
        val: array of raw outputs.
        a: offset added before taking the sign; a positive ``a`` moves
            borderline values towards +1.

    Returns:
        ``np.sign(val + a)``: -1, 0 or +1 per element (0 only where
        ``val + a`` is exactly zero).
    """
    shifted = val + a
    return np.sign(shifted)

In [98]:
def get_accuracy(X, Y, idx, a = 0, lengths = None):
    """Component-wise accuracy of the thresholded predictions.

    Args:
        X: padded id matrix to predict on.
        Y: target matrix aligned with ``X``.
        idx: iterable of row indices to evaluate.
        a: offset passed to ``cut`` before taking the sign.
        lengths: per-document lengths belonging to ``X``. Defaults to the
            module-level ``L``, which matches the training matrix only --
            pass the matching lengths when evaluating any other ``X``.

    Returns:
        Fraction of output components whose sign equals the target.
    """
    if lengths is None:
        lengths = L
    # Materialise the indices so the same list is used for prediction and for
    # indexing Y.
    idx = list(idx)
    predicted = cut(predict_proba(X, lengths, idx), a)
    return np.mean(predicted == Y[idx])
def get_classes(Y):
    """Map rows of component vectors back to class labels.

    Each row is truncated to integers and looked up among the vectors of the
    module-level ``class_to_vector``. A row that equals no class vector (for
    example one containing a 0 produced by ``np.sign``) is assigned the class
    whose vector is closest in squared distance, with ties going to the
    smallest label. The result therefore always has exactly one label per
    input row; previously such rows were dropped, which shifted every later
    label against its input row.

    Args:
        Y: 2-D array-like, one component vector per row.

    Returns:
        ``np.ndarray`` of class labels, one per row of ``Y``.
    """
    lookup = {tuple(vec): label for label, vec in class_to_vector.items()}
    labels = sorted(class_to_vector)
    codes = np.array([class_to_vector[label] for label in labels])

    res = []
    for row in np.asarray(Y):
        # Built-in int: the np.int alias no longer exists in recent NumPy.
        key = np.array(row, dtype=int)
        label = lookup.get(tuple(key.tolist()))
        if label is None:
            distances = np.sum((codes - key) ** 2, axis=1)
            # argmin returns the first minimum, i.e. the smallest label.
            label = labels[int(np.argmin(distances))]
        res.append(label)
    return np.array(res)

In [155]:
# Mean of all target components for training rows 500..999; with -1/+1
# targets this shows how unbalanced the components are.
np.mean(Y[500:1000])

-0.64000000000000001

In [99]:
# Component-wise accuracy on training rows 500..999 with a +0.1 offset
# applied before thresholding.
# NOTE(review): these rows come from the training matrix; compare the result
# with the accuracy of always predicting -1 implied by the label mean.
get_accuracy(X, Y, range(500,1000), 0.1)

0.81999999999999995

In [183]:
# Raw network outputs for every test document.
Yp_proba = predict_proba(X_test, L_test, range(len(X_test)))

In [184]:
# Threshold the raw outputs at zero (no offset) to get -1/0/+1 components.
Yp_test = cut(Yp_proba, 0)

In [185]:
# Convert the thresholded component vectors back to class labels.
Y_cl = get_classes(Yp_test)

In [189]:
# Number of test predictions that are not class 1.
np.sum(Y_cl != 1)

29

In [190]:
# Write the predictions as CSV rows "id,class".
# Row i of Y_cl is paired with data_test[i], so both must have the same
# length; otherwise ids and classes would be silently mismatched.
if len(Y_cl) != len(data_test):
    raise ValueError("number of predictions does not match number of test items")
# `with` closes (and flushes) the file even if a write fails.
with open('output.csv', 'w') as f:
    f.write("id,class\n")
    for i in range(len(Y_cl)):
        # Labels are 1-based here; the file stores them 0-based.
        f.write("{},{}\n".format(data_test[i]['@id'], Y_cl[i] - 1))