# DL in NLP

## Task 2: Classifying TED talks

Sergei Volodin, senior undergraduate student at MIPT

In [1]:
%matplotlib inline
import tensorflow as tf
import json
import numpy as np
import pandas as pd
import collections
from __future__ import print_function
import re
import random
import math
from sklearn.manifold import TSNE
from matplotlib import pylab
from transliterate import translit
from six.moves import range
import sklearn.metrics
import matplotlib.pyplot as plt
from six.moves import cPickle as pickle
from tqdm import tqdm

In [2]:
def print_unicode(ent):
    """Print the repr of `ent` with its escape sequences decoded.

    Makes non-ASCII text nested inside lists/tuples readable. Relies on the
    Python 2 `str.decode` method.
    """
    escaped = repr(ent)
    readable = escaped.decode("unicode-escape")
    print(readable)

In [3]:
filename = 'ted_ru-20160408.json'


def _load_json(path):
    """Read the file at `path` and parse its contents as JSON.

    The file is opened in a `with` block so the handle is closed as soon as
    the contents have been read instead of being left open.
    """
    with open(path, 'r') as json_file:
        # The second positional argument is the `encoding` parameter of the
        # Python 2 `json.loads`; it is passed exactly as before.
        return json.loads(json_file.read(), 'unicode-escape')


# The two splits are stored as <filename>.test and <filename>.train.
data_test = _load_json(filename + '.test')
data_train = _load_json(filename + '.train')

Plan:

1. Removing non-letters
2. Obtaining words, training word2vec CBOW model
3. Running RNN on document
4. Classifying based on final output

# Building dictionary

In [4]:
# Upper bound on the number of word indices produced by build_dataset
# (the 'UNK' slot plus the vocabulary_size - 1 most common words).
vocabulary_size = 100000
# NOTE(review): embedding_size and context_size are not referenced by any cell
# visible here; presumably meant for the word2vec step of the plan — confirm.
embedding_size = 256
context_size = 4
# Matches every character that is not a Cyrillic or Latin letter.
words_regex = re.compile(ur'[^а-яА-ЯёЁa-zA-Z]')

def str_to_words(s):
    """Split `s` into lowercase words, treating every non-letter as a separator."""
    letters_only = words_regex.sub(' ', s)
    lowered = letters_only.lower()
    return lowered.split()

def data_to_str(data):
    """Join the 'content' field of every record in `data` into one string.

    Args:
        data: iterable of dicts, each holding a 'content' string.

    Returns:
        All contents concatenated, separated by single spaces.
    """
    # Read from the argument rather than the global data_train, so the
    # function works for whichever split is passed in.
    return ' '.join(record['content'] for record in data)

def build_dataset(words, vocabulary_size):
    """Index a token sequence against its most frequent words.

    Index 0 is reserved for 'UNK'; the `vocabulary_size - 1` most common
    words receive the following indices, most frequent first.

    Returns:
        data: list with the index of every token in `words` (0 for tokens
            that are not in the vocabulary).
        count: list starting with ['UNK', <number of out-of-vocabulary
            tokens>] followed by (word, occurrences) pairs.
        dictionary: dict mapping word -> index.
    """
    unk_entry = ['UNK', -1]
    count = [unk_entry] + collections.Counter(words).most_common(vocabulary_size - 1)

    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    data = []
    unknown_total = 0
    for token in words:
        index = dictionary.get(token)
        if index is None:
            # Out-of-vocabulary token: map it to the 'UNK' slot.
            index = 0
            unknown_total += 1
        data.append(index)

    # unk_entry is the same object as count[0], so this fills in the placeholder.
    unk_entry[1] = unknown_total
    return data, count, dictionary

In [5]:
# The vocabulary is built from the training talks only; the term that would add
# the test-set text is commented out.
all_text = data_to_str(data_train)# + data_to_str(data_test)
words = str_to_words(all_text)
# data: the corpus as word indices; count: word frequencies; dictionary: word -> index.
data, count, dictionary = build_dataset(words, vocabulary_size)

In [6]:
# Sanity check: the first vocabulary entries and the first encoded tokens.
print_unicode(count[:5])
print_unicode(data[:10])

[['UNK', 50679], (u'и', 92090), (u'в', 83510), (u'что', 57304), (u'я', 47139)]
[4, 49, 42333, 12866, 7686, 21, 4461, 740, 105, 31181]


# Preparing dataset

In [11]:
# NOTE(review): pad_word is not referenced by any cell visible here; presumably
# an index just past the vocabulary reserved for padding — confirm.
pad_word = vocabulary_size
# Depth of the one-hot labels and width of the classifier's output layer.
num_classes = 8

In [12]:
def str_to_idx(s):
    """Encode a text as a list of vocabulary indices; unknown words map to 0."""
    return [dictionary[token] if token in dictionary else 0
            for token in str_to_words(s)]
def get_Y(data):
    """Return the class labels of `data`, shifted down by one, as a numpy array.

    Args:
        data: sequence of dicts with an integer 'class' field.

    Returns:
        1-D numpy array holding `record['class'] - 1` for every record.
    """
    # Read from the argument rather than the global data_train, so the labels
    # always match the records that were passed in. Building a list (not a
    # lazy map object) also gives np.array a real sequence to convert.
    return np.array([record['class'] - 1 for record in data])
def get_X(data):
    """Encode the 'content' of every record and report the encoded lengths.

    Returns:
        A pair of numpy arrays: the index lists produced by str_to_idx (one
        per record) and the number of indices in each of them.
    """
    encoded = [str_to_idx(record['content']) for record in data]
    lengths = [len(indices) for indices in encoded]
    return np.array(encoded), np.array(lengths)

In [13]:
# Encoded training talks and the number of tokens in each.
X, L = get_X(data_train)

In [14]:
# Training labels: the 'class' field of each talk minus one.
Y = get_Y(data_train)

In [15]:
# Test talks, encoded with the same `dictionary` as the training talks.
Xtest, Ltest = get_X(data_test)

In [24]:
def get_wf(X):
    """Build a bag-of-words matrix from encoded documents.

    Args:
        X: sequence of documents, each a sequence of word indices smaller
            than vocabulary_size.

    Returns:
        Float array of shape (len(X), vocabulary_size); row i holds how often
        each index occurs in X[i], divided by the number of documents.
    """
    N = len(X)
    res = np.zeros((N, vocabulary_size))
    # A single progress bar over documents keeps the cell output readable.
    for i in tqdm(range(N)):
        for word_idx in X[i]:
            res[i][word_idx] += 1
        # NOTE(review): counts are scaled by the number of documents N, not by
        # the length of the document; kept unchanged so feature values stay
        # the same — confirm whether a per-document frequency was intended.
        res[i] /= N
    return res

In [25]:
# Bag-of-words features for the training talks.
Xcount = get_wf(X)

In [None]:
# Bag-of-words features for the test talks.
XTestCount = get_wf(Xtest)

In [28]:
Xcount

array([[ 0.02439024,  0.0652439 ,  0.07317073, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00609756,  0.00365854,  0.00670732, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0054878 ,  0.01463415,  0.00670732, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.03414634,  0.05243902,  0.03902439, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01280488,  0.03109756,  0.04329268, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0195122 ,  0.04634146,  0.04268293, ...,  0.        ,
         0.        ,  0.        ]])

In [95]:
# NOTE(review): absolute, machine-specific TensorBoard directory — consider
# making it configurable before sharing the notebook.
logs_path = '/home/sergei/tensorboard/cl/06'
c_graph = tf.Graph()
with c_graph.as_default():
    # Bag-of-words features and integer class labels for a batch of talks.
    inp = tf.placeholder(tf.float32, shape=[None, vocabulary_size])
    ans = tf.placeholder(tf.int32, shape=[None])

    # Keep probability for dropout on the input features; feed 1 to disable it.
    keep_prob = tf.placeholder(tf.float32)

    # One hidden ReLU layer of 128 units followed by a linear output layer.
    labels = tf.one_hot(ans, num_classes)
    dense = tf.layers.dense(inputs = tf.nn.dropout(inp, keep_prob), units=128, activation=tf.nn.relu)
    logits = tf.contrib.layers.fully_connected(dense, num_classes, activation_fn = None)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = labels, logits = logits))
    optimizer = tf.train.RMSPropOptimizer(0.05).minimize(loss)

    # Two scalar summaries of the same loss tensor, written under different
    # tags depending on which data is fed. They are deliberately NOT combined
    # with tf.summary.merge_all(): a merged op would also emit the
    # "validation_loss" tag on every training batch and pollute that curve.
    summary = tf.summary.scalar("Classification_loss", loss)
    v_summary = tf.summary.scalar("validation_loss", loss)

    # Predicted class index and class probabilities for each input row.
    answer = tf.argmax(logits, 1)
    probability = tf.nn.softmax(logits)

    initializer = tf.global_variables_initializer()
    writer = tf.summary.FileWriter(logs_path, c_graph)

In [101]:
# Release the previous session, if there is one, before a new session is
# created; the guard keeps this cell from raising NameError on a fresh kernel.
if 'c_sess' in globals():
    c_sess.close()

In [96]:
# New session on the classifier graph with freshly initialised variables.
c_sess = tf.Session(graph=c_graph)
c_sess.run(initializer)
# Global step used for the TensorBoard summaries; the training cell increments
# it and does not reset it.
rolling_epoch = 0

In [97]:
# Number of mini-batch steps to run (each iteration below processes one batch,
# not one pass over the data).
epochs = 10000
batch_size = 128
start_idx = 0
# Hold out 100 randomly chosen training talks for validation.
# NOTE(review): no random seed is set, so the split differs between runs.
idx_valid = random.sample(range(len(X)), 100)
idx_train = np.setdiff1d(np.arange(X.shape[0]), idx_valid)
X1 = Xcount[idx_train]
Y1 = Y[idx_train]
N = len(X1)
for i in range(epochs):
    # Wrap around to the first batch once all training rows have been used.
    if start_idx >= N:
        print("loop")
        start_idx = 0
    # Batches are consecutive rows in a fixed order (no shuffling); the last
    # batch of a pass may be shorter than batch_size.
    idx = range(start_idx, min(start_idx + batch_size, N))
    feed_dict = {inp: X1[idx], ans: Y1[idx], keep_prob: 0.5}
    _, l, s = c_sess.run([optimizer, loss, summary], feed_dict = feed_dict)
    
    writer.add_summary(s, rolling_epoch)
    
    # Validation loss on the held-out rows after every step, with dropout disabled.
    feed_dict = {inp: Xcount[idx_valid], ans: Y[idx_valid], keep_prob: 1}
    vl, vs = c_sess.run([loss, v_summary], feed_dict = feed_dict)
    
    writer.add_summary(vs, rolling_epoch)
    
    rolling_epoch += 1
    # Printed columns: step, validation loss, training-batch loss.
    print(rolling_epoch, vl, l)
    start_idx += batch_size

1 2.06454 2.07944
2 2.04236 2.06299
3 2.01999 2.04115
4 2.00176 2.02971
5 1.97915 2.00257
6 1.95337 1.96949
7 1.92561 1.93242
8 1.89747 1.89774
9 1.87334 1.89245
10 1.84859 1.86235
11 1.82428 1.84147
12 1.79696 1.78588
13 1.7569 1.61201
loop
14 1.73567 1.75736
15 1.71184 1.69661
16 1.69077 1.69488
17 1.6768 1.76987
18 1.65826 1.68081
19 1.63746 1.608
20 1.61626 1.54434
21 1.59775 1.51012
22 1.58517 1.57835
23 1.57374 1.54784
24 1.56385 1.55066
25 1.55471 1.46587
26 1.54114 1.08486
loop
27 1.53631 1.53421
28 1.53228 1.44545
29 1.52963 1.49893
30 1.52638 1.66368
31 1.52329 1.5235
32 1.52018 1.44012
33 1.51784 1.36898
34 1.51802 1.35743
35 1.51325 1.4728
36 1.51145 1.44248
37 1.50879 1.46331
38 1.51058 1.39857
39 1.53397 0.897994
loop
40 1.51559 1.50346
41 1.51438 1.39483
42 1.51001 1.47333
43 1.51129 1.62377
44 1.50781 1.48356
45 1.50504 1.41712
46 1.50805 1.32486
47 1.51197 1.3318
48 1.50261 1.43468
49 1.50616 1.40846
50 1.50486 1.42987
51 1.51103 1.3984
52 1.59167 0.85988
loop
53 1.531

KeyboardInterrupt: 

In [98]:
# Predict a class index for every test talk; keep_prob 1 disables dropout.
feed_dict = {inp: XTestCount, keep_prob: 1}
a = c_sess.run(answer, feed_dict = feed_dict)

In [99]:
a

array([0, 0, 5, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 1, 1, 1, 5, 0, 0, 3, 0,
       0, 0, 1, 0, 5, 1, 0, 3, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0,
       5, 0, 3, 0, 0, 1, 0, 0, 1, 0, 3, 0, 0, 5, 0, 0, 0, 0, 0, 1, 0, 0, 5,
       0, 2, 3, 0, 2, 0, 0, 0, 0, 1, 5, 0, 1, 3, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 2, 0, 5, 0, 0, 2, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 2, 3, 0, 0, 0, 3, 0, 0, 0, 1, 1, 0, 3, 0, 0,
       0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 3, 5, 0, 0, 1, 1, 0, 0, 5, 0, 1, 0, 0, 0, 0, 2, 5, 0, 0, 0,
       0, 0, 2, 0, 0, 1, 0, 0, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
       1, 5, 1, 3, 1, 0, 1, 3, 0, 0, 0, 5, 1, 0, 5, 0, 0, 2, 3, 2, 0, 0, 0,
       1, 5, 5, 0, 1, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 3, 1, 0, 5, 1, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 1, 0, 3, 0, 0, 0, 5, 2,
       0, 0, 1, 1, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 2, 0, 0, 0, 2,
       3, 0,

In [100]:
# Write one "id,class" row per test talk. The `with` block closes the file
# even if formatting one of the rows raises.
with open('output.csv', 'w') as f:
    f.write("id,class\n")
    for i in range(len(Xtest)):
        # NOTE(review): a[i] is the network's 0-based class index, while the
        # training labels had 1 subtracted from the 'class' field — confirm
        # whether the expected output needs the original numbering (a[i] + 1).
        f.write("{},{}\n".format(data_test[i]['@id'], int(a[i])))