In [None]:
import numpy as np
import os
from random import shuffle
import re
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import collections
import time

%matplotlib inline

In [None]:
import urllib.request
import zipfile
import lxml.etree

In [None]:
# Fetch the TED archive from WIT3 only when no local copy exists yet
archive_name = 'ted_en-20160408.zip'
archive_url = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"
if not os.path.isfile(archive_name):
    urllib.request.urlretrieve(archive_url, filename=archive_name)

In [None]:
# Parse the XML file stored inside the zip archive, then pull out two
# parallel lists of strings: the keyword string and the transcript of each talk.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as archive:
    xml_tree = lxml.etree.parse(archive.open('ted_en-20160408.xml', 'r'))
labels = xml_tree.xpath('//head/keywords/text()')
texts = xml_tree.xpath('//content/text()')
# The parsed tree is not needed once the strings are extracted.
del xml_tree

In [None]:
def extract_category(labels):
    """Encode which of the three TED themes occur in a talk's keywords.

    The result is a three-character code whose positions stand for
    Technology, Entertainment and Design (in that order). A position holds
    the theme's capital initial when the theme name occurs in ``labels``
    and ``o`` otherwise.

    Args:
        labels: Keywords of one talk. Presence is checked with ``in``, so
            for a string this is a substring match.

    Returns:
        One of "TED", "oED", "ToD", "TEo", "oEo", "Too", "ooD", "ooo".
    """
    present = (
        'technology' in labels,
        'entertainment' in labels,
        'design' in labels,
    )
    # Keyed by (technology, entertainment, design) presence flags.
    code_by_presence = {
        (True, True, True): "TED",
        (False, True, True): "oED",
        (True, False, True): "ToD",
        (True, True, False): "TEo",
        (False, True, False): "oEo",
        (True, False, False): "Too",
        (False, False, True): "ooD",
        (False, False, False): "ooo",
    }
    return code_by_presence[present]

# One category code per talk, in the same order as the keyword strings
categories = list(map(extract_category, labels))

# Build Word Embedding

## Tokenize the corpus and count word frequencies

In [None]:
# all_tokens:    flat list of every token in the corpus (used for word counts)
# all_sentences: one list of tokens per non-empty line (Word2Vec training input)
all_tokens = []
all_sentences = []

for talk in texts:
    for sentence in talk.split('\n'):
        # Lower-case, then treat every run of non-alphanumerics as a separator
        tokens = re.sub(r"[^a-z0-9]+", " ", sentence.lower()).split()
        if not tokens:
            # Blank or punctuation-only line: nothing to learn from
            continue
        all_tokens.extend(tokens)
        # Store the token list, not the raw string: Word2Vec iterates over
        # each sentence, and iterating a str yields single characters.
        all_sentences.append(tokens)

In [None]:
# Map every distinct token to the number of times it occurs in the corpus.
# Converted back to a plain dict; keys keep first-occurrence order.
all_words = dict(collections.Counter(all_tokens))

In [None]:
from gensim.models import Word2Vec

In [None]:
from operator import itemgetter, attrgetter, methodcaller
# The 1000 most frequent (word, count) pairs, ordered by ascending count
top1000_by_count = sorted(all_words.items(), key=itemgetter(1))[-1000:]
words_ted_top1000 = [word for word, _ in top1000_by_count]
counts_ted_top1000 = [count for _, count in top1000_by_count]

In [None]:
# Sanity check: show the first five tokens of the corpus
print(all_tokens[0:5])

In [None]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Configure bokeh so that show() renders plots inline in the notebook
output_notebook()

In [None]:
# Density-normalised histogram of the top-1000 word counts in 100 bins.
# Only `density` is passed: the deprecated `normed` alias was redundant here
# and is rejected by current NumPy releases.
hist, edges = np.histogram(counts_ted_top1000, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Top-1000 words distribution")
# One rectangle per bin: edges[i]..edges[i+1] wide, hist[i] tall
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

## Train Word2Vec Model

In [None]:
from gensim.models import Word2Vec

# Dimensionality of each learned word vector
embedding_size = 500

# Train word embeddings on the TED corpus.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 calls it `vector_size`.
model_ted = Word2Vec(sentences=all_sentences, size=embedding_size)

## Train neural classification model

Define the neural model. The assignment assumes that we use a word embedding to map the input text to its label.

We will use the "bag of means" method to represent the input text.

Bag-of-means representation, where $X_i$ is the embedding vector of the $i$-th word of the input text and $N$ is the number of words:

$$x = \frac{1}{N}\sum_{i=1}^{N} X_i$$

## Construct dataset

In [None]:
# Fixed ordering of the eight category codes; a code's position in this list
# is the index of the 1 in its one-hot label vector.
classes = ["TED", "oED", "ToD", "TEo", "oEo", "Too", "ooD", "ooo"]

# One one-hot row per talk, aligned with `categories`
identity = np.eye(len(classes))
labels_raw = [identity[classes.index(category)].copy() for category in categories]

non_alnum = re.compile(r"[^a-z0-9]+")

# Bag-of-means encoding: each talk becomes the average of the embeddings of
# all its tokens that the Word2Vec vocabulary knows. Talks without a single
# known token are dropped, together with their label.
# NOTE: this rebinds `labels` from the raw keyword strings to one-hot vectors.
encoded_talks = []
labels = []
for talk_idx, talk in enumerate(texts):
    vector_sum = np.zeros(embedding_size)
    known_tokens = 0
    for line in talk.split('\n'):
        for word in non_alnum.sub(" ", line.lower()).split():
            if word in model_ted.wv:
                vector_sum += model_ted.wv[word]
                known_tokens += 1
    if known_tokens > 0:
        encoded_talks.append(vector_sum / known_tokens)
        labels.append(labels_raw[talk_idx])

## Train Model

In [None]:
# The first `split_at` encoded talks are used for training, the rest for testing
split_at = 1700

x_train, x_test = np.array(encoded_talks[:split_at]), np.array(encoded_talks[split_at:])
y_train, y_test = np.array(labels[:split_at]), np.array(labels[split_at:])

# Report the shapes: train features/labels first, then test features/labels
for split_array in (x_train, y_train, x_test, y_test):
    print(split_array.shape)


In [None]:
import tensorflow
from tensorflow.keras import backend as K

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout
from tensorflow.keras.optimizers import SGD

# Thread count handed to TensorFlow for both intra-op and inter-op parallelism
num_cores = 16

# Device selection flags. CPU takes precedence when both are set; when
# neither is set the session falls back to CPU only (previously this left
# num_CPU / num_GPU undefined and raised a NameError).
CPU = False
GPU = True

num_CPU = 1
num_GPU = 1 if (GPU and not CPU) else 0

# NOTE: ConfigProto, Session and K.set_session are TensorFlow 1.x APIs;
# under TensorFlow 2.x the equivalents live under tensorflow.compat.v1.
config = tensorflow.ConfigProto(
    intra_op_parallelism_threads=num_cores,
    inter_op_parallelism_threads=num_cores,
    allow_soft_placement=True,
    device_count={'CPU': num_CPU, 'GPU': num_GPU},
)
session = tensorflow.Session(config=config)
K.set_session(session)

# Feed-forward classifier: averaged talk embedding -> 128 -> 64 -> softmax
# over the category codes.
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=embedding_size))
model.add(Dense(64, activation='relu'))
# Output width follows the class list instead of a hard-coded 8
model.add(Dense(len(classes), activation='softmax'))

# One-hot targets, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=1200, batch_size=512)


In [None]:
# Loss and accuracy on the held-out talks
model.evaluate(x_test, y_test)