In [None]:
import numpy as np
import os
from random import shuffle
import re

In [None]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Direct Bokeh output to the notebook so that later show(p) calls render inline.
output_notebook()

In [None]:
import urllib.request
import zipfile
#from lxml import etree
import lxml.etree

In [None]:
# Fetch the TED archive (about 75MB, so this may take a minute) unless a local copy already exists.
ted_zip_present = os.path.isfile('ted_en-20160408.zip')
if not ted_zip_present:
    urllib.request.urlretrieve(
        "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip",
        filename="ted_en-20160408.zip",
    )

In [None]:
# For now, we're only interested in the subtitle text, so let's extract that from the XML.
# Use an already-extracted XML file when one is present; otherwise read the XML member
# straight out of the downloaded archive so the notebook also runs right after the download.
if os.path.isfile('ted_en-20160408.xml'):
    doc = lxml.etree.parse('ted_en-20160408.xml')
else:
    with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
        with z.open('ted_en-20160408.xml', 'r') as xml_file:
            doc = lxml.etree.parse(xml_file)
# Concatenate the text of every <content> element, one element per line.
input_text = '\n'.join(doc.xpath('//content/text()'))
# Release the parsed tree once the text has been extracted.
del doc

In [None]:
# Show the raw text surrounding a known speaker-prefixed line (20 chars before, 150 after its start).
marker = "Hyowon Gweon: See this?"
i = input_text.find(marker)
input_text[i - 20:i + 150]

In [None]:
# Drop every "(...)" span (an opening paren up to the next closing paren) from the text.
parens_pattern = re.compile(r'\([^)]*\)')
input_text_noparens = parens_pattern.sub('', input_text)

In [None]:
# Show the same window of text after the parenthesised spans have been removed.
marker = "Hyowon Gweon: See this?"
i = input_text_noparens.find(marker)
input_text_noparens[i - 20:i + 150]

In [None]:
# Split the text into sentence strings. For each line, an optional leading "<up to 20 chars>:"
# prefix is discarded, and the remainder is split on '.'; empty pieces are dropped.
speaker_prefix = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    spoken = speaker_prefix.match(line).group('postcolon')
    for sent in spoken.split('.'):
        if sent:
            sentences_strings_ted.append(sent)

# Uncomment if you need to save some RAM: these strings are about 50MB.
# del input_text, input_text_noparens

# Let's view the first few:
sentences_strings_ted[:5]

In [None]:
# Tokenise each sentence: lowercase it, turn every run of characters outside a-z/0-9
# into a single space, and split on whitespace.
non_alnum = re.compile(r"[^a-z0-9]+")
sentences_ted = [
    non_alnum.sub(" ", sent_str.lower()).split()
    for sent_str in sentences_strings_ted
]

In [None]:
# Number of tokenised TED sentences.
len(sentences_ted)

In [None]:
# Print the first two tokenised sentences as a sanity check.
for idx in (0, 1):
    print(sentences_ted[idx])

In [None]:
# Count how often each token occurs across the tokenised TED sentences.
counts_ted = {}
for sentence in sentences_ted:
    for word in sentence:
        counts_ted[word] = counts_ted.get(word, 0) + 1

# Keep only the 1000 most frequent words, most frequent first.
words_ted_top1000 = sorted(counts_ted, reverse=True, key=counts_ted.__getitem__)[:1000]
# Same list under the `words_top_*` name, matching the naming used for the wiki corpus.
words_top_ted = words_ted_top1000
# Occurrence counts, in the same order as words_ted_top1000.
counts_ted_top1000 = [counts_ted[word] for word in words_ted_top1000]
print(counts_ted_top1000[:3])
    

In [None]:
# Density histogram (100 bins) of the top-1000 word counts.
# Only `density` is passed: the old `normed` keyword is deprecated, has been removed from
# NumPy, and was ignored anyway whenever `density` was given.
hist, edges = np.histogram(counts_ted_top1000, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Top-1000 words distribution")
# One bar per bin: bin edges give left/right, the density gives the height.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train word2vec on the tokenised TED sentences.
# NOTE(review): `size` looks like the pre-4.0 gensim keyword (later renamed `vector_size`) —
# confirm against the installed gensim version.
model_ted = Word2Vec(
    sentences_ted,
    size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
# Nearest neighbours of "man" in the TED embedding space; queried through the model's
# KeyedVectors (`.wv`), since calling most_similar on the model itself is deprecated.
model_ted.wv.most_similar("man")

In [None]:
# Nearest neighbours of "computer" in the TED embedding space; queried through the model's
# KeyedVectors (`.wv`), since calling most_similar on the model itself is deprecated.
model_ted.wv.most_similar("computer")

In [None]:
# Nearest neighbours of "horse" in the TED embedding space; queried through the model's
# KeyedVectors (`.wv`), since calling most_similar on the model itself is deprecated.
model_ted.wv.most_similar("horse")

In [None]:
# t-SNE input: look up the embedding vector of each of the most frequent TED words.
# Vectors are read from the model's KeyedVectors (`.wv`) rather than by indexing the model itself.
words_top_vec_ted = model_ted.wv[words_ted_top1000]

In [None]:
from sklearn.manifold import TSNE
# Project the word vectors down to two dimensions with t-SNE; the fixed random_state
# keeps the layout repeatable between runs.
tsne = TSNE(random_state=0, n_components=2)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)

In [None]:
# Scatter plot of the 2-D t-SNE projection of the most frequent TED words, each point labelled.
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

# The labels must be the same word list whose vectors were fed to t-SNE
# (words_ted_top1000), so that row k of the projection is labelled with word k.
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_ted_top1000))

p.scatter(x="x1", y="x2", size=8, source=source)

# Draw each word slightly above its point.
labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [None]:
# Wiki learnt representations
#if not os.path.isfile('wikitext-103-raw-v1.zip'):
#    urllib.request.urlretrieve("https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip", filename="wikitext-103-raw-v1.zip")

In [None]:
# Read the WikiText training split out of the archive and decode it as UTF-8.
with zipfile.ZipFile('wikitext.zip', 'r') as z:
    # Open the member in its own `with` so its handle is closed too, not just the archive.
    with z.open('wikitext-103-raw/wiki.train.raw', 'r') as raw_file:
        input_text = raw_file.read().decode('utf-8') # Thanks Robert Bastian
#input_text = str(open('wikitext/wikitext-103-raw/wiki.train.raw', 'r').read(), encoding='utf-8')

In [None]:
# Build the wiki corpus as a list of token lists (one list of lowercase words per sentence),
# which is the shape the word-counting cell and Word2Vec both iterate over.
parens_pattern_wiki = re.compile(r'\([^)]*\)')
non_letters_wiki = re.compile(r"[^a-z]")

sentences_wiki = []
for line in input_text.split('\n'):
    # Remove parenthesised spans first: once non-letters are replaced by spaces the
    # parentheses are gone and this pattern could never match.
    line = parens_pattern_wiki.sub('', line)
    for sent in line.split('.'):
        # Lowercase, blank out everything that is not a-z, then split into words.
        tokens = non_letters_wiki.sub(" ", sent.lower()).split()
        # Keep only sentences with at least five words.
        if len(tokens) >= 5:
            sentences_wiki.append(tokens)

In [None]:
# Keep a random fifth of the sentences: shuffle in place, then take the leading 1/5.
# The sizes before and after are printed for reference.
shuffle(sentences_wiki)
n_sentences_wiki = len(sentences_wiki)
print(n_sentences_wiki)
sentences_wiki = sentences_wiki[:int(n_sentences_wiki / 5)]
print(len(sentences_wiki))


In [None]:
# Count how often each item of every sentence occurs in the sampled wiki corpus.
counts_wiki = {}
for sentence in sentences_wiki:
    for word in sentence:
        counts_wiki[word] = counts_wiki.get(word, 0) + 1

# Keep only the 1000 most frequent words, most frequent first.
words_top_wiki = sorted(counts_wiki, reverse=True, key=counts_wiki.__getitem__)[:1000]
# Occurrence counts in the same order as words_top_wiki, built exactly once
# (the counts were previously appended a second time, doubling the list).
counts_wiki_top1000 = [counts_wiki[word] for word in words_top_wiki]
print(counts_wiki_top1000[:3])

In [None]:
# Density histogram (100 bins) of the top-1000 wiki word counts.
# Only `density` is passed: the old `normed` keyword is deprecated, has been removed from
# NumPy, and was ignored anyway whenever `density` was given.
hist, edges = np.histogram(counts_wiki_top1000, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Top-1000 words distribution")
# One bar per bin: bin edges give left/right, the density gives the height.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [None]:
# Train word2vec on the sampled wiki sentences.
# The result is stored as model_wiki, the name the following cells query, so the
# TED model in model_ted is left intact.
model_wiki = Word2Vec(sentences_wiki, size=100, window=5, min_count=5, workers=4)

In [None]:
# Nearest neighbours of "man" in the wiki embedding space; queried through the model's
# KeyedVectors (`.wv`), since calling most_similar on the model itself is deprecated.
model_wiki.wv.most_similar("man")

In [None]:
# Nearest neighbours of "computer" in the wiki embedding space; queried through the model's
# KeyedVectors (`.wv`), since calling most_similar on the model itself is deprecated.
model_wiki.wv.most_similar("computer")

In [None]:
# Nearest neighbours of "horse" in the wiki embedding space; queried through the model's
# KeyedVectors (`.wv`), since calling most_similar on the model itself is deprecated.
model_wiki.wv.most_similar("horse")

In [None]:
# This assumes words_top_wiki is a list of strings, the top 1000 words.
# Vectors are read from the model's KeyedVectors (`.wv`) rather than by indexing the model itself.
words_top_vec_wiki = model_wiki.wv[words_top_wiki]

# Project the word vectors to 2-D; the fixed random_state keeps the layout repeatable.
tsne = TSNE(n_components=2, random_state=0)
words_top_wiki_tsne = tsne.fit_transform(words_top_vec_wiki)

In [None]:
# Scatter plot of the 2-D t-SNE projection of the wiki words, each point labelled with its word.
tsne_points_wiki = dict(
    x1=words_top_wiki_tsne[:, 0],
    x2=words_top_wiki_tsne[:, 1],
    names=words_top_wiki,
)
source = ColumnDataSource(data=tsne_points_wiki)

p = figure(
    title="word2vec T-SNE for most common words",
    toolbar_location="above",
    tools="pan,wheel_zoom,reset,save",
)
p.scatter(x="x1", y="x2", size=8, source=source)

# Word labels, centred and drawn slightly above each point.
labels = LabelSet(
    x="x1", y="x2", text="names", y_offset=6,
    text_font_size="8pt", text_color="#555555",
    source=source, text_align='center',
)
p.add_layout(labels)

show(p)