In [43]:
import numpy as np
import os
from random import shuffle
import re

In [44]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [45]:
import urllib.request
import zipfile
#from lxml import etree
import lxml.etree

In [46]:
# Download the dataset 
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [47]:
# For now, we're only interested in the subtitle text, so let's extract that from the XML:

with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
input_text = '\n'.join(doc.xpath('//content/text()'))
del doc

In [48]:
i = input_text.find("Hyowon Gweon: See this?")
input_text[i-20:i+150]

' baby does.\n(Video) Hyowon Gweon: See this? (Ball squeaks) Did you see that? (Ball squeaks) Cool. See this one? (Ball squeaks) Wow.\nLaura Schulz: Told you. (Laughs)\n(Vide'

In [49]:
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

In [50]:
i = input_text_noparens.find("Hyowon Gweon: See this?")
input_text_noparens[i-20:i+150]

"hat the baby does.\n Hyowon Gweon: See this?  Did you see that?  Cool. See this one?  Wow.\nLaura Schulz: Told you. \n HG: See this one?  Hey Clara, this one's for you. You "

In [51]:
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

In [52]:
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

# view the first few:
sentences_strings_ted[:5]

["Here are two reasons companies fail: they only do more of the same, or they only do what's new",
 'To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation',
 ' Both are necessary, but it can be too much of a good thing',
 'Consider Facit',
 " I'm actually old enough to remember them"]

In [53]:
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)

In [54]:
len(sentences_ted)

266694

In [55]:
print(sentences_ted[0])
print(sentences_ted[1])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']


In [56]:
import collections
c=collections.Counter()
for s in sentences_ted:
    for word in s:
        c[word]+=1

words_top_ted=list(k[0] for k in c.most_common(1000))
counts_ted_top1000=list(k[1] for k in c.most_common(1000))


In [57]:
hist, edges = np.histogram(counts_ted_top1000, density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Top-1000 words distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [94]:
from gensim.models import Word2Vec

In [61]:
ted_model = Word2Vec(sentences_ted, size=100, window=5, min_count=5, workers=4)

In [95]:
ted_model.save('ted_model')

In [96]:
ted_model=Word2Vec.load('ted_model')

In [97]:
ted_model.most_similar("man")

[('woman', 0.8307706117630005),
 ('guy', 0.8081215620040894),
 ('lady', 0.7646480798721313),
 ('boy', 0.7446976900100708),
 ('girl', 0.7293294668197632),
 ('gentleman', 0.7206354737281799),
 ('soldier', 0.6900509595870972),
 ('poet', 0.6793301105499268),
 ('david', 0.6725543737411499),
 ('surgeon', 0.6675092577934265)]

In [67]:
ted_model.most_similar("computer")


[('machine', 0.7482309341430664),
 ('software', 0.7177109718322754),
 ('robot', 0.6981737613677979),
 ('device', 0.6796837449073792),
 ('camera', 0.6488869786262512),
 ('3d', 0.6460374593734741),
 ('mechanical', 0.641261875629425),
 ('code', 0.6328430771827698),
 ('interface', 0.6326004266738892),
 ('video', 0.6314619779586792)]

In [68]:
ted_model.most_similar('house')

[('seat', 0.8153576254844666),
 ('office', 0.7878681421279907),
 ('door', 0.7811933755874634),
 ('shop', 0.7807976603507996),
 ('apartment', 0.7781676054000854),
 ('hut', 0.7697461843490601),
 ('chair', 0.767337441444397),
 ('kitchen', 0.7658503651618958),
 ('bed', 0.7632778286933899),
 ('town', 0.75962233543396)]

In [69]:
# This assumes words_top_ted is a list of strings, the top 1000 words
words_top_vec_ted = model_ted[words_top_ted]

In [70]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)

In [71]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_top_ted))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [72]:
if not os.path.isfile('wikitext-103-raw-v1.zip'):
    urllib.request.urlretrieve("https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip", filename="wikitext-103-raw-v1.zip")

In [73]:
with zipfile.ZipFile('wikitext-103-raw-v1.zip', 'r') as z:
    input_text = str(z.open('wikitext-103-raw/wiki.train.raw', 'r').read(), encoding='utf-8') # Thanks Robert Bastian

In [74]:
sentences_wiki = []
for line in input_text.split('\n'):
    s = [x for x in line.split('.') if x and len(x.split()) >= 5]
    sentences_wiki.extend(s)
    
for s_i in range(len(sentences_wiki)):
    sentences_wiki[s_i] = re.sub("[^a-z]", " ", sentences_wiki[s_i].lower())
    sentences_wiki[s_i] = re.sub(r'\([^)]*\)', '', sentences_wiki[s_i])
del input_text

In [75]:
# sample 1/5 of the data
shuffle(sentences_wiki)
print(len(sentences_wiki))
sentences_wiki = sentences_wiki[:int(len(sentences_wiki)/5)]
print(len(sentences_wiki))

4267112
853422


In [76]:
import collections
c=collections.Counter()
for s in sentences_ted:
    for word in s:
        c[word]+=1
#print (c.most_common(1000))
words_top_ted=list(k[0] for k in c.most_common(1000))
counts_wiki_top1000=list(k[1] for k in c.most_common(1000))

In [77]:
hist, edges = np.histogram(counts_wiki_top1000, density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Top-1000 words distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [89]:
#train word2wec
wiki_model = Word2Vec(sentences_wiki, size=100, window=5, min_count=5, workers=4)

In [109]:
wiki_model.save('wiki_model')

In [110]:
wiki_model=Word2Vec.load('wiki_model')

In [111]:
wiki_model.most_similar("man")

KeyError: "word 'man' not in vocabulary"

In [112]:
wiki_model.most_similar("computer")

KeyError: "word 'computer' not in vocabulary"

In [113]:
wiki_model.most_similar("house")

KeyError: "word 'house' not in vocabulary"

In [116]:
# This assumes words_top_wiki is a list of strings, the top 1000 words
words_top_vec_wiki = wiki_model[words_top_wiki]

tsne = TSNE(n_components=2, random_state=0)
words_top_wiki_tsne = tsne.fit_transform(words_top_vec_wiki)

NameError: name 'words_top_wiki' is not defined

In [None]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_top_wiki_tsne[:,0],
                                    x2=words_top_wiki_tsne[:,1],
                                    names=words_top_wiki))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)