In [1]:
import numpy as np
import os
from random import shuffle
import re

In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [3]:
import urllib.request
import lxml.etree

In [4]:
file = open('ted_en-20160408.xml', 'r', encoding="utf8")

In [5]:
doc = lxml.etree.parse(file)
input_text = '\n'.join(doc.xpath('//content/text()'))
del doc

In [6]:
i = input_text.find("Hyowon Gweon: See this?")
input_text[i-20:i+150]

' baby does.\n(Video) Hyowon Gweon: See this? (Ball squeaks) Did you see that? (Ball squeaks) Cool. See this one? (Ball squeaks) Wow.\nLaura Schulz: Told you. (Laughs)\n(Vide'

In [7]:
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

In [8]:
i = input_text_noparens.find("Hyowon Gweon: See this?")
input_text_noparens[i-20:i+150]

"hat the baby does.\n Hyowon Gweon: See this?  Did you see that?  Cool. See this one?  Wow.\nLaura Schulz: Told you. \n HG: See this one?  Hey Clara, this one's for you. You "

In [9]:
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

# Uncomment if you need to save some RAM: these strings are about 50MB.
# del input_text, input_text_noparens

# Let's view the first few:
sentences_strings_ted[:5]

["Here are two reasons companies fail: they only do more of the same, or they only do what's new",
 'To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation',
 ' Both are necessary, but it can be too much of a good thing',
 'Consider Facit',
 " I'm actually old enough to remember them"]

In [10]:
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)

In [11]:
len(sentences_ted)

266694

In [12]:
print(sentences_ted[0])
print(sentences_ted[1])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']


In [13]:
words_ted = [item for sublist in sentences_ted for item in sublist]
freq={}
for w in words_ted:
    if w in freq:
        freq[w] += 1
    else:
        freq[w] = 1

In [14]:
top1000 = sorted(freq, key = lambda x: int(freq[x]), reverse = True)
top1000 = top1000[:1000]

counts_ted_top1000 = []
for i in top1000:
    counts_ted_top1000.append(freq[i])

In [15]:
hist, edges = np.histogram(counts_ted_top1000, density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Top-1000 words distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [17]:
from gensim.models import Word2Vec

In [18]:
model1 = Word2Vec(sentences_ted, min_count=10)

In [19]:
model1.most_similar("man")

[('woman', 0.8370866179466248),
 ('guy', 0.8027213215827942),
 ('lady', 0.772685170173645),
 ('girl', 0.7476345300674438),
 ('boy', 0.7447001934051514),
 ('soldier', 0.7234354615211487),
 ('gentleman', 0.7049674987792969),
 ('poet', 0.6929847002029419),
 ('kid', 0.6863695979118347),
 ('rabbi', 0.6764403581619263)]

In [20]:
model1.most_similar("computer")

[('machine', 0.7308882474899292),
 ('robot', 0.7239102125167847),
 ('software', 0.7124124765396118),
 ('device', 0.6820383071899414),
 ('program', 0.6371703743934631),
 ('3d', 0.6336008310317993),
 ('chip', 0.6303318738937378),
 ('interface', 0.6233747005462646),
 ('camera', 0.6195968389511108),
 ('simulation', 0.616118311882019)]

In [21]:
model1.most_similar(positive=['flower', 'tree'], negative=['ground'])

[('neuron', 0.7101984024047852),
 ('creature', 0.6920633912086487),
 ('photo', 0.6713805198669434),
 ('bird', 0.6547281742095947),
 ('section', 0.6507145762443542),
 ('bacterium', 0.647260844707489),
 ('bee', 0.6467205286026001),
 ('cage', 0.6423370242118835),
 ('dolphin', 0.642247200012207),
 ('tube', 0.6411620378494263)]

In [22]:
words_top_vec_ted = model1[top1000]

In [23]:
print(words_top_vec_ted)

[[-0.36918798 -1.32064271  0.90669852 ...,  0.92583233 -2.86785889
  -0.01630816]
 [-0.1027891  -0.03383823 -0.21138939 ...,  0.93501687 -0.99886638
  -1.90030503]
 [-0.23670186  0.32994321  0.79477692 ..., -0.31519613  1.17926311
  -0.47619048]
 ..., 
 [-1.19546926  0.40297848  0.72879303 ...,  1.10878217 -0.53809792
  -1.14782846]
 [-1.18012702 -0.27771175  1.21320963 ...,  0.28534794 -1.4027549
  -0.51754224]
 [ 1.16119611  1.04694653 -0.85677665 ...,  0.77769578 -0.73290694
  -1.1953845 ]]


In [24]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
top1000_tsne = tsne.fit_transform(words_top_vec_ted)

In [25]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=top1000_tsne[:,0],
                                    x2=top1000_tsne[:,1],
                                    names=top1000))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [17]:
input_text = str(open('wiki.train.raw', 'r').read())
print(input_text[:1000])


sentences_wiki = []
for line in input_text.split('\n'):
    s = [x for x in line.split('.') if x and len(x.split()) >= 5]
    sentences_wiki.extend(s)
    
for s_i in range(len(sentences_wiki)):
    sentences_wiki[s_i] = re.sub("[^a-z]", " ", sentences_wiki[s_i].lower())
    sentences_wiki[s_i] = re.sub(r'\([^)]*\)', '', sentences_wiki[s_i])
del input_text

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series 

In [19]:
n_sentences_wiki = []
for sent_str in sentences_wiki:
    n_sentences_wiki.append(sent_str.split())

# sample 1/5 of the data
shuffle(n_sentences_wiki)
print(len(n_sentences_wiki))
n_sentences_wiki = n_sentences_wiki[:int(len(n_sentences_wiki)/5)]
print(len(n_sentences_wiki))

1429102
285820


In [20]:
freqs = {}
for sentence in n_sentences_wiki:
    for word in sentence:
        if word in freqs:
            freqs[word] += 1
        else:
            freqs[word] = 1

In [21]:
words_top_wiki = sorted(freqs, key=lambda x: int(freqs[x]), reverse=True)
counts_wiki_top1000 = [freqs[word] for word in words_top_wiki[:1000]]

print(words_top_wiki[:10])
print(counts_wiki_top1000[:10])

['the', 'of', 'and', 'in', 'to', 'a', 'was', 's', 'on', 'as']
[429871, 183856, 167962, 145492, 133052, 120971, 71687, 55047, 51595, 48557]


In [24]:
hist, edges = np.histogram(counts_wiki_top1000, density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Top-1000 words distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [25]:
model2 = Word2Vec(min_count = 5, size=100, sentences = n_sentences_wiki)

In [26]:
model2.most_similar('boy')

[('protagonist', 0.7549673914909363),
 ('girl', 0.7314180135726929),
 ('dog', 0.7259712219238281),
 ('man', 0.705185055732727),
 ('dragon', 0.7040812373161316),
 ('hero', 0.7023768424987793),
 ('kid', 0.6906249523162842),
 ('monster', 0.6857444047927856),
 ('baby', 0.6763310432434082),
 ('batman', 0.6622106432914734)]

In [27]:
model2.most_similar('flower')

[('coloured', 0.8598988056182861),
 ('metallic', 0.8580095767974854),
 ('winged', 0.8567759990692139),
 ('pale', 0.8539287447929382),
 ('tiny', 0.8502558469772339),
 ('spore', 0.8488335609436035),
 ('walled', 0.8438517451286316),
 ('olive', 0.8370686173439026),
 ('slender', 0.8330860137939453),
 ('buff', 0.832647442817688)]

In [28]:
model2.most_similar(positive=['man', 'boy'], negative=['girl'])

[('protagonist', 0.6099501848220825),
 ('hero', 0.6006561517715454),
 ('sword', 0.5794966816902161),
 ('dragon', 0.5587184429168701),
 ('batman', 0.5460250973701477),
 ('character', 0.5434754490852356),
 ('doctor', 0.5370021462440491),
 ('player', 0.5330149531364441),
 ('robot', 0.5314503312110901),
 ('master', 0.5190551280975342)]

In [29]:
# This assumes words_top_wiki is a list of strings, the top 1000 words
words_top_vec_wiki = model2[words_top_wiki[:1000]]
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_wiki_tsne = tsne.fit_transform(words_top_vec_wiki)

In [30]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_top_wiki_tsne[:,0],
                                    x2=words_top_wiki_tsne[:,1],
                                    names=words_top_wiki))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

