In [None]:
import re
import urllib.request
import zipfile
from lxml import etree

import tensorflow as tf
import keras

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

from gensim.models import FastText
from gensim.models import KeyedVectors

# Data Preprocessing

In [None]:
# Use the corpus file that already exists on disk: parse the XML tree and
# pull out the transcript text.
data_path = "../data/"
# Open through a context manager so the file handle is closed as soon as
# parsing is done; previously the handle was rebound to the parsed tree and
# never closed.
with open(data_path + "ted_en-20160408.xml", "r", encoding="UTF8") as xml_file:
    xml = etree.parse(xml_file)  # `xml` is the parsed tree, same as before
# Collect every text node directly under a <content> element, newline-separated.
parse_text = '\n'.join(xml.xpath('//content/text()'))
# Drop every parenthesised span, i.e. "(" up to the next ")".
content_text = re.sub(r'\([^)]*\)', '', parse_text)

In [None]:
# Sentence tokenization followed by per-sentence normalization.
nltk.download('punkt')  # download punkt tokenizer
sent_text = sent_tokenize(content_text)  # list of sentence strings

# Lower-case each sentence and replace every character that is not an ASCII
# letter or digit with a single space.
non_alnum = re.compile("[^a-zA-Z0-9]")
sent_text_normalized = [non_alnum.sub(" ", sentence.lower()) for sentence in sent_text]

print(sent_text_normalized[:3])

In [10]:
# Split each normalized sentence into a list of word tokens.
word_tokenized_text = list(map(word_tokenize, sent_text_normalized))
sentence_count = len(word_tokenized_text)
print(sentence_count)  # number of tokenized sentences
print(word_tokenized_text[:3])  # first three tokenized sentences

273424
[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation'], ['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']]


# Modeling

In [11]:
# Build a FastText model from the tokenized sentences; gensim trains
# immediately when `sentences` is passed to the constructor.
model = FastText(
    sentences=word_tokenized_text,
    window=5,      # max distance between the current and predicted word
    min_count=5,   # ignore words seen fewer than 5 times in total
    workers=4,     # number of training threads
    sg=1,          # 1 = skip-gram (0 would select CBOW)
)

In [12]:
# Inference: show the nearest neighbours of a query word in the embedding space.
# NOTE(review): presumably chosen as a rare/unseen word to show FastText's
# subword-based vectors — confirm whether it occurs in the corpus.
query_word = "electrofishing"
model.wv.most_similar(query_word)

[('electrolux', 0.8682365417480469),
 ('electrolyte', 0.8672255873680115),
 ('electroshock', 0.8495044112205505),
 ('electro', 0.847086489200592),
 ('electrochemical', 0.8444557785987854),
 ('airbus', 0.8321382403373718),
 ('electroencephalogram', 0.8314631581306458),
 ('airbag', 0.8235702514648438),
 ('electrogram', 0.817474901676178),
 ('electromagnet', 0.8151991963386536)]