In [31]:
import os
import re
import urllib.request
import zipfile

from lxml import etree

import tensorflow as tf
import keras

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Data Preprocessing

In [4]:
# download raw data
download_path = "../data/"
# urlretrieve does not create missing parent directories, so make sure the
# target directory exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(download_path, exist_ok=True)
# Fetch the TED talk transcript XML and store it under download_path.
# The call is left as the cell's last expression so its (filename, headers)
# return value is still displayed as the cell output.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/ted_en-20160408.xml",
    filename=download_path+"ted_en-20160408.xml",
)

('../data/ted_en-20160408.xml', <http.client.HTTPMessage at 0x2a636bc90>)

In [15]:
# load xml file with tree structure and parse text
# Open the file in a context manager so the handle is closed as soon as
# parsing finishes instead of staying open for the rest of the session.
with open(download_path+"ted_en-20160408.xml", "r", encoding="UTF8") as xml_file:
    xml = etree.parse(xml_file)
# Gather the text of every <content> element and join the pieces with newlines.
parse_text = '\n'.join(xml.xpath('//content/text()'))
# remove texts in parentheses
content_text = re.sub(r'\([^)]*\)', '', parse_text)

### etree
The ``xpath`` method is used to query the XML structure.
This particular XPath query ``//content/text()`` selects the text content of all ``<content>`` tags in the XML document.

### re
``re.sub(pattern, replacement, string)``
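``re.sub`` finds every substring of ``string`` that matches ``pattern`` and replaces it with ``replacement``. In the cell above, the pattern ``\([^)]*\)`` matches an opening parenthesis, any run of characters other than ``)``, and the closing parenthesis, so parenthesized text is replaced with an empty string.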


In [28]:
# tokenize sentences
nltk.download('punkt') # download punkt tokenizer
# sentence tokenization : sent_text is list of sentences
sent_text = sent_tokenize(content_text)
# normalize sentences : lowercase each sentence, then replace every character
# that is not an ASCII letter or digit with a single space
non_alnum = re.compile("[^a-zA-Z0-9]")
sent_text_normalized = [non_alnum.sub(" ", sentence.lower()) for sentence in sent_text]

print(sent_text_normalized[:3])

[nltk_data] Downloading package punkt to /Users/godpeny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['here are two reasons companies fail  they only do more of the same  or they only do what s new ', 'to me the real  real solution to quality growth is figuring out the balance between two activities  exploration and exploitation ', 'both are necessary  but it can be too much of a good thing ']


In [30]:
# tokenize words : one list of word tokens per normalized sentence
word_tokenized_text = list(map(word_tokenize, sent_text_normalized))
# number of sentences with tokenized words
print(len(word_tokenized_text))
# peek at the first three tokenized sentences
print(word_tokenized_text[:3])

273424
[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation'], ['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']]


# Modeling

In [33]:
# Train word embeddings on the tokenized sentences.
model = Word2Vec(
    sentences=word_tokenized_text,
    vector_size=100,  # dimension of each embedding vector
    window=5,         # context window size
    min_count=5,      # ignore words that occur fewer than 5 times
    workers=4,        # number of training threads
    sg=0,             # 0 = CBOW, 1 = Skip-gram
)

### Word2Vec
``Word2Vec(sentences, vector_size, window, min_count, workers, sg)``
- sentences : list of tokenized sentences (each sentence is a list of words)
- vector_size : dimension of embedding vector
- window : window size of context words
- min_count : minimum frequency of words (if less than min_count, ignore)
- workers : number of threads
- sg : 0 for ``CBOW``, 1 for ``Skip-gram``

In [34]:
# show similar words : (word, similarity score) pairs for the query "girl"
similar_to_girl = model.wv.most_similar("girl")
print(similar_to_girl)

[('boy', 0.9291726350784302), ('woman', 0.833076000213623), ('lady', 0.818193793296814), ('kid', 0.8125599026679993), ('baby', 0.7507525682449341), ('man', 0.7321241497993469), ('sister', 0.698284924030304), ('soldier', 0.6746154427528381), ('daughter', 0.6730027794837952), ('mary', 0.6709443926811218)]


In [36]:
# save model
model_path = "../model/"
# Create the output directory first; writing into a missing directory would
# raise FileNotFoundError on a fresh checkout.
os.makedirs(model_path, exist_ok=True)
# Persist only the word vectors (model.wv) in word2vec format.
model.wv.save_word2vec_format(model_path+"eng_w2v")

In [38]:
# load model : read the saved word vectors back from disk
saved_vectors_file = model_path+"eng_w2v"
loaded_model = KeyedVectors.load_word2vec_format(saved_vectors_file)
# check the reloaded vectors with a similarity query
similar_to_dog = loaded_model.most_similar("dog")
print(similar_to_dog)

[('cat', 0.802236020565033), ('chair', 0.7937402129173279), ('leg', 0.7689682245254517), ('hat', 0.7485133409500122), ('mom', 0.7483894228935242), ('doctor', 0.7459036111831665), ('seat', 0.7310882210731506), ('uncle', 0.7277305722236633), ('wrist', 0.7218130230903625), ('nose', 0.7200853228569031)]
