In [11]:
# step 0: import dependencies
from __future__ import absolute_import, division,print_function
import multiprocessing
import glob
import codecs # for word enconding
import os
import pprint
import re # regular expression
import nltk # natural language toolkit
import gensim.models.word2vec as w2v # word2vec
import sklearn.manifold # dimensionality reduction

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [7]:
# step 1: process data
# Fetch the NLTK resources used by this notebook; packages that are already
# installed are reported as up-to-date rather than downloaded again.
nltk.download('punkt') # pretrained Punkt tokenizer models (tokenizers/punkt)
nltk.download('stopwords') # stop-word lists: words like and, the, an, a

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hli378\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hli378\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Gather every .txt file in the current working directory; sorting the
# names keeps the books in a stable order for the concatenation step.
book_filenames = glob.glob('*.txt')
book_filenames.sort()

In [12]:
# Inspect which files were matched and the order they will be read in.
book_filenames

['got1.txt', 'got2.txt', 'got3.txt', 'got4.txt', 'got5.txt']

# Combine the books into one string

In [42]:
# Read each book as UTF-8 and append its text to one running corpus string,
# printing the cumulative length after every file. No separator is inserted
# between consecutive books.
corpus_raw = u""
for book_path in book_filenames:
    print("Reading '{0}'...".format(book_path))
    with codecs.open(book_path, mode="r", encoding="utf-8") as handle:
        book_text = handle.read()
    corpus_raw = corpus_raw + book_text
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()

Reading 'got1.txt'...
Corpus is now 1770659 characters long

Reading 'got2.txt'...
Corpus is now 4071041 characters long

Reading 'got3.txt'...
Corpus is now 6391405 characters long

Reading 'got4.txt'...
Corpus is now 8107945 characters long

Reading 'got5.txt'...
Corpus is now 9719485 characters long



In [43]:
# Load the pretrained English Punkt tokenizer from the NLTK data directory.
# NOTE(review): this resource is a pickle file; only load it from a trusted nltk_data install.
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

In [44]:
# Run the Punkt tokenizer over the whole corpus; each item is later treated as one raw sentence.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [45]:
def sentence_to_wordlist(raw):
    """Split one raw sentence into a list of purely alphabetic words.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, hyphens,
    apostrophes, non-ASCII letters) is replaced by a space before splitting,
    so ``"well-known"`` yields ``["well", "known"]``. Case is preserved.

    Args:
        raw: The sentence text to tokenize.

    Returns:
        The words in order of appearance; an empty list when ``raw``
        contains no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return letters_only.split()

In [46]:
# Tokenize every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [48]:
# Peek at the first two tokenized sentences.
sentences[:2]

[['This',
  'edition',
  'contains',
  'the',
  'complete',
  'text',
  'of',
  'the',
  'original',
  'hardcover',
  'edition'],
 ['NOT', 'ONE', 'WORD', 'HAS', 'BEEN', 'OMITTED']]

# train word2vec

In [49]:
# Three main tasks that word vectors are used for.
# This is a bare string expression, so the notebook echoes it as the cell's output.
'''
distance
similarity
ranking
'''

'\ndistance\nsimilarity\nranking\n'

In [55]:
# Hyperparameters handed to the Word2Vec constructor in a later cell.
num_features=300 # dimensionality of the resulting word vectors (passed as `size`)
min_word_count=3 # words occurring fewer times than this are left out of the vocabulary (`min_count`)
num_workers=multiprocessing.cpu_count() # number of worker threads: one per CPU reported by the OS (`workers`)
context_size=7 # context window: maximum distance between the current word and a context word (`window`)
downsampling=1e-3 # downsampling threshold for very frequent words (`sample`)

# Seed for the random number generator, so runs are repeatable (handy for debugging).
# NOTE(review): gensim documents that a fully deterministic run also needs workers=1
# (and a fixed PYTHONHASHSEED on Python 3) — confirm if exact reproducibility matters.
seed = 1

In [84]:
# Configure the model from the hyperparameters defined earlier. No corpus is
# passed here, so the vocabulary is built and training is run in later cells.
# NOTE(review): gensim 4 renamed `size` to `vector_size`; this call targets the 3.x API.
thrones2vec = w2v.Word2Vec(
    # architecture: sg=1 selects skip-gram (instead of CBOW)
    sg=1,
    size=num_features,
    window=context_size,
    # vocabulary pruning and frequent-word subsampling
    min_count=min_word_count,
    sample=downsampling,
    # execution
    workers=num_workers,
    seed=seed,
)

In [85]:
# Build the model's vocabulary from the tokenized sentences (must happen before training).
thrones2vec.build_vocab(sentences)

In [86]:
# Report the size of the vocabulary held by the model's word vectors.
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

Word2Vec vocabulary length: 17277


# Start training, this might take a minute or two...

In [87]:
# Train on the same sentences the vocabulary was built from, taking the example
# count and the number of epochs from the values stored on the model itself.
thrones2vec.train(sentences, total_examples=thrones2vec.corpus_count, epochs=thrones2vec.epochs)

(7024657, 9090515)

# Save to file, can be useful later

In [88]:
# Persist the trained model under ./trained, creating the directory on first use.
output_dir = "trained"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_path = os.path.join(output_dir, "thrones2vec.w2v")
thrones2vec.save(model_path)

In [89]:
# Words whose vectors are closest to "Stark". The query goes through the
# model's KeyedVectors (`.wv`) because calling most_similar directly on the
# Word2Vec model is deprecated in gensim 3.x and removed in 4.0.
thrones2vec.wv.most_similar("Stark")

  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


[('Eddard', 0.7453762292861938),
 ('Winterfell', 0.6407965421676636),
 ('beheaded', 0.6220446825027466),
 ('Brandon', 0.6193771362304688),
 ('Hornwood', 0.6130884289741516),
 ('executed', 0.6095959544181824),
 ('Lyanna', 0.6092574000358582),
 ('Robb', 0.6056458950042725),
 ('Arryn', 0.6056065559387207),
 ('Rickard', 0.5962651968002319)]