In [1]:
# The goal is to create word vectors from GOT book
# and analyse them to see semantic similarity
# 
from __future__ import absolute_import, division, print_function
# for word encoding
import codecs
# regex
import glob
# concurrency
import multiprocessing
# dealing with the OS, like reading a file
import os
# pretty printing human readable
import pprint
# reguular exptession
import re
# natuural language toolkit
import nltk
# word 2 vec
import gensim.models.word2vec as w2v
# dimensionality reduuction
import sklearn.manifold
# math
import numpy as np
# plotting
import matplotlib.pyplot as plt
#parse pandas as pd
import pandas as pd
# visualization 
import seaborn as sns

In [2]:
# Step 1 - Process our data
# clean data
nltk.download('punkt') # Pretrained tokenizer
nltk.download('stopwords') # Words like and, the, an, a, of

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


False

In [3]:
# get the book names , matching txt file
# https://github.com/llSourcell/word_vectors_game_of_thrones-LIVE/tree/master/data
book_filenames = sorted(glob.glob("*.txt"))
book_filenames

['got1.txt', 'got2.txt', 'got3.txt', 'got4.txt', 'got5.txt']

In [4]:
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
        
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()

Reading 'got1.txt'...
Corpus is now 1770659 characters long

Reading 'got2.txt'...
Corpus is now 4071041 characters long

Reading 'got3.txt'...
Corpus is now 6391405 characters long

Reading 'got4.txt'...
Corpus is now 8107945 characters long

Reading 'got5.txt'...
Corpus is now 9719485 characters long



In [None]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [None]:
# Convert into list of words
# remove unnecessary, split into words, no hyphens
# list of words
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]", " ", raw)
    words = clean.split()
    return words

In [None]:
# Sentencess where each word is tokenized
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [None]:
print(raw_sentences[6030])
print(sentence_to_wordlist(raw_sentences[6030]))

In [None]:
token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))

In [None]:
# Train Word2Vec
# once we have vectors 
# STEP 3 Build Model
# 3 main tasks that vectors help with 
# DISTANCE, SIMILARITY, RANKING

# Dimensionality of the resulting word vectors
# more dimensions, more computationally expensive to train
# but alsp more accurate
# more dimensions = more generalized
num_features = 300
# Minimum word count threshold
min_word_count = 3

# Number of threads to run in parallel
# more workers, fster we train
num_workers = multiprocessing.cpu_count()

# Context windown length
context_size = 7

# Downsample setting for freqeunt workds
# 0 - 1e-5 is good for thsi
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible
# Random number generatior
# deterministic, good for debugging
seed = 1

In [None]:
thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [None]:
thrones2vec.build_vocab(sentences)

In [None]:
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

In [None]:
thrones2vec.train(sentences)

In [None]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [None]:
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

In [None]:
# Explore the trained model
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

In [None]:
# Compress the word vector into 2D space and plot them
# Visit How to visualize a dataset easily
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [None]:
all_word_vectors_matrix = thrones2vec.wv.syn0

In [None]:
# Train t-SNE, this could take a minute or two
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
# Plot the big picture
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, all_word_vectors_matrix_2d[thrones2vec.wv.vocab[word].index])
            for word in thrones2vec.wv.vocab
        ]
    ],
    columns = ["word", "x", "y"]
)

In [None]:
points.head(10)

In [None]:
sns.set_context("poster")

In [None]:
points.plot.scatter("x", "y", s=10, figsize=(20, 12))
sns.plt.show()

In [None]:
# Zoom in to some interesting places
def plot_region(x_bounds, y_bounds):
    slice = points[
        (x_bounds[0] <= points.x) &
        (points.x <= x_bounds[1]) &
        (y_bounds[0] <= points.y) &
        (points.y <= y_bounds[1])
    ]
    
    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
    for i, point in slice.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [None]:
# People related to Kingsguard ended up together
plot_region(x_bounds = (4.0, 4.2), y_bounds=(-0.5, -0.1))

In [None]:
# Food products are grouped nicely as well. Aerys - The mad king, being chlose to "roased" also looks sadly correct
plot_region(x_bound=(0, 1), y_bounds=(4, 4.5))

In [None]:
# Explore semantic similarities between book characters
# Words closest to the given word
thrones2vec.wv.most_similar("Stark")

In [None]:
thrones2vec.wv.most_similar("Aerys")

In [None]:
thrones2vec.wv.most_similar("direwolf")

In [None]:
# Linear relationship between word pairs
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = thrones2vec.most_similar_cosmul(
        positive = [end2, start1],
        negative = [end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2}  is related to {end2}".format(**locals()))
    return start2

In [None]:
nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")
nearest_similarity_cosmul("Jaime", "sword", "wine")
nearest_similarity_cosmul("Arya", "Nymeria", "dragons")

In [None]:
nearest_similarity_cosmul("Lyanna", "Arya", "Sansa")