In [4]:
#All the data will be treated as one big corpus. Although I have it in 5 sections but I will use it as the same.
#Word2vec uses the similarity according to the high in the cosine values.
from __future__ import absolute_import, division, print_function
#to encode words
import codecs
#regex searches files real fast
import glob
#concurrency runs multithreader (fast run)
import multiprocessing
#dealing with OS
import os
#human readability
import pprint
#regular expression duh..
import re
#NLTK!!!!
import nltk
#word2vec
import gensim.models.word2vec as w2v
#dimensional reduction to visualize dataset
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [5]:
#preprocessing the data
nltk.download('punkt') #pretrained tokenizer
nltk.download('stopwords') #removes useless words the, and

[nltk_data] Downloading package punkt to /Users/vijitvm/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vijitvm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
book_filenames = sorted(glob.glob("*.txt"))
book_filenames


['got1.txt', 'got2.txt', 'got3.txt', 'got4.txt', 'got5.txt']

In [8]:
#here i am combining the corpus of all the files that I have as the data. 'u' stands for unicode.
#at each iteration I print the siz of the total corpus.
#Use codecs to read the file and convert it in utf-8
##### Why? Codecs encodes the text file as a unicode text file
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("length of corpus is = {0}".format(len(corpus_raw)))
    print()

Reading 'got1.txt'...
length of corpus is = 1770659

Reading 'got2.txt'...
length of corpus is = 4071041

Reading 'got3.txt'...
length of corpus is = 6391405

Reading 'got4.txt'...
length of corpus is = 8107945

Reading 'got5.txt'...
length of corpus is = 9719485



In [10]:
#tokenize the corpus to sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(corpus_raw)


In [11]:
#no hyphens and other shit
def sentowords(raw) :
    clean = re.sub("[^a-zA-Z]", " ", raw)
    words = clean.split()
    return words

In [12]:
#generates the sentences
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) >0 :
        sentences.append(sentowords(raw_sentence))


In [19]:
#check out this example
print(raw_sentences[26])
print(sentowords(raw_sentences[26]))

He leaned against the battlement, the sea crashing beneath him, the black stone rough beneath his fingers.
['He', 'leaned', 'against', 'the', 'battlement', 'the', 'sea', 'crashing', 'beneath', 'him', 'the', 'black', 'stone', 'rough', 'beneath', 'his', 'fingers']


In [22]:
number_of_tokens = sum([len(sentence) for sentence in sentences])
print("tokens = {0}".format(number_of_tokens))

tokens = 1818103


## TRAIN THE WORD2VEC


In [58]:
#Tasks the word vector can do are
# 1. Similarity
# 2. Distance 
# 3. Ranking
# Lets take the dimensionality of resulting word vectors to be 300
# can increase for accuracy
num_features = 300

#min word count threshold
min_word_count = 3

#number of parallel threads
num_workers = multiprocessing.cpu_count()

#context window size
context_size = 7

#downsample for too frequent words
downsampling = 1e-3

#seeding for random number generator
seedsize = 1


In [64]:
GOT_vectors = w2v.Word2Vec(sg = 1,
            seed = 1,
            workers = num_workers,
            size = num_features,
            min_count = min_word_count,
            window = context_size,
            sample = downsampling) 


GOT_vectors.build_vocab(sentences)


GOT_vectors.train(sentences, total_examples=number_of_tokens, epochs=5)

(7023229, 9090515)

## Saving The Model

In [65]:
if not os.path.exists("trainedmodel"):
    os.makedirs("trainedmodel")

In [67]:
GOT_vectors.save(os.path.join("trainedmodel", "GOT_vectors.w2v"))

## Load the model


In [81]:
GOT_vectors = w2v.Word2Vec.load(os.path.join("trainedmodel", "GOT_vectors.w2v"))

## Compression and plots

In [90]:
#we use tsne here
# tsne = t stochastic neighbor embedding
#it takes the 300 dim vectors that we trained and then compresses them into 2 dims
from gensim import models


tsne = sklearn.manifold.TSNE(n_components = 2, random_state = 0)
all_word_vectors_matrix = GOT_vectors.wv.vectors


## Train t-SNE

In [91]:
word_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [188]:
points = pd.DataFrame(
    [ (word, coords[0], coords[1])
         for word, coords in [
             (word, word_matrix_2d[GOT_vectors.wv.vocab[word].index])
             for word in GOT_vectors.wv.vocab
         ]   
    ],
    columns = ["word", "x", "y"]
)

In [189]:
points.head(20)

Unnamed: 0,word,x,y
0,This,22.751398,-35.462864
1,edition,-56.527988,-13.052814
2,the,-24.505066,12.11822
3,complete,-8.919771,-8.757134
4,of,-26.291798,-24.792862
5,original,-3.037655,-1.821691
6,hardcover,-51.019245,-9.692529
7,ONE,-49.584076,-20.405176
8,A,20.821941,-5.962793
9,OF,-59.338356,-15.286861


In [244]:
sns.set_context("poster")
points.plot.scatter("x", "y", s=10, figsize=(10,8))

<matplotlib.axes._subplots.AxesSubplot at 0x1a26544b00>

In [150]:
def plot_region(x_bounds, y_bounds):
    slice = points[
        (x_bounds[0] <= points.x) &
        (points.x <= x_bounds[1]) & 
        (y_bounds[0] <= points.y) &
        (points.y <= y_bounds[1])
    ]
    
    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
    for i, point in slice.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)
        

In [151]:
plot_region(x_bounds=(4.0, 4.2), y_bounds=(-0.5, -0.1))

ValueError: scatter requires x column to be numeric

In [152]:
plot_region(x_bounds=(0, 1), y_bounds=(4, 4.5))

ValueError: scatter requires x column to be numeric

In [238]:
GOT_vectors.wv.most_similar("Varys")

[('Littlefinger', 0.7106744050979614),
 ('Spider', 0.6845917701721191),
 ('thinly', 0.663819432258606),
 ('politely', 0.6552637815475464),
 ('Qyburn', 0.6421846151351929),
 ('wanly', 0.6421177387237549),
 ('crookedly', 0.639447808265686),
 ('Pycelle', 0.638593316078186),
 ('Ghael', 0.6384761333465576),
 ('wicked', 0.6349178552627563)]

In [155]:
GOT_vectors.wv.most_similar("Aerys")

[('Jaehaerys', 0.7526637315750122),
 ('Mad', 0.7265937924385071),
 ('II', 0.6934542655944824),
 ('Conciliator', 0.6877614259719849),
 ('Daeron', 0.6836853623390198),
 ('Unworthy', 0.6712086796760559),
 ('Beggar', 0.6679360866546631),
 ('reign', 0.6677627563476562),
 ('appointment', 0.6672744750976562),
 ('Maekar', 0.6670339107513428)]

In [156]:
GOT_vectors.wv.most_similar("direwolf")

[('wolf', 0.6757630109786987),
 ('Ghost', 0.6407413482666016),
 ('pup', 0.6214362382888794),
 ('Rickon', 0.6112809181213379),
 ('Stark', 0.6073806881904602),
 ('SHAGGYDOG', 0.6057423949241638),
 ('muzzle', 0.6027623414993286),
 ('Shaggydog', 0.5991887450218201),
 ('eagle', 0.5926990509033203),
 ('wight', 0.5774366855621338)]

In [159]:
#distance, similarity, and ranking
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = GOT_vectors.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

In [243]:
nearest_similarity_cosmul("Stark", "Robb", "Jon")
nearest_similarity_cosmul("sword", "Jaime", "Jon")
nearest_similarity_cosmul("Jaime", "Tyrion", "Bran")

Stark is related to Robb, as Snow is related to Jon
sword is related to Jaime, as Longclaw is related to Jon
Jaime is related to Tyrion, as Rickon is related to Bran


'Rickon'