In [1]:
from __future__ import absolute_import, division, print_function

import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

import gensim.models.word2vec as w2v
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.manifold

In [2]:
# Directory holding the book text files, resolved against the current
# working directory (the trailing slash is part of the value).
path = "/".join((os.getcwd(), "data/GOT_data/"))

# Every .txt file in that directory, in sorted order.
book_filenames = glob.glob(path + '*.txt')
book_filenames.sort()
book_filenames

['/home/itachi/Natural_language_processing/data/GOT_data/got1.txt',
 '/home/itachi/Natural_language_processing/data/GOT_data/got2.txt',
 '/home/itachi/Natural_language_processing/data/GOT_data/got3.txt',
 '/home/itachi/Natural_language_processing/data/GOT_data/got4.txt',
 '/home/itachi/Natural_language_processing/data/GOT_data/got5.txt']

In [3]:
# Read every book as UTF-8 and combine the texts, in filename order, into a
# single unicode corpus. The pieces are gathered in a list and joined once
# at the end; a running character count feeds the progress message.
corpus_parts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, 'r', 'utf-8') as book_file:
        book_text = book_file.read()
    corpus_parts.append(book_text)
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(corpus_parts)

Reading '/home/itachi/Natural_language_processing/data/GOT_data/got1.txt'...
Corpus is now 1770659 characters long

Reading '/home/itachi/Natural_language_processing/data/GOT_data/got2.txt'...
Corpus is now 4071041 characters long

Reading '/home/itachi/Natural_language_processing/data/GOT_data/got3.txt'...
Corpus is now 6391405 characters long

Reading '/home/itachi/Natural_language_processing/data/GOT_data/got4.txt'...
Corpus is now 8107945 characters long

Reading '/home/itachi/Natural_language_processing/data/GOT_data/got5.txt'...
Corpus is now 9719485 characters long



In [4]:
# Peek at the first 151 characters of the combined corpus as a sanity check.
corpus_raw[:151]

u'This edition contains the complete text of the original hardcover edition.\n\nNOT ONE WORD HAS BEEN OMITTED.\n\nA CLASH OF KINGS\n\nA Bantam Spectra Book\n\nPU'

In [5]:
# Load the English Punkt sentence tokenizer from NLTK's data files.
# NOTE(review): presumably requires the 'punkt' resource to have been
# downloaded beforehand (e.g. via nltk.download) — confirm.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
# Split the raw corpus text into individual sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [7]:
def sentence_to_wordlist(raw):
    """Turn one raw sentence into a list of purely alphabetic words.

    Every character outside a-z / A-Z (digits, punctuation, apostrophes,
    non-ASCII letters, ...) is replaced by a space, and the result is split
    on whitespace. Case is preserved.

    Args:
        raw: The sentence text to tokenize.

    Returns:
        A list of word strings; empty if the sentence has no ASCII letters.
    """
    return re.sub("[^a-zA-Z]", " ", raw).split()

In [8]:
# Tokenize every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [9]:
# Show one raw sentence next to its tokenized form.
sample_sentence = raw_sentences[5]
print(sample_sentence)
print(sentence_to_wordlist(sample_sentence))

Heraldic crest by Virginia Norey.
[u'Heraldic', u'crest', u'by', u'Virginia', u'Norey']


In [10]:
# Total number of word tokens across all tokenized sentences.
token_count = sum(len(sentence) for sentence in sentences)
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus containes 1,818,103 tokens


In [11]:
# Surface gensim's INFO-level progress messages in the notebook output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300                              # Word vector dimensionality
min_word_count = 20                             # Minimum word count
num_workers = multiprocessing.cpu_count()       # Number of threads to run in parallel
context = 10                                    # Context window size
downsampling = 1e-3                             # Downsample setting for frequent words
seed = 1                                        # Seed for the random number generator
# NOTE(review): per the gensim Word2Vec documentation, a fixed seed alone does
# not make training fully reproducible when more than one worker thread is
# used — confirm whether exact repeatability matters for this notebook.

# Train the model (this will take some time)
print("Training model...")
Game_of_thrones_2_vec = w2v.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    seed=seed,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
Game_of_thrones_2_vec.init_sims(replace=True)


2017-08-08 22:44:53,232 : INFO : collecting all words and their counts
2017-08-08 22:44:53,234 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-08 22:44:53,297 : INFO : PROGRESS: at sentence #10000, processed 140984 words, keeping 10280 word types
2017-08-08 22:44:53,343 : INFO : PROGRESS: at sentence #20000, processed 279730 words, keeping 13558 word types
2017-08-08 22:44:53,402 : INFO : PROGRESS: at sentence #30000, processed 420336 words, keeping 16598 word types


Training model...


2017-08-08 22:44:53,458 : INFO : PROGRESS: at sentence #40000, processed 556581 words, keeping 18324 word types
2017-08-08 22:44:53,515 : INFO : PROGRESS: at sentence #50000, processed 686247 words, keeping 19714 word types
2017-08-08 22:44:53,572 : INFO : PROGRESS: at sentence #60000, processed 828497 words, keeping 21672 word types
2017-08-08 22:44:53,633 : INFO : PROGRESS: at sentence #70000, processed 973830 words, keeping 23093 word types
2017-08-08 22:44:53,690 : INFO : PROGRESS: at sentence #80000, processed 1114967 words, keeping 24252 word types
2017-08-08 22:44:53,748 : INFO : PROGRESS: at sentence #90000, processed 1260481 words, keeping 26007 word types
2017-08-08 22:44:53,798 : INFO : PROGRESS: at sentence #100000, processed 1393203 words, keeping 26884 word types
2017-08-08 22:44:53,857 : INFO : PROGRESS: at sentence #110000, processed 1532150 words, keeping 27809 word types
2017-08-08 22:44:53,918 : INFO : PROGRESS: at sentence #120000, processed 1680961 words, keeping 2

In [12]:
# Report the vocabulary size and the dimensionality of a single word vector.
# Both lookups go through `.wv`, the model's keyed-vectors object, rather
# than indexing the model directly.
print("Word2Vec vocabulary length:", len(Game_of_thrones_2_vec.wv.vocab))
print("Word2Vec word vector length:", Game_of_thrones_2_vec.wv["Stark"].size)

Word2Vec vocabulary length: 5996
Word2Vec word vector length: 300


In [13]:
# Create the "trained" output directory (relative to the working directory)
# if nothing exists at that path yet.
if not os.path.exists("trained"):
    os.makedirs("trained")

In [14]:
# Persist the trained model to trained/Game_of_thrones_2_vec.w2v.
Game_of_thrones_2_vec.save(os.path.join("trained", "Game_of_thrones_2_vec.w2v"))

2017-08-08 22:45:38,651 : INFO : saving Word2Vec object under trained/Game_of_thrones_2_vec.w2v, separately None
2017-08-08 22:45:38,654 : INFO : not storing attribute syn0norm
2017-08-08 22:45:38,656 : INFO : not storing attribute cum_table
2017-08-08 22:45:38,827 : INFO : saved trained/Game_of_thrones_2_vec.w2v


In [15]:
# Load the model from trained/Game_of_thrones_2_vec.w2v, rebinding
# Game_of_thrones_2_vec to the copy read from disk.
Game_of_thrones_2_vec = w2v.Word2Vec.load(os.path.join("trained", "Game_of_thrones_2_vec.w2v"))

2017-08-08 22:45:40,200 : INFO : loading Word2Vec object from trained/Game_of_thrones_2_vec.w2v
2017-08-08 22:45:40,342 : INFO : loading wv recursively from trained/Game_of_thrones_2_vec.w2v.wv.* with mmap=None
2017-08-08 22:45:40,343 : INFO : setting ignored attribute syn0norm to None
2017-08-08 22:45:40,344 : INFO : setting ignored attribute cum_table to None
2017-08-08 22:45:40,345 : INFO : loaded trained/Game_of_thrones_2_vec.w2v


**Visualizing the Word2Vec model**

In [16]:

#tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)  
#
#all_word_vectors_matrix = Game_of_thrones_2_vec.wv.syn0
#
#all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)
#
#points = pd.DataFrame(
#    [
#        (word, coords[0], coords[1])
#        for word, coords in [
#            (word, all_word_vectors_matrix_2d[Game_of_thrones_2_vec.wv.vocab[word].index])
#            for word in Game_of_thrones_2_vec.wv.vocab
#        ]
#    ],
#    columns=["word", "x", "y"]
#)
#
#print(points.head(10))
#sns.set_context("poster")
#points.plot.scatter("x", "y", s=10, figsize=(20, 12))
#
#def plot_region(x_bounds, y_bounds):
#    slice = points[
#        (x_bounds[0] <= points.x) &
#        (points.x <= x_bounds[1]) &
#        (y_bounds[0] <= points.y) &
#        (points.y <= y_bounds[1]) 
#    ]
#    
#    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
#    for i, point in slice.iterrows():
#        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)
#
#plot_region(x_bounds = (4.0, 4.2), y_bounds = (-0.5, -0.1))
#plot_region(x_bounds = (0, 1), y_bounds = (4, 4.5))


In [17]:
# Words whose vectors are closest to "Aerys", queried through the model's
# keyed vectors (`.wv`) rather than the deprecated model-level method.
Game_of_thrones_2_vec.wv.most_similar("Aerys")

2017-08-08 22:45:43,621 : INFO : precomputing L2-norms of word weight vectors


[('Mad', 0.8702609539031982),
 ('Rhaegar', 0.859840989112854),
 ('Aegon', 0.8517005443572998),
 ('Robert', 0.8361358046531677),
 ('Daeron', 0.8048063516616821),
 ('Jaehaerys', 0.7642648220062256),
 ('Justice', 0.7641534805297852),
 ('Hand', 0.7634960412979126),
 ('Doran', 0.7401446104049683),
 ('North', 0.737543523311615)]

In [18]:
# Words whose vectors are closest to "Stark", queried through the model's
# keyed vectors (`.wv`) rather than the deprecated model-level method.
Game_of_thrones_2_vec.wv.most_similar("Stark")

[('Arryn', 0.7937906384468079),
 ('Brandon', 0.7730824947357178),
 ('Lady', 0.751340389251709),
 ('murdered', 0.7447934150695801),
 ('bastard', 0.6961814761161804),
 ('Winterfell', 0.6904762983322144),
 ('Robb', 0.6817637085914612),
 ('Tully', 0.6800894141197205),
 ('Young', 0.6777874231338501),
 ('Roose', 0.6703848242759705)]