In [1]:
# Enable the autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to imported .py files are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
from argparse import Namespace
import copy
import gensim
from gensim.models import Word2Vec
import json
import nltk; nltk.download('punkt')
import numpy as np
import pandas as pd
import re
import urllib

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mhisf\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [12]:
# Central configuration for the notebook; later cells read these via `args.<name>`.
args = Namespace(
    seed=1234,  # random seed value kept alongside the other run settings
    # Built with os.path.join so the relative path resolves on POSIX as well as
    # on Windows (where it is still "language\\movie_corpus.txt").
    data_file=os.path.join("language", "movie_corpus.txt"),
    embedding_dim=100,  # handed to Word2Vec as the embedding size
    window=5,  # handed to Word2Vec as `window`
    min_count=3,  # handed to Word2Vec as `min_count`
    skip_gram=1, # 0 = CBOW
    negative_sampling=20,  # handed to Word2Vec as `negative`
)

In [40]:
# Read the whole corpus and break it into sentences with NLTK's
# pre-trained English Punkt tokenizer, then preview the result.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.data_file, encoding='utf8') as corpus_file:
    book = corpus_file.read()
sentences = tokenizer.tokenize(book)
print(len(sentences), sentences[11], sep="\n")

100013
However, as this is a comedy, the duo manage to make the most round about and stupid escape--accidentally boarding a train to Constantinople to be placed in a Turkish prisoner of war camp!


In [41]:
# Preprocessing
def preprocess_text(text):
    """Normalise a piece of text into lowercase, space-separated tokens.

    The text is lowercased, each of the marks . , ! ? is surrounded by
    spaces so it stands alone, and every run of characters other than
    ASCII letters and those four marks is replaced by a single space.
    Leading and trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; may be empty if nothing survives the filtering.
    """
    lowered = text.lower()
    # Isolate sentence punctuation so each mark becomes its own token.
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Digits, quotes, dashes, whitespace runs, etc. all collapse to one space.
    collapsed = re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)
    return collapsed.strip()

In [42]:
# Run every sentence through preprocess_text, then preview one cleaned example
sentences = list(map(preprocess_text, sentences))
print(sentences[11])

however , as this is a comedy , the duo manage to make the most round about and stupid escape accidentally boarding a train to constantinople to be placed in a turkish prisoner of war camp !


In [43]:
# Process sentences for gensim: turn each cleaned sentence into a list of tokens.
# split() with no argument is used instead of split(" ") so that a sentence
# which preprocess_text reduced to "" becomes [] rather than [''] -- otherwise
# an empty-string "word" would be fed to the model as a real token.
sentences = [sentence.split() for sentence in sentences]
print (sentences[11])

['however', ',', 'as', 'this', 'is', 'a', 'comedy', ',', 'the', 'duo', 'manage', 'to', 'make', 'the', 'most', 'round', 'about', 'and', 'stupid', 'escape', 'accidentally', 'boarding', 'a', 'train', 'to', 'constantinople', 'to', 'be', 'placed', 'in', 'a', 'turkish', 'prisoner', 'of', 'war', 'camp', '!']


In [44]:
# Show one more processed sentence from further into the corpus
sentences[10111]

['the',
 'storyline',
 'of',
 'this',
 'game',
 'is',
 ',',
 'i',
 'think',
 'one',
 'of',
 'the',
 'greatest',
 'from',
 'the',
 'final',
 'fantasy',
 'series',
 'and',
 'the',
 'graphics',
 'for',
 'a',
 'ps',
 'video',
 'game',
 ',',
 'well',
 'it',
 'is',
 'just',
 'amazing',
 '.']

In [45]:
# Train Word2Vec model with sentences.
# gensim 4.0 renamed the `size` keyword to `vector_size`; choose whichever the
# installed release accepts so this cell runs on either major version.
dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
# seed=args.seed wires the configured seed into training (it was previously
# unused). NOTE: per the gensim docs, fully deterministic runs additionally
# require workers=1 and a fixed PYTHONHASHSEED.
model = Word2Vec(sentences=sentences,
                 window=args.window, min_count=args.min_count,
                 sg=args.skip_gram, negative=args.negative_sampling,
                 seed=args.seed,
                 **{dim_kwarg: args.embedding_dim})
print (model)

Word2Vec(vocab=21750, size=100, alpha=0.025)


In [60]:
# Five nearest neighbours of "ok" in the learned embedding space
model.wv.most_similar(positive=["ok"], topn=5)

[('okay', 0.8580403923988342),
 ('alright', 0.8139011859893799),
 ('anyways', 0.770700216293335),
 ('horrid', 0.7692903280258179),
 ('dubbing', 0.768156886100769)]

In [None]:
# Persist the trained model to disk; the relative filename is resolved against
# the kernel's current working directory.
model.save('language.w2v.model')