In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
# import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
# note I am using nltk to tokenize because current version of spacy requires too much memory locally
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim
import string
from gensim.models import word2vec

In [2]:
# Utility function to clean text, plus the patterns it relies on.

# Anything enclosed in square brackets (non-greedy, single line).
_BRACKETED_RE = re.compile(r'\[.*?\]')

# Chapter headings written as "Chapter 1", "chapter 1", "CHAPTER 1" or, with
# Roman numerals, "CHAPTER I".  Roman numerals are only accepted after an
# all-caps CHAPTER so ordinary prose such as "this chapter I liked" is kept.
_CHAPTER_RE = re.compile(
    r'\b(?:[Cc]hapter|CHAPTER)\s+\d+\b|\bCHAPTER\s+[IVXLCDM]+\b'
)

# Every character of string.punctuation except the full stop.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation.replace('.', '')) + ']')


def text_cleaner(text):
    """Return `text` with headings and most punctuation stripped.

    Steps, in order:
      1. Replace each double dash '--' with a space so the words on either
         side stay separate once hyphens are removed.
      2. Remove anything enclosed in square brackets.
      3. Remove chapter headings (Arabic or upper-case Roman numbering).
      4. Remove all punctuation except '.'.  Note that '?' and '!' are
         removed too, so only full stops remain as sentence terminators.
      5. Collapse every run of whitespace into a single space and trim the
         ends.  This runs last so that gaps left behind by removed
         punctuation do not survive as double spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    text = text.replace('--', ' ')
    text = _BRACKETED_RE.sub('', text)
    text = _CHAPTER_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)
    return ' '.join(text.split())

# Read the raw text of the three Austen novels from the Project Gutenberg
# corpus and glue them together, in this order, into one long string.
austen = ''.join(
    gutenberg.raw('austen-' + title + '.txt')
    for title in ['persuasion', 'emma', 'sense']
)

# Run the combined text through the cleaner defined above.
austen_clean = text_cleaner(austen)

In [3]:
# Split the cleaned text into sentences with NLTK's sent_tokenize.
# text_cleaner has already removed every punctuation mark except '.', so
# '?' and '!' are no longer present to mark sentence boundaries here.
austen_doc = sent_tokenize(austen_clean)

In [4]:
# Tokenize every sentence into words; the result is a list of token lists,
# which is the input fed to Word2Vec below.
sentences = list(map(word_tokenize, austen_doc))

In [5]:
# Train the baseline word2vec model on the tokenized sentences.
model = word2vec.Word2Vec(
    sentences,
    # Architecture.
    sg=0,          # 0 selects CBOW, chosen because the corpus is small.
    hs=1,          # Train with hierarchical softmax.
    size=300,      # Dimensionality of each word vector.
    # Context and vocabulary.
    window=6,      # How many neighbouring words count as context.
    min_count=10,  # Words seen fewer times than this are dropped.
    sample=1e-3,   # Down-weight very frequent words.
    # Execution.
    workers=4,     # Threads used for training, where the machine allows it.
)

print('done!')

done!


In [6]:
# Shorthand for the trained word vectors.
word_vectors = model.wv

# Words that made it into the model's vocabulary.
vocab = word_vectors.vocab.keys()

# Analogy-style query: words closest to 'lady' + 'man' - 'woman'.
print(word_vectors.most_similar(negative=['woman'], positive=['lady', 'man']))

# Cosine similarity between two words: 1 means the vectors point the same
# way, 0 means they are unrelated (negative values are also possible).
print(word_vectors.similarity('loud', 'aloud'))
# Left disabled by the author; presumably lowercase 'mr'/'mrs' are missing
# from the vocabulary because the text is never lower-cased -- TODO confirm.
#print(model.wv.similarity('mr', 'mrs'))

# Odd one out: which word sits furthest from the rest of the group?
print(word_vectors.doesnt_match("breakfast marriage dinner lunch".split()))

[('ladies', 0.7482085227966309), ('men', 0.6697049736976624), ('gentleman', 0.6585294008255005), ('blessing', 0.638296902179718), ('connection', 0.6382190585136414), ('fine', 0.6329874992370605), ('difference', 0.6245848536491394), ('person', 0.6143720746040344), ('navy', 0.6062247157096863), ('women', 0.6037020683288574)]
0.295157545441
marriage


In [7]:
# Drill 0
# Play with hyper-parameters: retrain with a wider context window (8 instead
# of 6) and skip-gram instead of CBOW (sg=1 instead of sg=0).
# This rebinds `model`, so the earlier model is no longer reachable by name.
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=1,          # Use skip-gram (sg=1); sg=0 would select CBOW.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [8]:
# Drill 0, continued: repeat the same queries against the retrained model.

# Shorthand for the trained word vectors.
word_vectors = model.wv

# Words that made it into the model's vocabulary.
vocab = word_vectors.vocab.keys()

# Analogy-style query: words closest to 'lady' + 'man' - 'woman'.
print(word_vectors.most_similar(negative=['woman'], positive=['lady', 'man']))

# Cosine similarity between two words: 1 means the vectors point the same
# way, 0 means they are unrelated (negative values are also possible).
print(word_vectors.similarity('loud', 'aloud'))
# Left disabled by the author; presumably lowercase 'mr'/'mrs' are missing
# from the vocabulary because the text is never lower-cased -- TODO confirm.
# print(model.wv.similarity('mr', 'mrs'))

# Odd one out: which word sits furthest from the rest of the group?
print(word_vectors.doesnt_match("breakfast marriage dinner lunch".split()))

[('bride', 0.4652150869369507), ('joke', 0.46002674102783203), ('ladies', 0.4542887806892395), ('stupid', 0.4456436038017273), ('gentleman', 0.44243288040161133), ('gentlemanlike', 0.442058801651001), ('respectable', 0.4414021372795105), ('gallant', 0.43901610374450684), ('unaffected', 0.43808484077453613), ('bye', 0.43779513239860535)]
0.480970324093
marriage


In [9]:
# Drill 1 done via interactive web app due to local memory considerations:
# https://rare-technologies.com/word2vec-tutorial/#bonus_app