
[Word2Vec and WMD Tutorial](https://markroxor.github.io/gensim/static/notebooks/WMD_tutorial.html) 

[Finding Similar Quora Questions with Word2Vec and Xgboost](https://towardsdatascience.com/finding-similar-quora-questions-with-word2vec-and-xgboost-1a19ad272c0d)

[aparrish/understanding-word-vectors.ipynb](https://gist.github.com/aparrish/2f562e3737544cf29aaf1af30362f469)

In [1]:
# Datasets

DATA_FILE = '../data/emnlp_news.txt'
TEST_FILE = '../data/test_emnlp.txt'
GENERATED_FILE = '../data/generated_text2.txt'

In [2]:
# Import and download stopwords from NLTK.
import nltk
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Download stopwords list.

[nltk_data] Downloading package stopwords to /Users/dc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def get_sentences(filepath: str) -> list:
    """ Return sentences given a text file.
        The sentences will be tokenized in this function.
    """
    with open(filepath, mode='r', encoding="ISO-8859-1") as f:
        data = f.read()
    sentences = nltk.sent_tokenize(data)
    return sentences



def preprocess(sentences: list) -> list:
    """ Take a list of sentences and return a list of list of words
    where all the words are alphabetic and not a stop word
    """
    stop_words = stopwords.words("english")

    preprocessed = list()
    for sentence in sentences:
        words = sentence.split()
        processed_words = [w for w in words if w.isalpha() and (not w in stop_words)]
        preprocessed.append(processed_words)
    return preprocessed

preprocess(get_sentences(GENERATED_FILE)[:2])

[['racism',
  'enjoy',
  'players',
  'war',
  'price',
  'whose',
  'wife',
  'taken',
  'public',
  'office',
  'cleveland'],
 ['time',
  'followed',
  'years',
  'dialogue',
  'year',
  'possibly',
  'easy',
  'candidate',
  'granted',
  'level',
  'online']]

In [4]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

## Cleaning:
We are lemmatizing and removing the stopwords and non-alphabetic characters for each line of dialogue.

In [5]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [6]:
def brief_cleaning(sentences):
    return (re.sub("[^A-Za-z']+", ' ', str(row)) for row in sentences)

In [46]:
t = time()

GENERATED = get_sentences(GENERATED_FILE)

txt = preprocess(GENERATED)

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.01 mins


In [47]:
print(txt)



In [48]:

sentences = txt

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

print(word_freq)



In [49]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['said',
 'people',
 'one',
 'time',
 'year',
 'would',
 'know',
 'get',
 'trump',
 'think']

In [43]:
import os
import gensim

# Using pre-trained word2vec Google News corpus (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit)
if not os.path.exists('../data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz'):
    raise ValueError("SKIP: You need to download the google news model")
    
preloaded_model = gensim.models.KeyedVectors.load_word2vec_format('../data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz', binary=True)

preloaded_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.


INFO - 10:24:02: loading projection weights from ../data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz
INFO - 10:26:03: loaded (3000000, 300) matrix from ../data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz
INFO - 10:26:03: precomputing L2-norms of word weight vectors


# Training the model
## Gensim Word2Vec Implementation:
We use Gensim implementation of word2vec: https://radimrehurek.com/gensim/models/word2vec.html

In [50]:
import multiprocessing

from gensim.models import Word2Vec

In [51]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(cores)

4


In [56]:
# Plot histogram detailing length of sentences in a corpus

from matplotlib import pyplot as plt
%matplotlib inline

def plot_hist(corpus):
    # Document lengths.
    lens = [len(sentence.split()) for sentence in corpus] # find num of words in a sentence

    # Plot.
#     plt.rc('figure', figsize=(8,6))
#     plt.rc('font', size=14)
#     plt.rc('lines', linewidth=2)
#     plt.rc('axes', prop_cycle=('#377eb8','#e41a1c','#4daf4a',
#                                 '#984ea3','#ff7f00','#ffff33'))
    # Histogram.
    plt.hist(lens, bins=50)
    # Average length.
    avg_len = sum(lens) / float(len(lens))
#     plt.axvline(avg_len, color='#e41a1c')
    plt.title('Histogram of document lengths.')
    plt.xlabel('Length')
    plt.text(100, 800, 'mean = %.2f' % avg_len)
    plt.show()

In [52]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

## Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [53]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 10:35:47: collecting all words and their counts
INFO - 10:35:47: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 10:35:47: PROGRESS: at sentence #10000, processed 107862 words, keeping 4743 word types
INFO - 10:35:47: collected 4764 word types from a corpus of 118994 raw words and 11055 sentences
INFO - 10:35:47: Loading a fresh vocabulary
INFO - 10:35:47: min_count=20 retains 1361 unique words (28% of original 4764, drops 3403)
INFO - 10:35:47: min_count=20 leaves 93664 word corpus (78% of original 118994, drops 25330)
INFO - 10:35:47: deleting the raw counts dictionary of 4764 items
INFO - 10:35:47: sample=6e-05 downsamples 1361 most-common words
INFO - 10:35:47: downsampling leaves estimated 31740 word corpus (33.9% of prior 93664)
INFO - 10:35:47: estimated required memory for 1361 words and 300 dimensions: 3946900 bytes
INFO - 10:35:47: resetting layer weights


Time to build vocab: 0.0 mins


## Training of the model:
_Parameters of the training:_
* `total_examples` <font color='purple'>=</font> <font color='green'>int</font> - Count of sentences;
* `epochs` <font color='purple'>=</font> <font color='green'>int</font> - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [54]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 10:36:33: training model with 3 workers on 1361 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 10:36:34: worker thread finished; awaiting finish of 2 more threads
INFO - 10:36:34: worker thread finished; awaiting finish of 1 more threads
INFO - 10:36:34: worker thread finished; awaiting finish of 0 more threads
INFO - 10:36:34: EPOCH - 1 : training on 118994 raw words (31499 effective words) took 0.3s, 123492 effective words/s
INFO - 10:36:34: worker thread finished; awaiting finish of 2 more threads
INFO - 10:36:34: worker thread finished; awaiting finish of 1 more threads
INFO - 10:36:34: worker thread finished; awaiting finish of 0 more threads
INFO - 10:36:34: EPOCH - 2 : training on 118994 raw words (31473 effective words) took 0.1s, 329385 effective words/s
INFO - 10:36:34: worker thread finished; awaiting finish of 2 more threads
INFO - 10:36:34: worker thread finished; awaiting finish of 1 more threads
INFO - 10:36:34: worker thread

INFO - 10:36:36: EPOCH - 24 : training on 118994 raw words (31730 effective words) took 0.1s, 282881 effective words/s
INFO - 10:36:36: worker thread finished; awaiting finish of 2 more threads
INFO - 10:36:36: worker thread finished; awaiting finish of 1 more threads
INFO - 10:36:36: worker thread finished; awaiting finish of 0 more threads
INFO - 10:36:36: EPOCH - 25 : training on 118994 raw words (32006 effective words) took 0.1s, 264975 effective words/s
INFO - 10:36:36: worker thread finished; awaiting finish of 2 more threads
INFO - 10:36:36: worker thread finished; awaiting finish of 1 more threads
INFO - 10:36:36: worker thread finished; awaiting finish of 0 more threads
INFO - 10:36:36: EPOCH - 26 : training on 118994 raw words (31638 effective words) took 0.1s, 317244 effective words/s
INFO - 10:36:36: worker thread finished; awaiting finish of 2 more threads
INFO - 10:36:36: worker thread finished; awaiting finish of 1 more threads
INFO - 10:36:36: worker thread finished; aw

Time to train the model: 0.06 mins


As we do not plan to train the model any further, we are calling init_sims(), which will make the model much more memory-efficient:

In [55]:
w2v_model.init_sims(replace=True)

INFO - 10:37:22: precomputing L2-norms of word weight vectors


In [44]:
print(model.wmdistance("This is not good what is going on in this".split(), "I don't like you".split()))

#wmdistance requires the sentences to be split to lists

1.159184576442361


Word2Vec(vocab=59, size=100, alpha=0.025)


In [20]:
from gensim.test.utils import common_texts
print(common_texts[:10])

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


### Get similar sentences in the generated text set

In [59]:
# DON'T RUN AGAIN

from gensim.similarities import WmdSimilarity

def process_corpus(corpus):
    return preprocess(corpus)

processed_corpus = sentences
print(processed_corpus[:10])

t = time()


instance = WmdSimilarity(processed_corpus, preloaded_model, num_best=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))


INFO - 11:09:19: precomputing L2-norms of word weight vectors


[['racism', 'enjoy', 'players', 'war', 'price', 'whose', 'wife', 'taken', 'public', 'office', 'cleveland'], ['time', 'followed', 'years', 'dialogue', 'year', 'possibly', 'easy', 'candidate', 'granted', 'level', 'online'], ['star', 'shares', 'suggests', 'increases', 'mr'], ['ryan', 'syrian', 'civil', 'war', 'personally', 'likely', 'bond', 'americans', 'suffering', 'scotland', 'entered', 'investment', 'would', 'give', 'rangers', 'order', 'uk', 'say', 'united', 'states', 'hard', 'mail', 'afghan', 'politics', 'hit', 'media', 'based', 'asking', 'people', 'able', 'follow', 'second', 'beloved', 'team', 'choose', 'middle', 'risk', 'program', 'much', 'others', 'added'], ['lead', 'hope', 'sure', 'explore', 'ball', 'comes', 'poverty', 'matter', 'use'], ['russians', 'students', 'help', 'like', 'terror', 'campaign', 'steady', 'chance', 'case'], ['studying', 'manchester', 'city', 'know', 'certain', 'people', 'rarely', 'believe', 'mind'], ['bbc', 'would', 'forever', 'continue', 'chance', 'cut', 'vict

In [59]:
from time import time
start = time()

# documents = get_sentences(GENERATED_FILE)[:10]


sent = 'Very good, you should seat outdoor.'
query = sent.split()

sims = instance[query]  # A query is simply a "look-up" in the similarity class.

print ('Cell took %.2f seconds to run.' %(time() - start))

Cell took 0.01 seconds to run.


In [85]:
# Print the query and the retrieved documents, together with their similarities.

from tqdm import tqdm

print(len(processed_corpus))
dists = list()

for sent in tqdm(processed_corpus[:4]):
    print(sent)
    sims = instance[sent]
    print(sims)
    
    sim = sims[0][1]
    dist = 1-sim
    
    dists.append(dist)
#     print ('='*20)
#     print ('Query:')
#     print (sent)
#     for i in range(1):
#         print ()
#         print ('sim = %.4f' % sims[i][1])
#         print (documents[sims[i][0]])

print(dists)


  0%|          | 0/4 [00:00<?, ?it/s]

11055
['racism', 'enjoy', 'players', 'war', 'price', 'whose', 'wife', 'taken', 'public', 'office', 'cleveland']


 25%|██▌       | 1/4 [00:22<01:07, 22.44s/it]

[(0, 1.0)]
['time', 'followed', 'years', 'dialogue', 'year', 'possibly', 'easy', 'candidate', 'granted', 'level', 'online']


 50%|█████     | 2/4 [00:44<00:44, 22.31s/it]

[(1, 1.0)]
['star', 'shares', 'suggests', 'increases', 'mr']


 75%|███████▌  | 3/4 [00:56<00:19, 19.14s/it]

[(2, 1.0)]
['ryan', 'syrian', 'civil', 'war', 'personally', 'likely', 'bond', 'americans', 'suffering', 'scotland', 'entered', 'investment', 'would', 'give', 'rangers', 'order', 'uk', 'say', 'united', 'states', 'hard', 'mail', 'afghan', 'politics', 'hit', 'media', 'based', 'asking', 'people', 'able', 'follow', 'second', 'beloved', 'team', 'choose', 'middle', 'risk', 'program', 'much', 'others', 'added']


100%|██████████| 4/4 [02:22<00:00, 39.31s/it]

[(3, 1.0)]
[0.0, 0.0, 0.0, 0.0]





In [89]:
from tqdm import tqdm

print(len(processed_corpus))
dists = list()
import numpy as np

def diversity(sentence, tokenized_sentences) -> float:
    """ Calculate the diversity of sentence compared with a given corpus/document.
    """
    # sentences = nltk.sent_tokenize(document)

    min_edit_distance = np.inf
    for ref_sentence in tokenized_sentences:
        if sentence != ref_sentence:
            edit_distance = model.wmdistance(sentence, ref_sentence)

            if edit_distance < min_edit_distance:
                min_edit_distance = edit_distance
                # maximum similarity is minimum edit distance
                max_sim = min_edit_distance 

    return 1 - max_sim
    
divs = list()
for sent in tqdm(processed_corpus[:4]):
    div = diversity(sent, processed_corpus)
    divs.append(div)

  0%|          | 0/4 [00:00<?, ?it/s]

11055


100%|██████████| 4/4 [02:21<00:00, 38.81s/it]


In [90]:
print(divs)

[-0.003816038240816777, 0.005471795418556469, -0.014274552001953156, -0.06668842146369025]
