Code to calculate the average Word Mover's Distance (WMD) between adjacent sentences in a text file. Uses the pretrained word2vec Google News embeddings, available from https://github.com/mmihaltz/word2vec-GoogleNews-vectors

In [1]:
from pyemd import emd
from gensim.similarities import WmdSimilarity
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from scipy import spatial
import numpy as np
from pandas import DataFrame, Series, ExcelWriter
import more_itertools
import os
import glob
from nltk.corpus import wordnet as wn
from nltk import sent_tokenize
from nltk import pos_tag
import pandas as pd
# English stop-word list from NLTK; the cells below use it to drop stop words
# from sentences before distances are computed.
stopWords = stopwords.words('english')

In [2]:
# Directory scanned by the batch cells below (it is passed to os.listdir).
# Empty placeholder: set it to the folder holding the text files before running them.
path = ""

In [3]:
# File name of the pretrained Google News word2vec vectors; it is handed to
# KeyedVectors.load_word2vec_format in the next cell.
filename = 'GoogleNews-vectors-negative300.bin.gz'

In [4]:
# Load the pretrained embeddings; binary=True tells the loader the file is in
# word2vec's binary format. `model` is used by every distance computation below.
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [None]:
# Play around with the vectors: an analogy query that treats 'woman' and 'king'
# as positive terms and 'man' as a negative term.

model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

In [67]:
# First text for the pairwise distance computed below (empty placeholder).
text1 = ''''''

In [68]:
# Second text for the pairwise distance computed below (empty placeholder).
text2 = ''''''

In [69]:
# Calculate the distance between two texts using the WMD algorithm.
# wmdistance expects each document as a list of tokens; a raw string would be
# iterated character by character, so both texts are tokenized first with the
# notebook's get_tokens helper (its cell must have been run before this one).
distance = model.wmdistance(get_tokens(text1), get_tokens(text2))
print('distance = %.3f' % distance)

In [None]:
# Sentence-to-sentence Word Mover's Distance (WMD) for every .txt file in `path`.
# Each sentence is reduced to its lemmatized content words (selected by POS tag,
# alphabetic only, stop words removed), then the WMD between each pair of
# adjacent sentences is computed. If a sentence has no words in the model
# vocabulary, wmdistance returns infinity; those pairs are dropped before the
# per-file mean, standard deviation and median are taken.
#
# Results are collected in parallel lists indexed like files_list:
# mean_wmd_list, sd_wmd_list and median_wmd_list.

tags = ['FW','JJ','JJR','JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR','RBS','VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
stopWords = stopwords.words('english')

files_list = []
mean_wmd_list = []
sd_wmd_list = []
median_wmd_list = []

for file in os.listdir(path):       # for every file in the path
    # str.endswith does no glob expansion, so test for the plain ".txt" suffix.
    if not file.endswith(".txt"):
        continue
    files_list.append(file)         # save the file's name in files_list
    file_encoding = 'utf8'
    # os.listdir returns bare names; join with path so the file opens even
    # when path is not the current working directory.
    with open(os.path.join(path, file), encoding=file_encoding, errors='ignore') as f:
        text = f.read()             # read the whole file at once

    sentences = sent_tokenize(text)
    new_sent = []
    wmds = []
    for sentence in sentences:
        tagged = pos_tag(word_tokenize(sentence))
        # keep content-word tags, alphabetic strings only, and no stop words
        content3 = [
            word for (word, tag) in tagged
            if tag in tags and word.isalpha() and word not in stopWords
        ]
        lemmas = [getLemma(word) for word in content3]
        new_sent.append(' '.join(lemmas))

    for v, w in zip(new_sent[:-1], new_sent[1:]):
        # wmdistance expects token lists; passing the joined strings would
        # compare the sentences character by character.
        distance = model.wmdistance(v.split(), w.split())
        wmds.append(distance)

    # logical (not bitwise) negation to discard the infinite distances
    wmds2 = [x for x in wmds if not np.isinf(x)]

    if wmds2:
        mean_wmd = np.mean(wmds2)
        sd_wmd = np.std(wmds2)
        median_wmd = np.median(wmds2)
    else:
        # No usable sentence pair (e.g. a one-sentence file): record NaN
        # explicitly rather than relying on numpy's empty-input warning path.
        mean_wmd = sd_wmd = median_wmd = np.nan

    mean_wmd_list.append(mean_wmd)
    sd_wmd_list.append(sd_wmd)
    median_wmd_list.append(median_wmd)

In [None]:
# Check the distribution of the finite adjacent-sentence distances.
# wmds2 is reassigned for every file inside the batch loops, so this plots the
# distances of whichever file was processed last.
plt.hist(wmds2)

In [None]:
# For files with fewer than two sentences: split the sentence's tokens in half
# and compute the Word Mover's Distance (WMD) between the two halves. Each half
# is reduced to its lemmatized content words (selected by POS tag, alphabetic
# only, stop words removed) before the distance is computed.
#
# Every .txt file gets an entry in files_list and in the three result lists so
# they stay index-aligned; files with two or more sentences produce no pairs
# here and are recorded as NaN.

tags = ['FW','JJ','JJR','JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR','RBS','VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
stopWords = stopwords.words('english')

files_list = []
mean_wmd_list = []
sd_wmd_list = []
median_wmd_list = []

for file in os.listdir(path):       # for every file in the path
    # str.endswith does no glob expansion, so test for the plain ".txt" suffix.
    if not file.endswith(".txt"):
        continue
    files_list.append(file)
    file_encoding = 'utf8'
    new_sent = []
    new_sent2 = []
    wmds = []
    # os.listdir returns bare names; join with path so the file opens even
    # when path is not the current working directory.
    with open(os.path.join(path, file), encoding=file_encoding, errors='ignore') as f:
        text = f.read()             # read the whole file at once

    sentences = sent_tokenize(text)
    sentence_length = len(sentences)
    if sentence_length < 2:
        for sentence in sentences:
            # tokenize once, then cut the token list at its midpoint
            sentence_tokens = word_tokenize(sentence)
            middle = len(sentence_tokens) // 2
            new_sent.append(' '.join(sentence_tokens[:middle]))
            new_sent.append(' '.join(sentence_tokens[middle:]))

    for sentence in new_sent:
        tagged = pos_tag(word_tokenize(sentence))
        # keep content-word tags, alphabetic strings only, and no stop words
        content3 = [
            word for (word, tag) in tagged
            if tag in tags and word.isalpha() and word not in stopWords
        ]
        lemmas = [getLemma(word) for word in content3]
        new_sent2.append(' '.join(lemmas))

    for v, w in zip(new_sent2[:-1], new_sent2[1:]):
        # wmdistance expects token lists; passing the joined strings would
        # compare the halves character by character.
        distance = model.wmdistance(v.split(), w.split())
        wmds.append(distance)

    # logical (not bitwise) negation to discard the infinite distances
    wmds2 = [x for x in wmds if not np.isinf(x)]

    if wmds2:
        mean_wmd = np.mean(wmds2)
        sd_wmd = np.std(wmds2)
        median_wmd = np.median(wmds2)
    else:
        # Nothing to average (multi-sentence file, empty file, or a half with
        # no in-vocabulary words): record NaN explicitly.
        mean_wmd = sd_wmd = median_wmd = np.nan

    mean_wmd_list.append(mean_wmd)
    sd_wmd_list.append(sd_wmd)
    median_wmd_list.append(median_wmd)

In [6]:
def get_tokens(text):
    '''
    lower-cases the input string, tokenizes it with word_tokenize and returns
    the list of tokens that are not stop words and are purely alphabetic
    '''
    candidates = word_tokenize(text.lower())
    # lemmatization with getLemma is not applied to the returned tokens
    return [tok for tok in candidates if tok not in stopWords and tok.isalpha()]

In [7]:
def getLemma(word):
    '''
    returns the result of wn.morphy for word, or word itself unchanged
    when morphy returns None
    '''
    base_form = wn.morphy(word)
    return word if base_form is None else base_form