Code for calculating cosine similarity between sentences and moving windows. Uses word2vec pretrained Google News embeddings, available from https://github.com/mmihaltz/word2vec-GoogleNews-vectors

In [1]:
from pyemd import emd
from gensim.similarities import WmdSimilarity
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
import numpy
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from scipy import spatial
import numpy as np
from pandas import DataFrame, Series, ExcelWriter
import itertools
import more_itertools
import os
import glob
from sklearn.decomposition import PCA
import pandas as pd
from nltk import pos_tag
from nltk.corpus import wordnet as wn
import pickle
# NLTK's English stop-word list; get_tokens and the analysis cells use it to
# drop stop words before vectors are computed.
stopWords = stopwords.words('english')

In [2]:
# Folder scanned for text files by the analysis cells (passed to os.listdir).
# Set this before running them: os.listdir("") raises FileNotFoundError.
path = ""

In [3]:
# Pretrained Google News word2vec file (gzipped binary format); no directory
# component, so it is resolved relative to the current working directory.
filename = 'GoogleNews-vectors-negative300.bin.gz'

In [4]:
# Load the pretrained embeddings as a gensim KeyedVectors object;
# binary=True selects the word2vec binary file format.
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [None]:
# Play around with the vectors: analogy query that ranks words close to
# 'woman' and 'king' and far from 'man', using gensim's multiplicative
# combination objective (most_similar_cosmul).
model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

In [None]:
'''
Sentence-to-sentence similarity: for every .txt file in `path`, build one
averaged vector per sentence, take the cosine similarity between each pair of
consecutive sentences, and record the mean, median and standard deviation of
those similarities for the file.
'''
# Penn Treebank POS tags kept as content words (foreign words, adjectives,
# nouns, adverbs, verbs); tokens carrying any other tag are discarded.
tags = ['FW','JJ','JJR','JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR','RBS','VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
stopWords = stopwords.words('english')

# Parallel lists: entry i of each list belongs to files_list[i].
files_list = []
mean_cosine_list = []
sd_cosine_list = []
median_cosine_list = []

for file in os.listdir(path):       # for every file in the path
    # str.endswith does no wildcard matching, so "*.txt" would never match;
    # test for the plain suffix instead.
    if not file.endswith(".txt"):
        continue
    files_list.append(file)         # save the file's name
    file_encoding = 'utf8'
    sentence_vectors = []
    cosines = []

    # os.listdir returns bare names, so join with `path` to open files that
    # are not in the current working directory. The file is only needed for
    # the read, so it is closed before the heavy processing starts.
    with open(os.path.join(path, file), encoding=file_encoding, errors='ignore') as f:
        text = f.read()             # whole file as one string

    sentences = sent_tokenize(text)
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tagged = pos_tag(tokens)
        content = [(word, tag) for (word, tag) in tagged if tag in tags]
        content2 = [(word, tag) for (word, tag) in content if word.isalpha()]  # alphabetic tokens only
        content3 = [word for (word, tag) in content2 if word not in stopWords]
        lemmas = [getLemma(word) for word in content3]

        sentence_vec = win_avg_feature_vector(lemmas, model=model, num_features=300, index2word_set=index2word_set)
        sentence_vectors.append(sentence_vec)

    # cosine similarity (1 - cosine distance) between consecutive sentences
    for v, w in zip(sentence_vectors[:-1], sentence_vectors[1:]):
        cos = 1 - spatial.distance.cosine(v, w)
        cosines.append(cos)

    # drop undefined similarities before computing the statistics
    cosines2 = [x for x in cosines if not np.isnan(x)]

    if cosines2:
        mean_cosine = np.mean(cosines2)
        sd_cosine = np.std(cosines2)
        median_cosine = np.median(cosines2)
    else:
        # No usable sentence pair: record NaN (what numpy would return for an
        # empty list) without triggering its empty-slice warnings.
        mean_cosine = sd_cosine = median_cosine = np.nan

    mean_cosine_list.append(mean_cosine)
    sd_cosine_list.append(sd_cosine)
    median_cosine_list.append(median_cosine)
        

In [None]:
# Optional check of the distribution: histogram of the NaN-filtered cosines.
# `cosines2` is reassigned for each file inside the loop, so this plots only
# the values of the last file processed.
plt.hist(cosines2)

In [23]:
'''
For files only one sentence long: split the sentence's tokens in half and
calculate the cosine similarity between the two halves. Files with two or
more sentences contribute no halves, so their statistics are recorded as NaN.
'''
# Penn Treebank POS tags kept as content words (foreign words, adjectives,
# nouns, adverbs, verbs); tokens carrying any other tag are discarded.
tags = ['FW','JJ','JJR','JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR','RBS','VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
stopWords = stopwords.words('english')

# Parallel lists: entry i of each list belongs to files_list[i].
files_list = []
mean_cosine_list = []
sd_cosine_list = []
median_cosine_list = []

for file in os.listdir(path):       # for every file in the path
    # str.endswith does no wildcard matching, so "*.txt" would never match;
    # test for the plain suffix instead.
    if not file.endswith(".txt"):
        continue
    file_encoding = 'utf8'
    files_list.append(file)
    sentence_vectors = []
    cosines = []
    new_sent = []

    # os.listdir returns bare names, so join with `path` to open files that
    # are not in the current working directory.
    with open(os.path.join(path, file), encoding=file_encoding, errors='ignore') as f:
        text = f.read()             # whole file as one string

    sentences = sent_tokenize(text)
    sentence_length = len(sentences)
    if sentence_length < 2:
        for sentence in sentences:
            # tokenize once and cut the token list at its midpoint
            words = word_tokenize(sentence)
            midpoint = len(words) // 2
            firstpart, secondpart = words[:midpoint], words[midpoint:]
            new_sent.append(' '.join(firstpart))
            new_sent.append(' '.join(secondpart))

    for sentence in new_sent:
        tokens = word_tokenize(sentence)
        tagged = pos_tag(tokens)
        content = [(word, tag) for (word, tag) in tagged if tag in tags]
        content2 = [(word, tag) for (word, tag) in content if word.isalpha()]  # alphabetic tokens only
        content3 = [word for (word, tag) in content2 if word not in stopWords]
        lemmas = [getLemma(word) for word in content3]

        sentence_vec = win_avg_feature_vector(lemmas, model=model, num_features=300, index2word_set=index2word_set)
        sentence_vectors.append(sentence_vec)

    # cosine similarity (1 - cosine distance) between the consecutive halves
    for v, w in zip(sentence_vectors[:-1], sentence_vectors[1:]):
        cos = 1 - spatial.distance.cosine(v, w)
        cosines.append(cos)

    # drop undefined similarities before computing the statistics
    cosines2 = [x for x in cosines if not np.isnan(x)]

    if cosines2:
        mean_cosine = np.mean(cosines2)
        sd_cosine = np.std(cosines2)
        median_cosine = np.median(cosines2)
    else:
        # Nothing to compare: record NaN (what numpy would return for an
        # empty list) without triggering its empty-slice warnings.
        mean_cosine = sd_cosine = median_cosine = np.nan

    mean_cosine_list.append(mean_cosine)
    sd_cosine_list.append(sd_cosine)
    median_cosine_list.append(median_cosine)
            


In [21]:
# Window-to-window similarity: for every .txt file in `path`, slide a window
# over the file's content-word lemmas, average the word vectors in each
# window, take the cosine similarity between consecutive windows, and record
# the mean, median and standard deviation of those similarities for the file.

# Penn Treebank POS tags kept as content words (foreign words, adjectives,
# nouns, adverbs, verbs); tokens carrying any other tag are discarded.
tags = ['FW','JJ','JJR','JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR','RBS','VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
stopWords = stopwords.words('english')

# Parallel lists: entry i of each list belongs to files_list[i].
files_list = []
mean_cosine_list = []
sd_cosine_list = []
median_cosine_list = []

for file in os.listdir(path):       # for every file in the path
    # str.endswith does no wildcard matching, so "*.txt" would never match;
    # test for the plain suffix instead.
    if not file.endswith(".txt"):
        continue
    files_list.append(file)
    file_encoding = 'utf8'
    window_vectors = []
    cosines = []

    # os.listdir returns bare names, so join with `path` to open files that
    # are not in the current working directory.
    with open(os.path.join(path, file), encoding=file_encoding, errors='ignore') as f:
        text = f.read()             # whole file as one string

    tokens = get_tokens(text)
    tagged = pos_tag(tokens)
    content = [(word, tag) for (word, tag) in tagged if tag in tags]
    content2 = [(word, tag) for (word, tag) in content if word.isalpha()]  # alphabetic tokens only
    content3 = [word for (word, tag) in content2 if word not in stopWords]
    lemmas = [getLemma(word) for word in content3]

    # NOTE(review): more_itertools.windowed pads a too-short window with None;
    # confirm win_avg_feature_vector tolerates None entries.
    windows = winderise(lemmas, 2, 1)  # set window and step size here

    for window in windows:
        window_vec = win_avg_feature_vector(window, model=model, num_features=300, index2word_set=index2word_set)
        window_vectors.append(window_vec)

    # cosine similarity (1 - cosine distance) between consecutive windows
    for v, w in zip(window_vectors[:-1], window_vectors[1:]):
        cos = 1 - spatial.distance.cosine(v, w)
        cosines.append(cos)

    # drop undefined similarities before computing the statistics
    cosines2 = [x for x in cosines if not np.isnan(x)]

    if cosines2:
        mean_cosine = np.mean(cosines2)
        sd_cosine = np.std(cosines2)
        median_cosine = np.median(cosines2)
    else:
        # No usable window pair: record NaN (what numpy would return for an
        # empty list) without triggering its empty-slice warnings.
        mean_cosine = sd_cosine = median_cosine = np.nan

    mean_cosine_list.append(mean_cosine)
    sd_cosine_list.append(sd_cosine)
    median_cosine_list.append(median_cosine)
            
            

In [None]:
# gensim's built-in similarity between two sets of words, but it doesn't
# handle words not in vocab. `sent1` and `sent2` are the two text strings to
# compare and must be defined before running this cell.

tokens1 = get_tokens(sent1)
tokens2 = get_tokens(sent2)
# n_similarity takes the two token lists (not vectors) and is called directly
# on the KeyedVectors object; despite the variable name, the value returned
# is a cosine similarity, not a distance.
distance = model.n_similarity(tokens1, tokens2)

In [5]:
def winderise(text, window=10, step=1):
    '''
    Lazily yield sliding windows over `text`.

    `window` is the number of items per window and `step` is how far the
    window advances each time; pass either to override the defaults.
    Windows are produced by more_itertools.windowed, which per its
    documentation yields tuples and pads a window that cannot be filled
    with None.
    '''
    yield from more_itertools.windowed(text, window, step=step)
    

In [7]:
def get_tokens(text):
    '''
    Preprocess a text string and return its tokens as a list.

    The text is lower-cased and tokenized with word_tokenize; tokens found in
    the module-level `stopWords` list and tokens that are not purely
    alphabetic are dropped. NOTE: tokens are not lemmatized here.
    '''
    lowered = text.lower()
    return [
        token
        for token in word_tokenize(lowered)
        if token not in stopWords and token.isalpha()
    ]

In [8]:
def window_vector_sums(text, window=10, step=1):
    '''
    Return a list holding the summed word vector of each sliding window.

    text   -- the sequence to slide over (presumably a list of tokens, e.g.
              the output of get_tokens)
    window -- number of items per window (default 10, the previously
              hard-coded size)
    step   -- how far the window advances each time (default 1)

    Calls winderise to produce the windows, get_vectors to look up the
    vector of each word in a window, and vector_sum to add them together.
    '''
    window_sums = []
    # A distinct loop name keeps the `text` argument from being overwritten.
    for window_words in winderise(text, window, step):
        window_vectors = get_vectors(window_words)       # one vector per in-vocab word
        window_sums.append(vector_sum(window_vectors))   # summed vector for this window

    return window_sums

In [9]:
def get_cosine(summed_vectors):
    '''
    Return the cosine similarity between every pair of neighbouring vectors.

    Element i of the result is 1 minus the cosine distance between
    summed_vectors[i] and summed_vectors[i + 1], so the result has one
    entry fewer than the input (and is empty for zero or one vector).
    '''
    neighbours = zip(summed_vectors[:-1], summed_vectors[1:])
    return [
        1 - spatial.distance.cosine(previous, following)
        for previous, following in neighbours
    ]

In [11]:
def get_vectors(text):
    '''
    Return a list with the vector of every word in `text` that the
    module-level `model` knows; words not in the vocabulary are skipped.

    Membership and lookup go through the KeyedVectors object itself
    (`word in model`, `model[word]`) rather than `model.wv.vocab`: `.wv` on a
    KeyedVectors is deprecated in gensim 3.x and `.vocab` was removed in
    gensim 4, while the container protocol works in both.
    '''
    return [model[word] for word in text if word in model]

In [12]:
def vector_sum(vectors):
    '''
    Given a list of equal-length vectors (e.g. for a window or sentence),
    return their element-wise sum as a list.

    An empty input returns an empty list instead of raising IndexError,
    since there is no first vector to take the dimension from.
    '''
    # len() rather than truthiness so a numpy array of vectors is accepted too
    if len(vectors) == 0:
        return []

    # Start from an integer zero vector of the right length and let numpy's
    # type promotion pick the result dtype as each vector is added.
    total = np.zeros(len(vectors[0]), dtype=int)
    for vector in vectors:
        total = total + np.asarray(vector)

    return list(total)

In [16]:
def getLemma(word):
    '''
    Return the WordNet base form of `word` as found by wn.morphy, or `word`
    itself unchanged when morphy returns None.
    '''
    base_form = wn.morphy(word)
    return word if base_form is None else base_form