# Local Homology NLP Use Cases: Unsupervised text disambiguation

In this tutorial, we apply local homology to study natural language processing data. 

In [None]:
from gtda.local_homology.simplicial import *
import numpy as np

Recent research has been looking at the role of local homology in Natural Language Processing, and particularly at the task of text disambiguation. Here we showcase a method that can be useful to distinguish occurrences of the word "note" when it refers to a musical "note" from occurrences where it is used as a word referring to text.

In [None]:
# Import needed libraries
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from gensim.parsing.preprocessing import remove_stopwords, stem

In [None]:
# Preprocess the data
# Read the raw corpus; the context manager closes the file handle once the
# content has been loaded.
with open("data/note.n.xml","r") as f:
    content = f.read()

# Every occurrence of the "<note.n." marker starts a new chunk of text.
temp_list = content.split("<note.n.")

# Remove stopwords first, then stem the *filtered* text. Stemming `temp_list`
# here instead would silently discard the stopword removal done just above.
list_of_text = list(map(remove_stopwords, temp_list)) # remove stopwords
list_of_text = list(map(stem, list_of_text)) # stem (gensim's stem also lower-cases)

# Trim a prefix and a suffix from each chunk; both offsets grow with the number
# of digits of the chunk index i. Chunks 0 and 1 are skipped.
# NOTE(review): presumably this strips the residue of the opening/closing
# "note.n.<i>" tags -- confirm against the layout of the data file.
refined_list = [list_of_text[i][1+len(str(i)):-12 - len(str(i))] for i in range(2,len(list_of_text))]

In [None]:
# In order to interpret the persistence diagram, we introduce some helper functions:
from gtda.diagrams.features import PersistenceEntropy

def modified_persistence_entropy(diags):
    """Vectorize persistence diagrams as ``2 ** persistence entropy``.

    A custom vectorizer in the spirit of the ones found in
    ``gtda.diagrams.features``.

    Parameters
    ----------
    diags
        Sequence of persistence diagrams, handed unchanged to
        ``PersistenceEntropy().fit_transform``.

    Returns
    -------
    The output of ``PersistenceEntropy().fit_transform(diags)`` with 2 raised
    to the power of every entry, i.e. one vector per input diagram.
    """
    entropy_vectorizer = PersistenceEntropy()
    entropies = entropy_vectorizer.fit_transform(diags)
    return 2 ** entropies

In [None]:
# Split every preprocessed snippet on whitespace and train Word2Vec on the tokens
all_words_in_sentences = [sentence.split() for sentence in refined_list]
word2vec = Word2Vec(
    sentences=all_words_in_sentences,
    vector_size=30,
    window=5,
    min_count=1,
    workers=4,
)

# One array per snippet, holding the embedding of each of its words in order
list_of_vect_sentences = [word2vec.wv[words] for words in all_words_in_sentences]

# Set up the local homology transformer that is fitted on each snippet below
lh = KNeighborsLocalVietorisRips(
    n_neighbors=(5, 15),
    homology_dimensions=(1, 2),
    collapse_edges=True,
    n_jobs=-1,
)

In [None]:
# example of a preprocessed sentence where "note" is used as a verb
print(refined_list[0])

# Fit the local homology transformer on the word vectors of this sentence ...
lh.fit(list_of_vect_sentences[0])

# ... and query it at the embedding of "note" (a single-row float array)
note_query = np.array([word2vec.wv["note"]], dtype=float)
note_diagrams = lh.transform(note_query)
modified_persistence_entropy(note_diagrams)

In [None]:
# example of preprocessed sentence where "note" refers to music
print(refined_list[1])

# Fit the local homology transformer on the word vectors of this sentence ...
lh.fit(list_of_vect_sentences[1])

# ... and query it at the embedding of "note" (a single-row float array)
note_query = np.array([word2vec.wv["note"]], dtype=float)
note_diagrams = lh.transform(note_query)
modified_persistence_entropy(note_diagrams)

In [None]:
# Imports that will help us visualize the data
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
import umap

In [None]:
# Note as a verb

i = 0
sentence_vectors = list_of_vect_sentences[i]
note_vector = word2vec.wv["note"]

# Locate the rows of the sentence that equal the embedding of "note".
# The last match is kept, which is what the original row-by-row loop produced.
matches = np.flatnonzero((sentence_vectors == note_vector).all(axis=1))
value = int(matches[-1]) if matches.size else None

# Colour mask for the scatter plot: 1 for the "note" point, 0 elsewhere.
# The guard matters: with value=None, `temp[None] = 1` would broadcast and
# mark every point instead of none.
temp = np.zeros(len(sentence_vectors))
if value is not None:
    temp[value] = 1
print("Note is at the " + str(value) + "th position.")

# Standardize the word vectors, then project them to 2D with UMAP for plotting
reducer = umap.UMAP()
scaled_point_cloud = StandardScaler().fit_transform(sentence_vectors)
embedding = reducer.fit_transform(scaled_point_cloud)

plt.scatter(embedding[:, 0], embedding[:, 1], c=temp)
plt.gca().set_aspect('equal', 'datalim')
plt.title('Use of "note" as a verb', fontsize=24)

# Example of sentence has "note" in it used as the verb
print("Preprocessed sentence: ")
print(refined_list[i])
lh.fit(sentence_vectors)

# NOTE(review): the printed values are 2 ** persistence entropy of the local
# diagrams at "note", not exact Betti numbers -- confirm the label is intended.
print("First and second Betti numbers:")
print(modified_persistence_entropy(lh.transform(np.array([note_vector], dtype=float))))


In [None]:
# Musical note
i = 1
sentence_vectors = list_of_vect_sentences[i]
note_vector = word2vec.wv["note"]

# Locate the rows of the sentence that equal the embedding of "note".
# The last match is kept, which is what the original row-by-row loop produced.
matches = np.flatnonzero((sentence_vectors == note_vector).all(axis=1))
value = int(matches[-1]) if matches.size else None

# Colour mask for the scatter plot: 1 for the "note" point, 0 elsewhere.
# The guard matters: with value=None, `temp[None] = 1` would broadcast and
# mark every point instead of none.
temp = np.zeros(len(sentence_vectors))
if value is not None:
    temp[value] = 1

print("Note is at the " + str(value) + "th position.")

# Standardize the word vectors, then project them to 2D with UMAP for plotting
reducer = umap.UMAP()
scaled_point_cloud = StandardScaler().fit_transform(sentence_vectors)
embedding = reducer.fit_transform(scaled_point_cloud)

plt.scatter(embedding[:, 0], embedding[:, 1], c=temp)
plt.gca().set_aspect('equal', 'datalim')
plt.title('Use of "note" referring to music', fontsize=24)

# This one uses the musical note
print("Preprocessed sentence: ")
print(refined_list[i])
lh.fit(sentence_vectors)

# NOTE(review): the printed values are 2 ** persistence entropy of the local
# diagrams at "note", not exact Betti numbers -- confirm the label is intended.
print("First and second Betti numbers:")
print(modified_persistence_entropy(lh.transform(np.array([note_vector], dtype=float))))
