# Final Project Code

We can use this notebook to keep track of all our analysis and conclusions. For now, we should just create dataframes from our processed words. Eventually, we'll import code and stuff from each individual section (i.e. Sentiment Analysis, Heatmaps, etc.)

In [2]:
# Importing Libraries
import numpy as np
import pandas as pd
import collections as c

### Words -> DataFrames

For each of our texts, we construct a DataFrame using wordstems as indexes and a column to denote the number of occurrences of a wordstem in the text.

In [28]:
# Four Noble Truths
with open("Processed_Texts/fournobletruths_words.txt", encoding='utf8') as f:
    words = f.read().splitlines()
#print(words) #this is now an array of all the wordstems
fnt_string = " ".join(words)   
fnt_count = c.Counter(words)
word_freq = dict(c.Counter(words))
fnt_df = pd.DataFrame(word_freq.values(), word_freq.keys(), ["count"]).sort_values("count", ascending=False)
fnt_df.head()

Unnamed: 0,count
right,136
mind,132
feel,115
stress,98
monk,92


In [29]:
# Lotus Sutra
with open("Processed_Texts/lotussutra_words.txt", encoding='utf8') as f:
    words = f.read().splitlines()
ls_string = " ".join(words)
ls_count = c.Counter(words)
word_freq = dict(c.Counter(words))
ls_df = pd.DataFrame(word_freq.values(), word_freq.keys(), ["count"]).sort_values("count", ascending=False)
ls_df.head()

Unnamed: 0,count
buddha,1375
dharma,557
bodhisattva,556
teach,477
great,426


In [30]:
# Tao Te Ching
with open("Processed_Texts/taoteching_words.txt", encoding='utf8') as f:
    words = f.read().splitlines()
ttc_string = " ".join(words)
ttc_count = c.Counter(words)
word_freq = dict(c.Counter(words))
ttc_df = pd.DataFrame(word_freq.values(), word_freq.keys(), ["count"]).sort_values("count", ascending=False)
ttc_df.head()

Unnamed: 0,count
tao,84
thing,64
one,52
know,48
great,46


In [31]:
# Upanishads
with open("Processed_Texts/upanishads_words.txt", encoding='utf8') as f:
    words = f.read().splitlines()

u_string = " ".join(words)
word_freq = dict(c.Counter(words))
u_df = pd.DataFrame(word_freq.values(), word_freq.keys(), ["count"]).sort_values("count", ascending=False)
u_df.head()

Unnamed: 0,count
one,110
know,100
self,85
mind,81
brahman,76


In [32]:
# Yogasutras
with open("Processed_Texts/yogasutras_words.txt", encoding='utf8') as f:
    words = f.read().splitlines()

ys_string = " ".join(words)
word_freq = dict(c.Counter(words))
ys_df = pd.DataFrame(word_freq.values(), word_freq.keys(), ["count"]).sort_values("count", ascending=False)
ys_df.head()

Unnamed: 0,count
spiritu,322
man,245
power,198
life,156
conscious,149


### Sentiment Analysis

### Heatmaps/Text Visualizations

In [20]:
# https://stackoverflow.com/questions/28819272/python-how-to-calculate-the-cosine-similarity-of-two-word-lists
# convert to word-vectors
words  = list(fnt_count.keys() | ls_count.keys())
fnt_vect = [fnt_count.get(word, 0) for word in words]       
ls_vext = [ls_count.get(word, 0) for word in words]        

# find cosine
len_a  = sum(av*av for av in fnt_vect) ** 0.5            
len_b  = sum(bv*bv for bv in ls_vext) ** 0.5             
dot    = sum(av*bv for av,bv in zip(fnt_vect, ls_vext))   
cosine = dot / (len_a * len_b)                          
cosine

0.20894492369253445

In [23]:
# convert to word-vectors
words  = list(ttc_count.keys() | ls_count.keys())
ttc_vect = [ttc_count.get(word, 0) for word in words]       
ls_vext = [ls_count.get(word, 0) for word in words]        

# find cosine
len_a  = sum(av*av for av in ttc_vect) ** 0.5            
len_b  = sum(bv*bv for bv in ls_vext) ** 0.5             
dot    = sum(av*bv for av,bv in zip(ttc_vect, ls_vext))   
cosine = dot / (len_a * len_b)                          
cosine

0.2917583276957685

In [33]:
books = [fnt_string,ttc_string, u_string, ls_string, ys_string]

In [42]:
book_names = ["ys", "fnt", "ttc", "up", "ls"]
pairs = []
for i, v in enumerate(book_names):
    for j in book_names[i+1:]:
        pairs.append((book_names[i], j))
print(pairs)

[('ys', 'fnt'), ('ys', 'ttc'), ('ys', 'up'), ('ys', 'ls'), ('fnt', 'ttc'), ('fnt', 'up'), ('fnt', 'ls'), ('ttc', 'up'), ('ttc', 'ls'), ('up', 'ls')]


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1,1))
# calculate the feature matrix
feature_matrix = vectorizer.fit_transform(books).astype(float)

In [44]:
def compute_cosine_similarity(pair):
    
    # extract the indexes from the pair
    book1, book2 = pair
    
    # split on _ and get index
    book1_index = int(book1.split("_")[1])
    book2_index = int(book2.split("_")[1])
    
    # get the feature matrix of the document
    book1_fm = feature_matrix.toarray()[book1_index]
    book2_fm = feature_matrix.toarray()[book2_index]
    
    # compute cosine similarity manually
    manual_cosine_similarity = np.dot(book1_fm, book2_fm)
    
    return manual_cosine_similarity

In [45]:
pairwise_cosine_similarity = [compute_cosine_similarity(pair) for pair in pairs]

# create a dataframe
df = pd.DataFrame({'pair': pairs, 'similarity': pairwise_cosine_similarity})
display(df.head())
display(df.tail())

IndexError: list index out of range

### Word Distances