In [59]:
from sentence_transformers import SentenceTransformer, util
import os
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import re
import uuid

In [60]:
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    if "stop" in sentences[:-1]:
        sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [61]:
model = SentenceTransformer('all-MiniLM-L6-v2')

### Construct lookup table of 
* document_uuid to emotion
* Document_uuid to list of sentences
* Document_uuid tot list of sentence embeddings
* Document_uuid to whole and average embed
* sentenceblock_uuid to list of corresponding sentences
* sentence_block to whole and average embed


In [72]:
# hyperparameters:
window_size = 4
connection_threshold = 1.1 # we should construct some form of metric more rigorously here...
relaxation_threshold = 1.1
num_docs_to_consider_in_score = 5

In [73]:
def moving_average(a, n=3):
    if n > len(a):
        return np.mean(a, axis=0)
        
    ret = np.cumsum(a, dtype=float, axis=0)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

In [74]:
def moving_window_embedding(sentence_list, n=3):
    if n >= len(sentence_list):
        return model.encode(" ".join(sentence_list))
    
    embeddings = []
    for i in range(len(sentence_list)//n + 1):
        sentence_block = " ".join(sentence_list[i:i+n])
        embeddings.append(model.encode(sentence_block))
        
    return embeddings
        

In [99]:
doc_to_emotion = {} # docid: emotion
doc_to_source = {} # docid: source
doc_to_wordcounts = {} # docid: length
doc_to_sentences = {} # docid: [list of sentences]
doc_to_sentembeddings = {} # docid: {raw:[listoforderedembeddings], rolling_average:[listofslidingavgs], windowed:[listofwindoweds]}
doc_to_docembeddings = {} # docid: {'average':embed, 'whole':embed}

for directory in ["./friendship/", "./loneliness/"]:
    emotion = directory.split('/')[1]    
    for filename in os.listdir(directory):
        doc_id = uuid.uuid4()
        txtstring = open(directory + filename, 'r').read().split("|_|")
        whole_doc_text = txtstring[0].replace("\n", " ")
        doc_sentences = split_into_sentences(whole_doc_text)
        doc_sentences = [s for s in doc_sentences if (s != "." and s != '')]
        
        try:
            source = txtstring[1]
        except IndexError as e:
            source = "Unknown"
         
        sentence_embeddings = np.array(model.encode(doc_sentences))
        whole_embedding = model.encode(whole_doc_text)
        average_embedding = np.mean(sentence_embeddings, axis=0)
        raw_sentence_embeddings = sentence_embeddings
        rolling_average_sentence_embeddings = moving_average(raw_sentence_embeddings, window_size)
        sentence_block_embeddings = moving_window_embedding(doc_sentences, window_size)
        
        doc_to_emotion[doc_id] = emotion
        doc_to_source[doc_id] = source
        doc_to_wordcounts[doc_id] = len(whole_doc_text.split())
        doc_to_sentences[doc_id] = doc_sentences
        doc_to_docembeddings[doc_id] = {'average':average_embedding, 'whole':whole_embedding}
        doc_to_sentembeddings[doc_id] = {
            'raw': raw_sentence_embeddings,
            'moving_average': rolling_average_sentence_embeddings,
            'moving_window': sentence_block_embeddings
        }
            
        
        

In [126]:
journal = """
I went with my family yesterday to a beach in Palos Verdes.
I enjoyed seeing the way my parents interacted with each other, and love showing my little sister all the plants and stuff and watching her discover them for the first time.
I was like, "do I want kids"?
"""

journal = """
I worked on my homework quickly so I could join my friends at the party. Every year, my birthday is during dead week, so I never get to celebrate - however, this year, they surprised me with a cake! 
It was the first good birthday I had in college.
"""
journal = """
I have a friend who lived in Cali for school (I’m east coast.) while she was there, her parents FaceTimed her to tell her they were getting a divorce. She called me, and I immediately booked my $1,000 plane ticket for two days from then. That trip ended up being really fun, but it was so good to be able to be there for her.

A few years later, I lost my mom very suddenly. I didn’t know how to cope or move on with life. This same friend came over and spent every night at my apartment listening to me cry and try to figure out how to make funeral arrangements and estate decisions. As much as it sucked, it was awesome having such a good friend, and knowing that either of us would drop everything for each other.
"""

journal = """
Depressed. Relationship of 1.5 years ended a month ago and I'm still depressed about it. I'm just taking it day by day. 
"""

In [127]:
# on the journal entry, we do: wholeembedding? sliding window? or what?
whole_journal_embed = model.encode(journal)

journal_sentences = [s for s in split_into_sentences(journal) if s != "."]
journal_sentence_embeddings = np.array(model.encode(journal_sentences))
avg_journal_embed = np.mean(journal_sentence_embeddings, axis=0)


In [129]:
# now, we construct a connection score using nearest neighbors voting, filtering with threshold score.
# we construct the scores based on the whole journal embed? the average journal embed? the journal sentence embeddings?
# it makes most sense to test wholes and averages
# 4 possibilities then: wholel2whle, whole2avg, avg2avg, avg2whole
# what if we simply took the construction that was most robust across all 4?

w2w = []
a2a = []
a2w = []
w2a = []

# doc_ids_to_distance_from_journal = {}
doc_ids_distances = []
for doc_id, doc_embeddings in doc_to_docembeddings.items():
    avg_doc_embed = doc_embeddings['average']
    whole_doc_embed = doc_embeddings['whole']
    
    # we have to have some way of choosing NOT to add a message:
    avg2avg = np.linalg.norm(avg_journal_embed - avg_doc_embed)
    avg2whole = np.linalg.norm(avg_journal_embed - whole_doc_embed)
    whole2whole = np.linalg.norm(whole_journal_embed - whole_doc_embed)
    whole2avg = np.linalg.norm(whole_journal_embed - avg_doc_embed)
    
    mean_distance = sum([avg2avg, avg2whole, whole2whole, whole2avg])/4
#     doc_ids_distances.append((doc_id, mean_distance))
    
    w2w.append((whole2whole, doc_id))
    a2a.append((avg2avg, doc_id))
    a2w.append((avg2whole, doc_id))
    w2a.append((whole2avg, doc_id))
    

# find the doc that is, on average, closest!
sorted_closest_docs = sorted(a2a, key=lambda x: x[0])
closest_doc_id = sorted_closest_docs[0][1]

# we construct a connection score by looking at, on average, n the closest docs
connection_score = 0
for (distance, doc_id) in sorted_closest_docs[:num_docs_to_consider_in_score]:
    emotion = doc_to_emotion[doc_id]
    print(distance)
    connection_score += {'friendship':1, 'loneliness':-1}[emotion]

print(connection_score)




    
    

0.53073937
0.552917
0.58949
0.5953817
0.5976612
-5


In [83]:
# now, check for the word length/sentence count of the passage. let's cap it at 110 words? 
sentences_to_return = doc_to_sentences[closest_doc_id]
word_count = doc_to_wordcounts[closest_doc_id]

if word_count > 110:
    # if the passage is quite long, then identify the closest moving chunks:
    moving_window_embeds = doc_to_sentembeddings[closest_doc_id]['moving_window']
    moving_avg_embeds = doc_to_sentembeddings[closest_doc_id]['moving_average']
    
    # find closest 
    np.linalg.norm()
    

