In [217]:
# necessary packages in full (for now, still building of course)

import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')

import re
import itertools

import os

import time # just for my own information

# this is just to get rid of an error message - something about how Windows caches the model I'm downloading from the internet
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [182]:
model = SentenceTransformer("thenlper/gte-small")

class Speech():

    regex_date_str = r'(19|20)\d{2}'

    def __init__(self, data, model):
        self.metadata = data[['ID', 'Title']]
        self.doc_body_raw = data['Document Body']
        self.document_body = [x for x in sent_tokenize(self.doc_body_raw) if len(x) > 30]
        self.model = model
        self.embeddings = self.model.encode(self.document_body)
        
        title = self.metadata['Title']
        date = re.search(self.regex_date_str, title)
        if date:
            self.metadata['Year'] = date.group()
        else:
            self.metadata['Year'] = "undated"

class Comparison():
    
    def __init__(self, speech1, speech2, model):
        self.speech1 = speech1
        self.speech2 = speech2
        self.model = model
        self.similarities = self.model.similarity(speech1.embeddings, speech2.embeddings)

    def find_similar_phrases(self, lower_bound = 0.000, upper_bound = 1.001):
        # the idea here is that one can enter a similarity threshold in float format and this will return a dataframe of all similar sentences
        # within that threshold, along with threshold scores
        # this can be called public facing but is also used in the later functions.
        
        indices = torch.nonzero((self.similarities >= lower_bound) & (self.similarities < upper_bound))

        if indices.shape[0] == 0:
            return pd.DataFrame()

        indices = indices.numpy()
        scores = self.similarities[indices[:, 0], indices[:, 1]].tolist()
        data = pd.DataFrame({
                'Index_1': indices[:, 0],
                'Index_2': indices[:, 1],
                'Speech_1': [self.speech1.document_body[i] for i in indices[:, 0]],
                'Speech_1_embedding': [self.speech1.embeddings[i] for i in indices[:, 0]],
                'Speech_2': [self.speech2.document_body[i] for i in indices[:, 1]],
                'Speech_2_embedding': [self.speech2.embeddings[i] for i in indices[:, 1]],
                'Similarity_score': [round(float(score), 3) for score in scores]
               })

        return data

    def similarity_stats(self):
        # here I want to find the average semantic similarity across two texts
        # this is not going to be a true average: rather, it will bin sentences by semantic similarity
        # I am thinking of implementing some sort of hash table to prevent a sentence from being compared to too many other sentences
        # I want to store these attributes to be used in show_comparison()

        # For now, let's just have two bins: above 0.95 and between 0.90-0.95
        near_identical = self.find_similar_phrases(lower_bound = 0.95)
        paraphrase = self.find_similar_phrases(lower_bound = 0.90, upper_bound = 0.95)

        # now I want to take near_identical as a percentage of each speech
        copy_rate_1 = round(len(near_identical) / len(self.speech1.document_body),3)
        copy_rate_2 = round(len(near_identical) / len(self.speech2.document_body),3)
        paraphrase_rate_1 = round(len(paraphrase) / len(self.speech1.document_body),3)
        paraphrase_rate_2 = round(len(paraphrase) / len(self.speech2.document_body),3)
        values = {'Near Match': len(near_identical), 'Paraphrase': len(paraphrase), 'NM - % of Speech 1': copy_rate_1,
                 'NM - % of Speech 2': copy_rate_2, 'P - % of Speech 1': paraphrase_rate_1, 'P - % of Speech 2': paraphrase_rate_2}
        return values
        

    def show_comparison(self):
        # here I want to create an array/list-like object to return from the comparison so we can compare across comparisons.
        # Eventually, this will contain more of the metadata, but for now it is limited.
        values = self.similarity_stats()

        # I may not work on this today, but it is important that we label each comparison array/list/whatever
        # with at a minimum the document IDs being compared
        # at a maximum I want IDs, titles, versions, dates

        d1 = {'speech1_ID': self.speech1.metadata['ID'], 'speech2_ID' : self.speech2.metadata['ID'],
             'speech1_year': self.speech1.metadata['Year'], 'speech2_year' : self.speech2.metadata['Year'],
             'speech1_title': self.speech1.metadata['Title'], 'speech2_title' : self.speech2.metadata['Title']}
        d1.update(values)
        return d1

In [15]:
documents = pd.read_csv('../document-catalog_extended.csv') # I have this from earlier - it's a CSV export of a bunch of speeches from my location project
documents = documents[documents['Workflow Stage'] == 'Early Access']
documents_test = documents[:20]
len(documents_test)

20

In [11]:
# this is the one that broke it
documents.iloc[318]

ID                                                                137
Title               Duplicate of ID84: Speech concerning challenge...
Box                                                    Box 4 Folder 6
Document Length                                                  12.0
Workflow Stage                                           Early Access
Image Filename                                                    NaN
Image Identifier                                                  NaN
Image URL                                                         NaN
Document Body                                                     NaN
Name: 389, dtype: object

In [17]:
len(documents)

358

In [19]:
documents.dropna(subset='Document Body', inplace=True)
len(documents)

357

In [52]:
# So, it looks like that was the only document that I actually needed to drop. 
# But useful to have that code snippet in there for future reference.

In [41]:
speeches = []
start_time = time.time()
for index, row in documents_test.iterrows():
    new_speech = Speech(row, model)
    speeches.append(new_speech)

end_time = time.time()
print(round(end_time - start_time, 2))

29.39


In [43]:
speeches

[<__main__.Speech at 0x2a29b917830>,
 <__main__.Speech at 0x2a29b917080>,
 <__main__.Speech at 0x2a29b914320>,
 <__main__.Speech at 0x2a29b916480>,
 <__main__.Speech at 0x2a29b915310>,
 <__main__.Speech at 0x2a29b917350>,
 <__main__.Speech at 0x2a29b916510>,
 <__main__.Speech at 0x2a29b915dc0>,
 <__main__.Speech at 0x2a29b914f20>,
 <__main__.Speech at 0x2a29b914d70>,
 <__main__.Speech at 0x2a29b9145c0>,
 <__main__.Speech at 0x2a29b914aa0>,
 <__main__.Speech at 0x2a29b916b10>,
 <__main__.Speech at 0x2a29b9179b0>,
 <__main__.Speech at 0x2a29793cb60>,
 <__main__.Speech at 0x2a29793d280>,
 <__main__.Speech at 0x2a29793ddc0>,
 <__main__.Speech at 0x2a29793fd70>,
 <__main__.Speech at 0x2a29793de50>,
 <__main__.Speech at 0x2a29793c800>]

In [45]:
pairs = itertools.combinations(range(len(speeches)), 2)

comps_dicts = []

for i, j in pairs:
    new_comp = Comparison(speeches[i], speeches[j], model)
    new_comp_dict = new_comp.show_comparison()
    comps_dicts.append(new_comp_dict)

In [49]:
df = pd.DataFrame(comps_dicts)
df.sort_values(by = ['Near Match', 'Paraphrase'], ascending = False).head(20)

Unnamed: 0,speech1_ID,speech2_ID,speech1_year,speech2_year,speech1_title,speech2_title,Near Match,Paraphrase,NM - % of Speech 1,NM - % of Speech 2,P - % of Speech 1,P - % of Speech 2
19,667,666,1972,1972,Speeches making observations about the recent ...,Speeches making observations about the recent ...,112,10,0.619,0.949,0.055,0.085
184,613,612,1972,1972,"Speech concerning political action, black peop...","Speech concerning political action, black peop...",96,12,1.079,0.756,0.135,0.094
70,663,662,1972,1972,Speech about the upcoming presidential electio...,(Draft) Speech about the upcoming presidential...,95,15,0.896,0.896,0.142,0.142
71,663,661,1972,1972,Speech about the upcoming presidential electio...,(Draft) Speech about the upcoming presidential...,92,13,0.868,0.836,0.123,0.118
85,662,661,1972,1972,(Draft) Speech about the upcoming presidential...,(Draft) Speech about the upcoming presidential...,91,16,0.858,0.827,0.151,0.145
169,616,615,1972,1972,Speech concerning the political status of educ...,Speech concerning the political status of educ...,86,6,0.723,0.86,0.05,0.06
175,615,614,1972,1972,Speech concerning the political status of educ...,Speech concerning the political status of educ...,83,8,0.83,0.735,0.08,0.071
37,666,665,1972,1972,Speeches making observations about the recent ...,Speeches making observations about the recent ...,81,16,0.686,0.623,0.136,0.123
170,616,614,1972,1972,Speech concerning the political status of educ...,Speech concerning the political status of educ...,81,8,0.681,0.717,0.067,0.071
20,667,665,1972,1972,Speeches making observations about the recent ...,Speeches making observations about the recent ...,78,27,0.431,0.6,0.149,0.208


In [54]:
speeches_full = []
counter = 0
start_time = time.time()
for index, row in documents.iterrows():
    new_speech = Speech(row, model)
    speeches_full.append(new_speech)
    counter += 1
    if counter % 25 == 0:
        current = time.time()
        elapsed = round((current - start_time), 2)
        print(f'Progress: tokenizing speech #{counter} - elapsed time: {elapsed} seconds')

Progress: tokenizing speech #25 - elapsed time: 40.59 seconds
Progress: tokenizing speech #50 - elapsed time: 82.13 seconds
Progress: tokenizing speech #75 - elapsed time: 131.94 seconds
Progress: tokenizing speech #100 - elapsed time: 166.82 seconds
Progress: tokenizing speech #125 - elapsed time: 217.38 seconds
Progress: tokenizing speech #150 - elapsed time: 262.63 seconds
Progress: tokenizing speech #175 - elapsed time: 309.54 seconds
Progress: tokenizing speech #200 - elapsed time: 346.44 seconds
Progress: tokenizing speech #225 - elapsed time: 386.6 seconds
Progress: tokenizing speech #250 - elapsed time: 424.52 seconds
Progress: tokenizing speech #275 - elapsed time: 456.73 seconds
Progress: tokenizing speech #300 - elapsed time: 481.17 seconds
Progress: tokenizing speech #325 - elapsed time: 514.92 seconds
Progress: tokenizing speech #350 - elapsed time: 551.19 seconds


In [56]:
len(speeches_full)

357

In [184]:
pairs_full = itertools.combinations(range(len(speeches_full)), 2)

comps_full = []

counter = 0
start_time = time.time()
for i, j in pairs_full:
    new_comp = Comparison(speeches_full[i], speeches_full[j], model)
    new_comp_dict = new_comp.show_comparison()
    comps_full.append(new_comp_dict)
    counter += 1
    if counter % 5000 == 0:
        current = time.time()
        elapsed = round((current - start_time), 2)
        print(f'Progress: making pairwise comparison #{counter} - elapsed time: {elapsed} seconds')

Progress: making pairwise comparison #5000 - elapsed time: 6.15 seconds
Progress: making pairwise comparison #10000 - elapsed time: 12.04 seconds
Progress: making pairwise comparison #15000 - elapsed time: 18.42 seconds
Progress: making pairwise comparison #20000 - elapsed time: 24.59 seconds
Progress: making pairwise comparison #25000 - elapsed time: 32.01 seconds
Progress: making pairwise comparison #30000 - elapsed time: 38.17 seconds
Progress: making pairwise comparison #35000 - elapsed time: 45.19 seconds
Progress: making pairwise comparison #40000 - elapsed time: 51.61 seconds
Progress: making pairwise comparison #45000 - elapsed time: 58.14 seconds
Progress: making pairwise comparison #50000 - elapsed time: 64.13 seconds
Progress: making pairwise comparison #55000 - elapsed time: 70.03 seconds
Progress: making pairwise comparison #60000 - elapsed time: 76.19 seconds


In [186]:
final_data = pd.DataFrame(comps_full)

In [188]:
final_data.head()

Unnamed: 0,speech1_ID,speech2_ID,speech1_year,speech2_year,speech1_title,speech2_title,Near Match,Paraphrase,NM - % of Speech 1,NM - % of Speech 2,P - % of Speech 1,P - % of Speech 2
0,670,667,1969,1972,Undated Speech concerning Conditions of Black ...,Speeches making observations about the recent ...,0,9,0.0,0.0,0.111,0.05
1,670,666,1969,1972,Undated Speech concerning Conditions of Black ...,Speeches making observations about the recent ...,0,4,0.0,0.0,0.049,0.034
2,670,665,1969,1972,Undated Speech concerning Conditions of Black ...,Speeches making observations about the recent ...,0,6,0.0,0.0,0.074,0.046
3,670,663,1969,1972,Undated Speech concerning Conditions of Black ...,Speech about the upcoming presidential electio...,0,1,0.0,0.0,0.012,0.009
4,670,662,1969,1972,Undated Speech concerning Conditions of Black ...,(Draft) Speech about the upcoming presidential...,0,1,0.0,0.0,0.012,0.009


In [190]:
final_data.sort_values(by = ['Near Match', 'Paraphrase'], ascending = False).head(20)

Unnamed: 0,speech1_ID,speech2_ID,speech1_year,speech2_year,speech1_title,speech2_title,Near Match,Paraphrase,NM - % of Speech 1,NM - % of Speech 2,P - % of Speech 1,P - % of Speech 2
51666,306,230,1977,1978,Speech Filed with Versions of Speech concernin...,Copy: Speech concerning education and the Bakk...,379,165,1.027,1.03,0.447,0.448
14091,537,536,2004,2004,"Speech delivered to the 2004 NAACP Convention,...",Draft Speech delivered to the 2004 NAACP Conve...,316,64,0.943,0.96,0.191,0.195
37443,398,402,1992,1993,Speech concerning Civil Rights movement and af...,Speech concerning how the new administration h...,247,34,0.834,0.765,0.115,0.105
33419,406,415,1994,1995,Speech delivered at the New Jersey Black Issue...,Speech concerning the recent elections and the...,232,34,0.991,0.859,0.145,0.126
40127,385,372,1990,1989,Speech concerning the struggle of black people...,"Speech- ""A Historical Perspective of a People""...",228,25,0.987,0.996,0.108,0.109
43646,371,372,1989,1989,"Speech- ""A Historical Perspective of a People""...","Speech- ""A Historical Perspective of a People""...",226,25,0.922,0.987,0.102,0.109
40126,385,371,1990,1989,Speech concerning the struggle of black people...,"Speech- ""A Historical Perspective of a People""...",225,28,0.974,0.918,0.121,0.114
37454,398,388,1992,1991,Speech concerning Civil Rights movement and af...,"Speech- ""Black Politics, Now and Then"" deliver...",218,14,0.736,0.71,0.047,0.046
30907,424,428,1998,1998,"Speech- ""At the Turn of Two Centuries"" Keynote...","Speech- ""Civil Rights, Now & Then"" at the Nati...",217,62,0.728,0.759,0.208,0.217
31415,428,430,1998,1998,"Speech- ""Civil Rights, Now & Then"" at the Nati...","Speech- ""Civil Rights, Now & Then"" delivered a...",215,61,0.752,0.86,0.213,0.244


In [108]:
# so, this is cool. And we have a lot of stuff. But there is so much stuff that it might be too much. And there are some significant flaws
# for example, once I find something interesting, I can't then look more closely at the comparison object easily.
# Also, the speech object takes a long time to make, so I need a better way to store the content after it has been tokenized. 
# Probably in a JSON or CSV format. In essence, this just needs some extensive checkpointing. 
# And then once the checkpoints have been checked and pointed and all, I will be able to figure out what the hell to do with all this information.
# Currently having information overload.
# I need to email Lucian.

In [134]:
final_data[final_data.speech1_ID==227].sort_values(by = ['Near Match', 'Paraphrase'], ascending = False).head()

Unnamed: 0,speech1_ID,speech2_ID,speech1_year,speech2_year,speech1_title,speech2_title,Near Match,Paraphrase,NM - % of Speech 1,NM - % of Speech 2,P - % of Speech 1,P - % of Speech 2
58395,227,228,1985,1985,"Speech about poverty, 1985 (2 of 4)","Speech about poverty, 1985 (3 of 4)",180,135,0.978,0.978,0.734,0.734
58396,227,229,1985,1985,"Speech about poverty, 1985 (2 of 4)","Speech about poverty, 1985 (4 of 4)",180,135,0.978,1.017,0.734,0.763
58420,227,182,1985,1975,"Speech about poverty, 1985 (2 of 4)",Speech concerning the 200th Birthday of the Un...,3,6,0.016,0.029,0.033,0.058
58419,227,181,1985,1975,"Speech about poverty, 1985 (2 of 4)",Speech concerning the 200th Birthday of the Un...,3,5,0.016,0.027,0.027,0.044
58417,227,200,1985,1976,"Speech about poverty, 1985 (2 of 4)",Speech concerning black America: its progress ...,3,4,0.016,0.024,0.022,0.032


In [192]:
targets = [x for x in speeches_full if x.metadata['ID'] == 227 or x.metadata['ID'] == 182]

In [194]:
targets_comparison = Comparison(targets[0], targets[1], model)

In [200]:
targets_df = targets_comparison.find_similar_phrases(lower_bound = 0.9)
targets_df

Unnamed: 0,Index_1,Index_2,Speech_1,Speech_1_embedding,Speech_2,Speech_2_embedding,Similarity_score
0,8,33,"In the last four years, the number of people l...","[-0.026062313, -0.023298213, 0.041172795, -0.0...","In the decade of the sixties, the number of pe...","[-0.016661666, 0.023746373, 0.023605332, 0.002...",0.909
1,45,33,That describes the number of people and the br...,"[-0.029002134, 0.0007936165, 0.049975, -0.0101...","In the decade of the sixties, the number of pe...","[-0.016661666, 0.023746373, 0.023605332, 0.002...",0.903
2,103,33,The number of percentage of Americans living i...,"[-0.0031937, 0.007489503, 0.01423528, 0.012691...","In the decade of the sixties, the number of pe...","[-0.016661666, 0.023746373, 0.023605332, 0.002...",0.942
3,145,46,"440,000 children have lost compensatory educat...","[-0.009392483, -0.017864015, 0.028127734, -0.0...",Schools no longer educate our young.,"[-0.008303627, 0.008497247, 0.061195686, -0.01...",0.902
4,153,49,More black Americans lived in poverty in 1983 ...,"[-0.020097407, 0.003728544, 0.0016451224, 0.01...","For Black America, it is still true, despite t...","[0.004466836, -0.005196769, 0.00016377226, 0.0...",0.901
5,162,30,"But if the years that went before, the Kennedy...","[-0.041930065, 0.0026374531, 0.06262744, -0.02...","But if the years before, the Kennedy and Johns...","[-0.0436134, 0.0028235426, 0.054202866, -0.010...",0.985
6,164,36,If we are to believe with Thomas Jefferson tha...,"[-0.038166277, -0.013557173, 0.03234451, -0.00...",If we are to believe with Thomas Jefferson tha...,"[-0.042803403, -0.012504759, 0.028365914, -0.0...",0.995
7,165,37,Human problems are now placed on a balance she...,"[-0.08290427, -0.0070931297, 0.042999815, 0.00...",Human problems are now placed on a balance she...,"[-0.088925555, -0.0110423295, 0.040722143, -0....",0.987
8,166,70,There can be no better prescription for reliev...,"[-0.035108846, 0.00033446125, 0.0146759655, -0...",There can be no better prescription for reliev...,"[-0.037516475, -0.031255446, -0.0066589788, -0...",0.912


In [206]:
# obviously this is super ugly but I gotta use these embeddings to compare sentences across the whole corpus

In [202]:
targets_df.loc[6]['Speech_1'], targets_df.loc[6]['Speech_2']

('If we are to believe with Thomas Jefferson that the common man is "The most precious portion of the state", we find that resource in danger of economic extinction today.',
 'If we are to believe with Thomas Jefferson that the common man is "the most precious portion of the state," we find that precious resource in real danger of economic extinction.')

In [243]:
targets_df.loc[5]['Speech_1'], targets_df.loc[5]['Speech_2']

('But if the years that went before, the Kennedy, Johnson and Carter years, taught us any kind of lesson at all, tiit ought to have been that government, under militant and concerted pressure, would move, all too often with all deliberate lack of speed, to become a limited partner of sorts with the American underclass in their struggle to do better for themselves.',
 'But if the years before, the Kennedy and Johnson years, taught us any lesson at all, it ought to have been that government, under militant and concerted pressure, would move, slowly and rather ponderously, all too often with all deliberate lack of speed, to become a limited partner of sorts with the American underclass in their struggle to do better for themselves.')

So, it took a long time to unearth this again, but it's pretty crazy that these lines show up 10 years apart! Also reflects a pretty brutal 10 year stretch for Julian's general outlook on the country, it seems. I just need to figure out how to get what I want. 

1. How do I find every instance of this sentiment? This seems to be a hashing and incrementation issue. So I need a hash table that has an arbitrary key and a non-static list DT as its value, and I need to regularly append things to the list in an orderly manner. This might need to be further nested, too - I need a way to track at least one of the embeddings (call it the "source" embedding, so I'll need to sort the speeches chronologically), so I can compare it (can I make an embedding a key? That seems very messy). Store the embedding in a tensor? Can I append things to a tensor?

* look into appending things to a tensor. Issue is, need to figure out a way to not lose track of which sentence is which. I solved this in pairwise. So I can probably solve this again.
* Previous solution involved an object attribute that tracked embeddings and one that tracked sentences. Should I create a Speech object that only contains embeddings that appear at the NM threshold in at least two docs? Or (this is probably better) make a whole new class of objects. Am I being Pythonic? No I am not. Need to improve upon that.

2. How do I store the "index" (relative positioning of this sentence within its speech), along with sufficient metadata to identify the speech? This seems like a problem for a dictionary within the list, with three items: sentence, doc ID, sentence ID. Make what's essentially a relational database that contains a "document" table, that I can use to reference this stuff.

* Document table, with primary key ID (references doc ID in JBPP system). 

In [172]:
# let's start by saving this stuff as a CSV file, so that I can retain at least some of the comparison stuff once I turn this kernel off.
# only needs to be run once

final_data.to_csv('series_1_comparisons.csv')

In [237]:
sims = model.similarity(targets_df['Speech_1_embedding'], targets_df['Speech_2_embedding'])
sims

tensor([[0.9091, 0.9091, 0.9091, 0.7942, 0.8661, 0.7670, 0.7890, 0.8085, 0.8339],
        [0.9033, 0.9033, 0.9033, 0.7807, 0.8269, 0.7710, 0.7924, 0.7832, 0.8024],
        [0.9420, 0.9420, 0.9420, 0.8266, 0.8805, 0.7923, 0.8056, 0.7945, 0.8244],
        [0.8067, 0.8067, 0.8067, 0.9019, 0.8309, 0.7770, 0.7598, 0.8065, 0.8065],
        [0.8803, 0.8803, 0.8803, 0.7742, 0.9006, 0.7655, 0.7682, 0.7922, 0.8045],
        [0.8051, 0.8051, 0.8051, 0.8291, 0.7929, 0.9847, 0.8109, 0.7801, 0.8219],
        [0.7871, 0.7871, 0.7871, 0.7728, 0.7957, 0.7997, 0.9953, 0.7666, 0.8186],
        [0.7828, 0.7828, 0.7828, 0.7624, 0.7714, 0.7797, 0.7829, 0.9866, 0.8052],
        [0.7732, 0.7732, 0.7732, 0.7642, 0.7592, 0.8166, 0.8046, 0.7713, 0.9122]])

In [170]:
# want to append common sentences (threshold > 0.95) to a list 
# and compare those sentences to all the embeddings in other documents but also retain metadata

# initialize:
speech_1 = speeches_full[0]
speech_2 = speeches_full[1]


for i, j in pairs_full:
    new_comp = Comparison(speeches_full[i], speeches_full[j], model)
    comp_df = new_comp.find_similar_phrases(lower_bound = 0.95)
    new_comp_dict = new_comp.show_comparison()
    comps_full.append(new_comp_dict)
    counter += 1
    if counter % 5000 == 0:
        current = time.time()
        elapsed = round((current - start_time), 2)
        print(f'Progress: making pairwise comparison #{counter} - elapsed time: {elapsed} seconds')

[ID                                                     353
 Title    Speech concerning Georgia's education system, ...
 Year                                                  1985
 Name: 239, dtype: object,
 ID                                       227
 Title    Speech about poverty, 1985 (2 of 4)
 Year                                    1985
 Name: 326, dtype: object,
 ID                                       228
 Title    Speech about poverty, 1985 (3 of 4)
 Year                                    1985
 Name: 327, dtype: object,
 ID                                       229
 Title    Speech about poverty, 1985 (4 of 4)
 Year                                    1985
 Name: 328, dtype: object]