In [1]:
# necessary packages in full (for now, still building of course)

import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')

import re
import itertools

import os

import time # just for my own information

# this is just to get rid of an error message - something about how Windows caches the model I'm downloading from the internet
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [3]:
# instantiate global variables
model = SentenceTransformer("thenlper/gte-small")

regex_year_str = r'(19|20)\d{2}'

# read in data
documents = pd.read_csv('../document-catalog_extended.csv')
documents = documents[documents['Workflow Stage'] == 'Early Access']
documents.dropna(subset='Document Body', inplace=True)

In [5]:
documents.drop(['Box', 'Document Length', 'Workflow Stage', 'Image Filename', 'Image Identifier', 'Image URL'], axis=1, inplace=True)

In [7]:
documents.reset_index(inplace=True)
documents.drop('index', axis=1, inplace=True)

In [9]:
documents.head()

Unnamed: 0,ID,Title,Document Body
0,670,Undated Speech concerning Conditions of Black ...,[This speech includes pages with many differen...
1,667,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
2,666,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
3,665,Speeches making observations about the recent ...,"1\nNow that the nations voters — at least, 54%..."
4,663,Speech about the upcoming presidential electio...,The election approaching on November seventh i...


In [11]:
sentences = documents['Document Body']\
                .apply(lambda x: pd.Series([x for x in sent_tokenize(x) if len(x) >= 30], dtype='string'))\
                .stack()\
                .to_frame('sent_str')

In [13]:
documents.index.name = "index"
documents.head()

Unnamed: 0_level_0,ID,Title,Document Body
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,670,Undated Speech concerning Conditions of Black ...,[This speech includes pages with many differen...
1,667,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
2,666,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
3,665,Speeches making observations about the recent ...,"1\nNow that the nations voters — at least, 54%..."
4,663,Speech about the upcoming presidential electio...,The election approaching on November seventh i...


In [290]:
documents.iloc[0,1]

'Undated Speech concerning Conditions of Black People and Remedies of New Politics, 1969?'

In [302]:
years = []
for i in range(len(documents)):
    title = documents.iloc[i, 1]
    date = re.search(regex_year_str, title)
    if date:
        year = date.group()
        year = int(year)
    else:
        year = np.nan
    years.append(year)

In [304]:
documents['Year'] = years

In [318]:
documents['Year'] = documents['Year'].astype('Int64')

In [320]:
documents

Unnamed: 0_level_0,ID,Title,Document Body,Year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,670,Undated Speech concerning Conditions of Black ...,[This speech includes pages with many differen...,1969
1,667,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o...",1972
2,666,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o...",1972
3,665,Speeches making observations about the recent ...,"1\nNow that the nations voters — at least, 54%...",1972
4,663,Speech about the upcoming presidential electio...,The election approaching on November seventh i...,1972
...,...,...,...,...
352,83,Speech concerning black bankers and businesses...,"Before I begin, let me tell you my financial q...",1972
353,84,Speech concerning challenges facing the Americ...,Now that the nation has changed had a change o...,1972
354,85,Speech from the Alabama Voter Education Projec...,Thank you a great deal. As John Lewis has told...,1972
355,86,Speech at Press Conference in Washington conce...,[Handwritten.]\none the point of today's conce...,1973


In [15]:
sentences.head()

Unnamed: 0,Unnamed: 1,sent_str
0,0,[This speech includes pages with many differen...
0,1,"We need to discover who is, and who isn't viol..."
0,2,Violence is black children going to school for...
0,3,Violence is 30 million hungry stomachs in the ...
0,4,Violence is having black people represent a di...


In [17]:
sentences = sentences.rename_axis(index = ['doc_index', 'sent_num'])

In [19]:
sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str
doc_index,sent_num,Unnamed: 2_level_1
0,0,[This speech includes pages with many differen...
0,1,"We need to discover who is, and who isn't viol..."
0,2,Violence is black children going to school for...
0,3,Violence is 30 million hungry stomachs in the ...
0,4,Violence is having black people represent a di...


In [21]:
print(type(sentences['sent_str']))

<class 'pandas.core.series.Series'>


In [27]:
start = time.time()
embeddings = model.encode(sentences['sent_str'].reset_index(drop=True))
end = time.time()
print(f'Runtime: {round(end-start, 3)} seconds')

Runtime: -385.645 seconds


Creating the embeddings seems to take a little over six minutes, which is not too bad but not necessarily scalable to the entire corpus. But for speeches, this is viable. When Lucian discussed this framework, he did say that it took a very long time to generate these embeddings. I definitely want to store these somehow so I don't have to re-do this step every time.

In [30]:
sentences['embedding'] = list(embeddings)

In [32]:
sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embedding
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,[This speech includes pages with many differen...,"[-0.062395636, 0.0135172205, 0.045818355, -0.0..."
0,1,"We need to discover who is, and who isn't viol...","[-0.020895261, -0.008539446, 0.029561546, -0.0..."
0,2,Violence is black children going to school for...,"[0.00194369, -0.006336346, 0.035046395, -0.005..."
0,3,Violence is 30 million hungry stomachs in the ...,"[0.0031232794, -0.011578105, 0.041491807, -0.0..."
0,4,Violence is having black people represent a di...,"[-0.0017653363, 0.026803194, 0.014218208, -0.0..."


In [43]:
len(embeddings)

45850

In [45]:
len(sentences.sent_str)

45850

In [51]:
embeddings_id = np.arange(len(embeddings))
sentences['embeddings_id'] = embeddings_id
sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embedding,embeddings_id
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,[This speech includes pages with many differen...,"[-0.062395636, 0.0135172205, 0.045818355, -0.0...",0
0,1,"We need to discover who is, and who isn't viol...","[-0.020895261, -0.008539446, 0.029561546, -0.0...",1
0,2,Violence is black children going to school for...,"[0.00194369, -0.006336346, 0.035046395, -0.005...",2
0,3,Violence is 30 million hungry stomachs in the ...,"[0.0031232794, -0.011578105, 0.041491807, -0.0...",3
0,4,Violence is having black people represent a di...,"[-0.0017653363, 0.026803194, 0.014218208, -0.0...",4


I wonder if this might pose an issue for storage. Like, should I store the embeddings separately and just store the embedding index in the sentences dataframe? I think that's the best practice. We'll figure that one out.

In [35]:
start = time.time()
similarities = model.similarity(embeddings, embeddings)
end = time.time()
print(f'Runtime: {round(end-start, 3)} seconds.')

Runtime: 10.838 seconds.


In [79]:
indices = torch.nonzero(similarities >= 0.95)
indices = indices.numpy()

In [81]:
len(indices)

197380

In [83]:
indices[indices[:,0] == 1]

array([[    1,     1],
       [    1,  2569],
       [    1,  2691],
       [    1,  3562],
       [    1, 11712],
       [    1, 40059]], dtype=int64)

In [85]:
indices[indices[:,1] == 1]

array([[    1,     1],
       [ 2569,     1],
       [ 2691,     1],
       [ 3562,     1],
       [11712,     1],
       [40059,     1]], dtype=int64)

In [113]:
counter = []
for i in indices[indices[:,0] == 1]:
    counter.append(i[1])
counter

[2569, 2691, 3562, 11712, 40059]

In [87]:
# so, I think I want to filter the indices so at the very least the comparisons between the same sentence are eliminated
# I think I also want to filter out duplicate pairs, but less sure about that
# but we'll write the code for it
indices = indices[indices[:, 0] != indices[:, 1]]
indices = indices[indices[:, 0] < indices[:, 1]]

In [89]:
len(indices)

75765

Excellent! That eliminated a lot of duplicates and should make this next part easy. This is also just looking at Near Match language - paraphrasing will expand this a lot, I'm sure.

In [91]:
scores = similarities[indices[:, 0], indices[:, 1]].tolist()

In [104]:
scores[:10]

[0.9655367732048035,
 0.975104808807373,
 0.975104808807373,
 0.9957187175750732,
 0.9561127424240112,
 0.9923681020736694,
 0.9979069828987122,
 0.9979069828987122,
 0.995079517364502,
 1.0]

### Next Step: finding common sentences across the corpus

Also, need to pull out examples that I can showcase. The final step will be doing a join between the sentences table and the documents table. This is not exactly third NF, so I don't really even know what to call it.

In [117]:
# let's make a copy so I don't mess anything up

matches_counter = []
matches_indices = []

start = time.time()
for i in sentences.embeddings_id:
    counter = 0
    nm_list = []
    temp_ind = indices[indices[:,0] == i]
    for j in temp_ind:
        counter += 1
        nm_list.append(j[1])
    matches_counter.append(counter)
    matches_indices.append(nm_list)
end = time.time()
print(f'Runtime: {round(end-start, 3)} seconds.')

Runtime: 5.751 seconds.


In [121]:
len(matches_counter)

45850

In [123]:
len(matches_indices)

45850

In [125]:
sentences['matches_counter'] = matches_counter
sentences['matches_indices'] = matches_indices

In [131]:
sentences.sort_values(by = 'matches_counter', ascending = False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embedding,embeddings_id,matches_counter,matches_indices
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
37,41,[Two handwritten X marks appear in the right m...,"[-0.06447188, -0.011960898, 0.049505513, -0.00...",4170,57,"[4270, 4285, 4331, 4431, 4515, 4563, 4569, 463..."
37,141,[A handwritten X mark appears in the right mar...,"[-0.0653786, -0.010280782, 0.045581914, -0.008...",4270,55,"[4285, 4331, 4431, 4515, 4563, 4569, 4639, 472..."
37,27,[A handwritten X mark appears in the right mar...,"[-0.074868046, -0.007574868, 0.04765481, -0.00...",4156,54,"[4170, 4270, 4285, 4331, 4431, 4515, 4563, 456..."
38,126,[Two handwritten X marks appear in the right m...,"[-0.06447186, -0.011960899, 0.0495055, -0.0057...",4431,53,"[4515, 4563, 4569, 4639, 4725, 4752, 4782, 479..."
37,156,[Two handwritten X marks appear in the right m...,"[-0.06612751, -0.00894151, 0.048094306, -0.004...",4285,53,"[4331, 4431, 4515, 4563, 4569, 4639, 4725, 475..."
35,42,[A handwritten X mark appears in the right mar...,"[-0.07267322, -0.013454423, 0.056170903, -0.00...",3822,51,"[4156, 4170, 4270, 4285, 4331, 4431, 4515, 456..."
38,210,[A handwritten X mark appears in the right mar...,"[-0.074868046, -0.007574868, 0.04765481, -0.00...",4515,48,"[4563, 4569, 4639, 4725, 4752, 4782, 4798, 480..."
38,26,"[A handwritten X appears in the right margin, ...","[-0.06757779, -0.010094775, 0.0584954, -0.0139...",4331,48,"[4431, 4515, 4563, 4569, 4639, 4725, 4752, 478..."
38,258,[A handwritten X mark appears in the right mar...,"[-0.07486805, -0.0075748595, 0.047654834, -0.0...",4563,47,"[4569, 4639, 4725, 4752, 4782, 4798, 4807, 594..."
38,264,[A handwritten X mark appears in the right mar...,"[-0.07486805, -0.0075748595, 0.047654834, -0.0...",4569,46,"[4639, 4725, 4752, 4782, 4798, 4807, 5940, 712..."


So we have encountered our very first problem - the most commonly repeated sentences are the editorial phrasing. Let's think about how to filter all this stuff out.

The second problem is that we still have a bit of a repetition issue. I don't want to delete things from the master list, so I suppose I should create a separate dataframe for it.

In [140]:
test_match = sentences[sentences.embeddings_id == 2631]

In [234]:
test_match['sent_str'].values

array(['I believe that armies, and navies are at the bottom the tinsel and braggadoa?cio of oppression and wrong; and I believe that the wicked conquest of weaker and darker nations by nations white and stronger but foreshadows the death of that stength.\n"'],
      dtype=object)

In [154]:
test_match['matches_indices'].index[0][0]

21

In [192]:
test_match.iloc[0]['matches_indices']

[3111,
 11748,
 12157,
 12380,
 13501,
 13586,
 13668,
 14263,
 16306,
 18413,
 19612,
 20464,
 21427,
 23259,
 24480,
 24713,
 25923,
 26171,
 27286,
 28396,
 29327,
 32399,
 32640,
 34258,
 34799,
 35091,
 35749,
 36198,
 40007,
 41940,
 43419,
 43492,
 44426,
 44562]

In [218]:
sentences.iloc[:,2]

doc_index  sent_num
0          0               0
           1               1
           2               2
           3               3
           4               4
                       ...  
356        128         45845
           129         45846
           130         45847
           131         45848
           132         45849
Name: embeddings_id, Length: 45850, dtype: int32

In [250]:
sorted_matches = sentences[sentences['embeddings_id'].isin(test_match.iloc[0]['matches_indices'])]
sorted_matches.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embedding,embeddings_id,matches_counter,matches_indices
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26,77,I believe that armies and navies are at the bo...,"[-0.047528114, 0.013157649, 0.0284595, -0.0099...",3111,33,"[11748, 12157, 12380, 13501, 13586, 13668, 142..."
81,54,I believe that armies and navies are at bottom...,"[-0.04752039, 0.012447078, 0.033345237, -0.008...",11748,32,"[12157, 12380, 13501, 13586, 13668, 14263, 163..."
84,110,"I believe that armies, and navies are at the b...","[-0.04838575, 0.012911433, 0.032891866, -0.007...",12157,31,"[12380, 13501, 13586, 13668, 14263, 16306, 184..."
84,333,"I believe that ’armies, and navies are at the ...","[-0.04690815, 0.01901435, 0.031043813, -0.0085...",12380,30,"[13501, 13586, 13668, 14263, 16306, 18413, 196..."
96,82,I believe that armies and navives are at botto...,"[-0.055808585, 0.02372884, 0.028631834, 0.0018...",13501,29,"[13586, 13668, 14263, 16306, 18413, 19612, 204..."


In [338]:
export = sorted_matches.join(documents, on = 'doc_index')\
    .drop(['embedding', 'embeddings_id', 'matches_counter', 'sent_str',
           'matches_indices', 'Document Body'], axis=1)\
    .sort_values('Year')

Let's export this to an Excel sheet to save it and use as a visualization!

In [342]:
export.to_excel('armies_and_navies_quote.xlsx', index=False)

OK, now to return to figuring out how to get rid of editorial language. Yay!

In [350]:
# this eliminates about 1000 sentences from the corpus, and is designed to remove editorial language that is bracketed.
# I don't want to remove it until after tokenization and comparisons though.
# but I think I will save this as a CSV

sentences_updated = sentences[~sentences['sent_str'].str.contains(r'\[.*?\]', regex=True, na=False)]

In [356]:
sentences_updated.sort_values(by = 'matches_counter', ascending = False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embedding,embeddings_id,matches_counter,matches_indices
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21,85,"I believe that armies, and navies are at the b...","[-0.041832954, 0.009992504, 0.019000486, 0.001...",2631,34,"[3111, 11748, 12157, 12380, 13501, 13586, 1366..."
26,77,I believe that armies and navies are at the bo...,"[-0.047528114, 0.013157649, 0.0284595, -0.0099...",3111,33,"[11748, 12157, 12380, 13501, 13586, 13668, 142..."
21,82,"I believe that all men, black and brown and wh...","[-0.034067627, -0.0008742066, 0.012146837, -0....",2628,33,"[11746, 12154, 12377, 13499, 13584, 13666, 142..."
81,52,"I believe that all men, black and brown and wh...","[-0.045373484, 0.004682678, 0.008682349, -0.02...",11746,33,"[12154, 12377, 13499, 13584, 13666, 14257, 163..."
84,107,"I believe that all men, black and brown and wh...","[-0.04605386, 0.0006658909, 0.007042777, -0.02...",12154,32,"[12377, 13499, 13584, 13666, 14257, 16300, 184..."
81,54,I believe that armies and navies are at bottom...,"[-0.04752039, 0.012447078, 0.033345237, -0.008...",11748,32,"[12157, 12380, 13501, 13586, 13668, 14263, 163..."
84,110,"I believe that armies, and navies are at the b...","[-0.04838575, 0.012911433, 0.032891866, -0.007...",12157,31,"[12380, 13501, 13586, 13668, 14263, 16306, 184..."
84,330,"I believe that all men, black and brown and wh...","[-0.04605386, 0.0006658909, 0.007042777, -0.02...",12377,31,"[13499, 13584, 13666, 14257, 16300, 18407, 196..."
84,333,"I believe that ’armies, and navies are at the ...","[-0.04690815, 0.01901435, 0.031043813, -0.0085...",12380,30,"[13501, 13586, 13668, 14263, 16306, 18413, 196..."
84,111,I believe in liberty for all men; the space to...,"[-0.048598252, 0.025800306, 0.012502862, -0.04...",12158,30,"[12381, 13502, 13587, 14264, 16307, 18414, 196..."


This is better. Now to figure out how to get rid of these duplicates.

Here's my idea: master list of match indices, filter sentences by that. So there's the sentences master list, and then there's also a secondary sentences list that only contains unique (non - near match sentences). And then we can sort by document index too, and see where there are entire paragraphs repeated.

In [380]:
seen_matches = set()

# shoutout ChatGPT for helping me out with this
# filtering method
def filter_sentences(df):
    global seen_matches
    to_keep = [] # initialize list which we'll fill with indices to keep

    for index, row in df.iterrows(): # iterate by row
        id = row['embeddings_id']
        match_list = row['matches_indices']
        match_counter = row['matches_counter']

        # we're gonna jump over these if the ID a) is already recorded as a match for another sentence or b) has no new matches
        if id in seen_matches:
            continue
        elif match_counter == 0:
            continue

        to_keep.append(index)
        seen_matches.update(match_list)

    return df.loc[to_keep]

In [374]:
filtered_sentences = filter_sentences(sentences_updated)

In [378]:
len(filtered_sentences)

8978

So we have really trimmed this stuff down. Obviously we still need to save the non-filtered sentences, but I'm feeling comfortable enough to call it on this notebook and move to a fresh one now that we kind of have all of our relational schema set up.

In [392]:
filtered_sentences = filtered_sentences.drop('embedding', axis = 1)
filtered_sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embeddings_id,matches_counter,matches_indices
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,"We need to discover who is, and who isn't viol...",1,5,"[2569, 2691, 3562, 11712, 40059]"
0,2,Violence is black children going to school for...,2,6,"[2570, 2692, 3563, 10633, 11713, 40060]"
0,3,Violence is 30 million hungry stomachs in the ...,3,6,"[2571, 2697, 3568, 10634, 11714, 40061]"
0,4,Violence is having black people represent a di...,4,2,"[2572, 11715]"
0,5,Violence is a country where properrty counts m...,5,6,"[2573, 2698, 3569, 10635, 11716, 40062]"


In [394]:
# let's export everything here to CSV

# documents table (full)
documents.to_csv('EMB_documents.csv')

# sentences table (full)
sentences.to_csv('EMB_sentences_0.csv')

# sentences table (no editorial language)
sentences_updated.to_csv('EMB_sentences_1.csv')

# sentences table (no repeated matches)
filtered_sentences.to_csv('EMB_sentences_2.csv')