In [1]:
# necessary packages in full (for now, still building of course)

import os

# Silence the huggingface_hub notice about how Windows caches the model that is
# downloaded from the internet (symlinks unavailable). huggingface_hub evaluates
# its HF_HUB_* flags when it is first imported, so the variable is set here,
# before sentence_transformers pulls that package in.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import itertools
import re
import time # just for my own information

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer

# nltk.download('punkt')

In [3]:
# instantiate global variables
model = SentenceTransformer("thenlper/gte-small")

regex_year_str = r'(19|20)\d{2}'

# read in data: keep only 'Early Access' documents that have a body.
# The filter and dropna are chained and finished with an explicit copy so that
# `documents` is an independent frame; mutating a boolean-mask slice in place
# is what triggers pandas' SettingWithCopyWarning.
documents = pd.read_csv('../document-catalog_extended.csv')
documents = (
    documents[documents['Workflow Stage'] == 'Early Access']
    .dropna(subset=['Document Body'])
    .copy()
)

In [5]:
# Drop the catalog columns that the sentence analysis does not use.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError because
# the columns are already gone.
documents = documents.drop(
    columns=['Box', 'Document Length', 'Workflow Stage', 'Image Filename', 'Image Identifier', 'Image URL'],
    errors='ignore',
)

In [7]:
# Renumber the rows 0..n-1 after filtering. drop=True discards the old index
# directly instead of materialising it as a column and deleting that column
# afterwards, so the cell works whatever the index happens to be named.
documents = documents.reset_index(drop=True)

In [9]:
# Preview the first rows of the filtered, re-indexed catalog.
documents.head()

Unnamed: 0,ID,Title,Document Body
0,670,Undated Speech concerning Conditions of Black ...,[This speech includes pages with many differen...
1,667,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
2,666,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
3,665,Speeches making observations about the recent ...,"1\nNow that the nations voters — at least, 54%..."
4,663,Speech about the upcoming presidential electio...,The election approaching on November seventh i...


In [11]:
def _long_sentences(body):
    """Split one document body into sentences and keep those of 30+ characters."""
    kept = [sentence for sentence in sent_tokenize(body) if len(sentence) >= 30]
    return pd.Series(kept, dtype='string')


# One row per (document, sentence position): apply() spreads each document's
# sentences across columns, stack() folds those columns into a second index level.
sentences = (
    documents['Document Body']
    .apply(_long_sentences)
    .stack()
    .to_frame('sent_str')
)

In [13]:
# Label the row index so it is shown by name when the frame is displayed.
# rename_axis returns a new frame instead of mutating the Index object in
# place; that Index may be shared with objects derived from `documents`.
documents = documents.rename_axis("index")
documents.head()

Unnamed: 0_level_0,ID,Title,Document Body
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,670,Undated Speech concerning Conditions of Black ...,[This speech includes pages with many differen...
1,667,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
2,666,Speeches making observations about the recent ...,"Now that the nation's voters — at least, 54% o..."
3,665,Speeches making observations about the recent ...,"1\nNow that the nations voters — at least, 54%..."
4,663,Speech about the upcoming presidential electio...,The election approaching on November seventh i...


In [15]:
# Preview the stacked sentence table; the two index levels are still unnamed here.
sentences.head()

Unnamed: 0,Unnamed: 1,sent_str
0,0,[This speech includes pages with many differen...
0,1,"We need to discover who is, and who isn't viol..."
0,2,Violence is black children going to school for...
0,3,Violence is 30 million hungry stomachs in the ...
0,4,Violence is having black people represent a di...


In [17]:
# Give the two index levels readable names: the row of the source document in
# `documents`, and the position of the sentence within that document.
sentence_index_names = ['doc_index', 'sent_num']
sentences = sentences.rename_axis(sentence_index_names, axis='index')

In [19]:
# Preview again to confirm the index levels now read doc_index / sent_num.
sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str
doc_index,sent_num,Unnamed: 2_level_1
0,0,[This speech includes pages with many differen...
0,1,"We need to discover who is, and who isn't viol..."
0,2,Violence is black children going to school for...
0,3,Violence is 30 million hungry stomachs in the ...
0,4,Violence is having black people represent a di...


In [21]:
# Sanity check on what kind of object a single-column selection returns.
print(type(sentences['sent_str']))

<class 'pandas.core.series.Series'>


In [27]:
# Encode every sentence into an embedding vector and report how long it took.
# perf_counter() is the clock intended for measuring elapsed time; time()
# follows the wall clock, which can be adjusted while the cell is running.
# The sentences are passed as a plain list of str, so the encoder never
# depends on the labels of a pandas index.
start = time.perf_counter()
embeddings = model.encode(sentences['sent_str'].tolist())
end = time.perf_counter()
print(f'Runtime: {round(end-start, 3)} seconds')

Runtime: -385.645 seconds


Creating the embeddings takes a little over six minutes (about 385 seconds; the minus sign in the recorded output above presumably comes from an earlier version of the cell that subtracted the timestamps in the wrong order). That is not too bad, but it will not necessarily scale to the entire corpus. For the speeches alone, though, this is viable. When Lucian discussed this framework, he did say that it took a very long time to generate these embeddings. I definitely want to store them somehow so I don't have to redo this step every time.

In [30]:
# Attach one embedding vector per sentence row; iterating over `embeddings`
# yields its entries in order, which matches the row order of `sentences`.
sentences['embedding'] = [vector for vector in embeddings]

In [32]:
# Preview the sentence table with the new embedding column.
sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embedding
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,[This speech includes pages with many differen...,"[-0.062395636, 0.0135172205, 0.045818355, -0.0..."
0,1,"We need to discover who is, and who isn't viol...","[-0.020895261, -0.008539446, 0.029561546, -0.0..."
0,2,Violence is black children going to school for...,"[0.00194369, -0.006336346, 0.035046395, -0.005..."
0,3,Violence is 30 million hungry stomachs in the ...,"[0.0031232794, -0.011578105, 0.041491807, -0.0..."
0,4,Violence is having black people represent a di...,"[-0.0017653363, 0.026803194, 0.014218208, -0.0..."


In [43]:
# Number of embedding vectors; should equal the number of sentences.
len(embeddings)

45850

In [45]:
# Number of sentences, for comparison with the number of embedding vectors.
len(sentences.sent_str)

45850

In [51]:
# Number the sentences 0..n-1 in embedding order, so that a row of
# `sentences` can be matched to its position in `embeddings`.
sentences['embeddings_id'] = embeddings_id = np.arange(len(embeddings))
sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str,embedding,embeddings_id
doc_index,sent_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,[This speech includes pages with many differen...,"[-0.062395636, 0.0135172205, 0.045818355, -0.0...",0
0,1,"We need to discover who is, and who isn't viol...","[-0.020895261, -0.008539446, 0.029561546, -0.0...",1
0,2,Violence is black children going to school for...,"[0.00194369, -0.006336346, 0.035046395, -0.005...",2
0,3,Violence is 30 million hungry stomachs in the ...,"[0.0031232794, -0.011578105, 0.041491807, -0.0...",3
0,4,Violence is having black people represent a di...,"[-0.0017653363, 0.026803194, 0.014218208, -0.0...",4


I wonder if storing the full embedding vectors inside the sentences dataframe might pose an issue for storage. Should I store the embeddings separately and keep only the embedding index (`embeddings_id`) in the sentences dataframe? I think that's the best practice. We'll figure that one out.

In [35]:
# Score every sentence embedding against every other one and time the call.
# perf_counter() is the clock intended for measuring elapsed time; time()
# follows the wall clock, which can be adjusted while the cell is running.
start = time.perf_counter()
similarities = model.similarity(embeddings, embeddings)
end = time.perf_counter()
print(f'Runtime: {round(end-start, 3)} seconds.')

Runtime: 10.838 seconds.


In [79]:
# Similarity cut-off at or above which two sentences count as a near match.
SIMILARITY_THRESHOLD = 0.95

# (row, column) positions of every score that reaches the cut-off, as an
# (n_pairs, 2) NumPy array. `indices` is bound once, straight to the array,
# and .cpu() keeps the conversion valid if the scores live on another device
# (it is a no-op for a tensor that is already on the CPU).
indices = torch.nonzero(similarities >= SIMILARITY_THRESHOLD).cpu().numpy()

In [81]:
# Number of (row, column) pairs currently held in `indices`.
len(indices)

197380

In [83]:
# Spot check: every pair whose first entry is sentence 1.
indices[indices[:,0] == 1]

array([[    1,     1],
       [    1,  2569],
       [    1,  2691],
       [    1,  3562],
       [    1, 11712],
       [    1, 40059]], dtype=int64)

In [85]:
# Spot check from the other side: every pair whose second entry is sentence 1.
indices[indices[:,1] == 1]

array([[    1,     1],
       [ 2569,     1],
       [ 2691,     1],
       [ 3562,     1],
       [11712,     1],
       [40059,     1]], dtype=int64)

In [100]:
# Ids of the sentences paired with sentence 1, as a plain list of ints.
# The self-pair (1, 1) is excluded explicitly, so the result does not depend
# on whether the self-comparison filter on `indices` has already been run.
is_partner_of_1 = (indices[:, 0] == 1) & (indices[:, 1] != 1)
counter = indices[is_partner_of_1, 1].tolist()
counter

[2569, 2691, 3562, 11712, 40059]

In [87]:
# so, I think I want to filter the indices so at the very least the comparisons between the same sentence are eliminated
# I think I also want to filter out duplicate pairs, but less sure about that
# but we'll write the code for it
# Keeping only rows with first id < second id does both at once: it removes
# the self-comparisons (i, i) and keeps a single copy of each mirrored pair
# (i, j) / (j, i), so a separate `!=` pass is not needed.
indices = indices[indices[:, 0] < indices[:, 1]]

In [89]:
# Number of pairs left after removing self-comparisons and mirrored duplicates.
len(indices)

75765

Excellent! That eliminated the self-comparisons and the mirrored duplicate pairs, which should make this next part easy. This is also only looking at near-match language (similarity of at least 0.95) - paraphrasing will expand this a lot, I'm sure.

In [91]:
# Look up the similarity score of each retained pair, in the same order as
# the rows of `indices`, and store the scores as a plain Python list.
left_ids = indices[:, 0]
right_ids = indices[:, 1]
scores = similarities[left_ids, right_ids].tolist()

### Next Step: finding common sentences across the corpus

Also, I need to pull out examples that I can showcase. The final step will be a join between the sentences table and the documents table. The result is not exactly in third normal form (3NF), so I don't really know what to call it.

In [None]:
# Work in progress: for each sentence id, gather the ids it is paired with in `indices`.
# NOTE(review): `counter` and `sims_list` are re-created on every iteration and
# nothing in the lines visible here stores them, so only the values for the
# last sentence would survive the loop - confirm where they are meant to go.
for i in sentences.embeddings_id:
    counter = 0  # number of pairs found for sentence i
    sims_list = []  # second-column ids of the pairs found for sentence i
    temp_ind = indices[indices[:,0] == i]  # rows of `indices` whose first column is i
    for j in temp_ind:
        counter += 1
        sims_list.append(j[1])
    