In [8]:
import pandas as pd
from wc_simd.dedupe_service import dedup_data_file
dedup_data = pd.read_csv("../" + dedup_data_file, index_col=0)

# force all columns to string type
dedup_data = dedup_data.astype(str)

dedup_data

Unnamed: 0_level_0,label,id,type,source
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Children,null00000,missing,subject
1,Criminal law,null00001,missing,subject
2,Gender roles,null00002,missing,subject
3,Law,null00003,missing,subject
4,Police,null00004,missing,subject
...,...,...,...,...
646781,Pregnant teenagers - Great Britain - Diaries,zzzvtr3f,Concept,subject
646782,"Maddox, Lucy",zzzw65be,Person,contributor
646783,"Rockliffe, Richard William.",zzzwgbkn,Person,contributor
646784,Aneurysm - drug therapy,zzzygqw9,Concept,subject


# Use Spacy to extract Person Entities and Dates

In [4]:
import spacy
import pandas as pd
from collections import defaultdict
import re
from tqdm import tqdm

# Ensure curated transformers factory is registered, then attempt to load transformer model
nlp = spacy.load("en_core_web_trf")
print("Loaded transformer model: en_core_web_trf")


# Optional: disable components not needed for speed (keep ner, transformer, tokenizer)
# nlp.disable_pipes(*[p for p in nlp.pipe_names if p not in {"transformer", "ner"}])

def extract_entities_and_dates(doc):
    """Extract person entities and dates from a processed spaCy Doc."""
    persons = []
    dates = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            persons.append(ent.text.strip())
        elif ent.label_ == "DATE":
            dates.append(ent.text.strip())
    return persons, dates

# Additional regex function for dates
DATE_PATTERNS = [
    re.compile(r"\b(\d{4})\s*-\s*(\d{4})\b"),  # 1820-1910
    re.compile(r"\b[bd]\.\s*(\d{4})\b"),        # b. 1820 / d. 1910
    re.compile(r"\b(\d{4})\b"),                  # single year
]

def augment_dates(text, dates):
    found = []
    for pat in DATE_PATTERNS:
        matches = pat.findall(text)
        if matches:
            if isinstance(matches[0], tuple):
                for tup in matches:
                    found.extend([m for m in tup if m])
            else:
                found.extend(matches)
    # merge
    return list({*dates, *found})

# Filter for Person and Agent only (retain prior 1000 sample for speed)
person_agent_data = dedup_data[dedup_data['type'].isin(['Person', 'Agent'])].copy().sample(1000, random_state=42)

print("Original data shape:", dedup_data.shape)
print("Filtered data shape (Person/Agent):", person_agent_data.shape)
print("Type distribution:\n", person_agent_data['type'].value_counts())

texts = person_agent_data['label'].tolist()
index_info = person_agent_data.index.tolist()

print("\nProcessing with model (batched)...")
extraction_results = []

# Use nlp.pipe for efficient batching
for doc, orig_idx in tqdm(zip(nlp.pipe(texts, batch_size=64), index_info), total=len(texts), desc="NER batches"):
    persons, dates = extract_entities_and_dates(doc)
    # dates = augment_dates(doc.text, dates)
    row = person_agent_data.loc[orig_idx]
    result = {
        'original_idx': orig_idx,
        'label': row['label'],
        'persons': list(set(persons)),
        'dates': list(set(dates)),
        'id': row.get('id'),
        'type': row.get('type')
    }
    if 'idx' in row:
        result['idx'] = row['idx']
    extraction_results.append(result)

entity_df = pd.DataFrame(extraction_results)

# Stats
total = len(entity_df)
with_person = (entity_df['persons'].apply(len) > 0).sum()
with_date = (entity_df['dates'].apply(len) > 0).sum()
print(f"\nTotal processed: {total}")
print(f"Records w/ PERSON: {with_person} ({with_person/total*100:.2f}%)")
print(f"Records w/ DATE: {with_date} ({with_date/total*100:.2f}%)")

print("\nSample extractions:")
entity_df.head(100)

Loaded transformer model: en_core_web_trf
Original data shape: (646786, 4)
Filtered data shape (Person/Agent): (1000, 4)
Type distribution:
 type
Person    968
Agent      32
Name: count, dtype: int64

Processing with model (batched)...


NER batches: 100%|██████████| 1000/1000 [00:04<00:00, 213.29it/s]


Total processed: 1000
Records w/ PERSON: 896 (89.60%)
Records w/ DATE: 378 (37.80%)

Sample extractions:





Unnamed: 0,original_idx,label,persons,dates,id,type
0,256490,"Rigdon, R. H.",[Rigdon],[],k5gfn3uv,Person
1,528535,"Ebbell, B. (Bendix), 1865-",[],[],vu85phja,Person
2,475547,"Altini, I.",[Altini],[],txhr3thf,Person
3,494330,"Clauder, Gabriel Zaccharias.","[Gabriel Zaccharias, Clauder]",[],um899tjd,Person
4,171855,"Hayes, Jane E.","[Hayes, Jane E.]",[],g4n48qae,Person
...,...,...,...,...,...,...
95,600838,"Dolbeau, Henri Ferdinand, 1830-1877.","[Dolbeau, Henri Ferdinand]",[1830-1877],ydg3gbam,Person
96,214340,"Beattie, William, 1793-1875","[William, Beattie]",[1793-1875],hmuzweka,Person
97,518001,"Bradwell, W.",[],[],vfjn7ge9,Person
98,125349,"Black, Lois Fischer.",[Lois Fischer],[],ef4y3quh,Person


In [6]:
# Analyze the extracted entities in more detail

print("=== ENTITY EXTRACTION ANALYSIS ===\n")

# Show records with person entities
print("Examples of records with person entities:")
person_records = entity_df[entity_df['persons'].apply(len) > 0]
for i, row in person_records.head(10).iterrows():
    print(f"Label: {row['label']}")
    print(f"Persons: {row['persons']}")
    print(f"Dates: {row['dates']}")
    print("-" * 50)

print("\nExamples of records with date entities:")
date_records = entity_df[entity_df['dates'].apply(len) > 0]
for i, row in date_records.head(10).iterrows():
    print(f"Label: {row['label']}")
    print(f"Persons: {row['persons']}")
    print(f"Dates: {row['dates']}")
    print("-" * 50)

# Most common person entities
print("\nMost common person entities:")
all_persons = []
for persons_list in entity_df['persons']:
    all_persons.extend(persons_list)

person_counts = pd.Series(all_persons).value_counts()
print(person_counts.head(20))

# Most common dates
print("\nMost common dates:")
all_dates = []
for dates_list in entity_df['dates']:
    all_dates.extend(dates_list)

date_counts = pd.Series(all_dates).value_counts()
print(date_counts.head(20))

=== ENTITY EXTRACTION ANALYSIS ===

Examples of records with person entities:
Label: Nelson, Geoffrey B. (Geoffrey Brian)
Persons: ['Geoffrey B.', 'Geoffrey Brian', 'Nelson']
Dates: []
--------------------------------------------------
Label: Jones, John E
Persons: ['John E']
Dates: []
--------------------------------------------------
Label: Jones, John E.
Persons: ['John E.']
Dates: []
--------------------------------------------------
Label: Zimmermann, Wilhelm.
Persons: ['Wilhelm']
Dates: []
--------------------------------------------------
Label: Orel, Harold, 1926-2017.
Persons: ['Harold']
Dates: ['1926', '1926-2017', '2017']
--------------------------------------------------
Label: Brown, Bruce, 1942-
Persons: ['Bruce']
Dates: ['1942']
--------------------------------------------------
Label: Shugar, Andrea.
Persons: ['Andrea']
Dates: []
--------------------------------------------------
Label: Bruce (Edinburgh, Scotland)
Persons: ['Bruce']
Dates: []
---------------------------

In [7]:
# Analyze entity extraction results by record type
print("=== ENTITY EXTRACTION BY TYPE ===\n")

# Group by type and analyze
type_analysis = entity_df.groupby('type').agg({
    'persons': lambda x: (x.apply(len) > 0).sum(),  # Count records with persons
    'dates': lambda x: (x.apply(len) > 0).sum(),    # Count records with dates
    'label': 'count'                                # Total count
}).rename(columns={'label': 'total_records'})

type_analysis['person_percentage'] = (type_analysis['persons'] / type_analysis['total_records'] * 100).round(2)
type_analysis['date_percentage'] = (type_analysis['dates'] / type_analysis['total_records'] * 100).round(2)

print("Entity extraction statistics by record type:")
print(type_analysis)

print("\n=== EXAMPLES BY TYPE ===")

# Show examples for Person and Agent types
for record_type in ['Person', 'Agent']:
    if record_type in entity_df['type'].values:
        print(f"\n--- {record_type.upper()} TYPE EXAMPLES ---")
        type_records = entity_df[entity_df['type'] == record_type]
        
        # Show records with both persons and dates
        both_entities = type_records[
            (type_records['persons'].apply(len) > 0) & 
            (type_records['dates'].apply(len) > 0)
        ]
        
        print(f"Records with both person entities and dates: {len(both_entities)}")
        for i, row in both_entities.head(5).iterrows():
            print(f"  Label: {row['label']}")
            print(f"  Persons: {row['persons']}")
            print(f"  Dates: {row['dates']}")
            print()

# Save the entity_df for later use
print("Entity extraction DataFrame 'entity_df' is ready for further analysis.")

=== ENTITY EXTRACTION BY TYPE ===

Entity extraction statistics by record type:
              persons   dates  total_records  person_percentage  \
type                                                              
Agent            8547    6866          12116              70.54   
Concept          8429   10330         142820               5.90   
GenreConcept       63       7           1223               5.15   
Meeting          1095    6169           6431              17.03   
Organisation    12602    2558          66913              18.83   
Period              6     169            198               3.03   
Person         297136  167829         402605              73.80   
Place            2166    1726          14472              14.97   
missing             0       0              8               0.00   

              date_percentage  
type                           
Agent                   56.67  
Concept                  7.23  
GenreConcept             0.57  
Meeting               

# Embed dataset

In [None]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("pyspark_alto_text_chunking")
    # .master("local[4]")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    .config("spark.sql.orc.enableVectorizedReader", "false")
    .config("spark.sql.parquet.columnarReaderBatchSize", "256")
    .config("spark.sql.orc.columnarReaderBatchSize", "256")
    .config("spark.sql.shuffle.partitions", "1024")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/13 13:54:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Create Spark DataFrame with specified columns from dedup_data
from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Create Spark DataFrame from pandas DataFrame
spark_df = spark.createDataFrame(dedup_data.reset_index())

# Create the DataFrame with the specified columns
# Using 'label' as chunk_text, 'idx' as chunk_index, and adding id and total_chunks
from pyspark.sql.window import Window

# First create a window specification to group by id
window_spec = Window.partitionBy("id")

name_rec_for_embedding_df = spark_df.select(
    col("id").alias("id"),  # Using id as id
    col("label").alias("chunk_text"),  # Using label as chunk_text
    col("idx").alias("chunk_index"),  # Using idx as chunk_index
    F.count("*").over(window_spec).alias("total_chunks")  # Count of records grouped by id
)

# Show the structure and some sample data
name_rec_for_embedding_df.printSchema()
name_rec_for_embedding_df.show(10, truncate=False)

root
 |-- id: string (nullable = true)
 |-- chunk_text: string (nullable = true)
 |-- chunk_index: long (nullable = true)
 |-- total_chunks: long (nullable = false)



                                                                                

+--------+-------------------------------------------+-----------+------------+
|id      |chunk_text                                 |chunk_index|total_chunks|
+--------+-------------------------------------------+-----------+------------+
|a224b9mp|Jones, John E                              |11         |2           |
|a224b9mp|Jones, John E.                             |12         |2           |
|a22hh3y8|Cheeseman, Thomas, 1760-approximately 1835.|22         |1           |
|a22jzkjd|Prujean, Thomas, 1622 or 1623-1662.        |24         |1           |
|a22ncx9p|Ideler, Julius Ludwig, 1809-1842           |27         |1           |
|a22xqhjs|Hernia, Inguinal, therapy                  |31         |1           |
|a22xr2nw|Poor families                              |32         |1           |
|a22xt94e|Scrimshaw, Jane, 1584-1711.                |33         |1           |
|a232zjf3|Foster, William                            |34         |3           |
|a232zjf3|Foster, William.              

In [6]:
name_rec_for_embedding_df.write.saveAsTable("name_rec_for_embedding")

                                                                                

In [8]:
name_rec_for_embedding_df = spark.table("name_rec_for_embedding")

# Embedding

See: `scripts/run_name_rec_embed.sh`

In [64]:
# Uncomment delete tables to restart embedding
for df_idx in range(0, 8):
    table_name = f"name_rec_embeddings_{df_idx}"
    result = spark.sql(f"""DROP TABLE IF EXISTS {table_name}""").collect()
    print(f"Dropped table {table_name}: {result}")

Dropped table name_rec_embeddings_0: []
Dropped table name_rec_embeddings_1: []
Dropped table name_rec_embeddings_2: []
Dropped table name_rec_embeddings_3: []
Dropped table name_rec_embeddings_4: []
Dropped table name_rec_embeddings_5: []
Dropped table name_rec_embeddings_6: []
Dropped table name_rec_embeddings_7: []


## Debug GPU emedding difference for F.N.

In [76]:
ng_rows = dedup_data[dedup_data['label'].str.contains(
    "florence", case=False, na=False) & dedup_data['label'].str.contains(
    "night", case=False, na=False)][3:]
ng_rows

Unnamed: 0_level_0,label,id,type,source
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
87519,"Nightingale, Florence, 1820-1910. Notes on nur...",d4rsp2n3,Agent,contributor
87900,"Florence Nightingale Museum (London, England)",d56fkgss,Organisation,subject
91691,École Florence Nightingale de Bordeaux,d9d45nkh,Organisation,contributor
98931,National Florence Nightingale Memorial Committ...,dhayqztj,Organisation,contributor
136957,"Nightingale, Florence (1820-1910)",euzka9gm,Agent,contributor
136958,"Nightingale, Florence (1820-1910)",euzka9gm,Organisation,subject
160627,"Florence Nightingale Hospital (London, England)",fqb5m3hw,Organisation,subject
160628,"Florence Nightingale Hospital (London, England)",fqb5m3hw,Organisation,contributor
184753,"Nightingale, Florence, 1820-1910",gk2eca5r,Person,subject
184754,"Nightingale, Florence, 1820-1910",gk2eca5r,Person,contributor


In [96]:
from wc_simd.embed import Qwen3Embedding

embeddings_model = Qwen3Embedding(
    endpoint="http://ec2-3-231-68-18.compute-1.amazonaws.com:8080/embed")

instruction = "Given a search query with a person's name, retrieve relevant passages that has the person mentioned."

# Get embeddings for the batch
embedding_vectors = embeddings_model.get_embeddings(
    ng_rows["label"][:32], is_query=False, instruction=instruction)

In [97]:
ng_rows["embedding"] = embedding_vectors

In [98]:
ng_rows

Unnamed: 0_level_0,label,id,type,source,embedding
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
87519,"Nightingale, Florence, 1820-1910. Notes on nur...",d4rsp2n3,Agent,contributor,"[-0.017157698, 0.0016508125, -0.01417414, -0.0..."
87900,"Florence Nightingale Museum (London, England)",d56fkgss,Organisation,subject,"[0.0020602965, 0.006439898, -0.0057806033, -0...."
91691,École Florence Nightingale de Bordeaux,d9d45nkh,Organisation,contributor,"[0.016392738, -0.037444912, -0.009886473, 0.04..."
98931,National Florence Nightingale Memorial Committ...,dhayqztj,Organisation,contributor,"[0.029678939, 0.023072772, -0.008009571, -0.03..."
136957,"Nightingale, Florence (1820-1910)",euzka9gm,Agent,contributor,"[-0.029406318, 0.081306584, -0.0012775987, 0.0..."
136958,"Nightingale, Florence (1820-1910)",euzka9gm,Organisation,subject,"[-0.029385742, 0.08017875, -0.0012722877, 0.00..."
160627,"Florence Nightingale Hospital (London, England)",fqb5m3hw,Organisation,subject,"[0.00024022482, 0.018366719, -0.00536159, -0.0..."
160628,"Florence Nightingale Hospital (London, England)",fqb5m3hw,Organisation,contributor,"[0.00029090588, 0.018840186, -0.0053290585, -0..."
184753,"Nightingale, Florence, 1820-1910",gk2eca5r,Person,subject,"[-0.03361421, -0.027411293, -0.005647615, -0.0..."
184754,"Nightingale, Florence, 1820-1910",gk2eca5r,Person,contributor,"[-0.033514343, -0.027920393, -0.005565158, -0...."


In [99]:
from sklearn.metrics.pairwise import cosine_similarity
import ast
import numpy as np

# Convert string representations of arrays to actual numpy arrays
def parse_embedding(embedding):
    try:
        # Try to parse as a list first
        if isinstance(embedding, str):
            return np.array(ast.literal_eval(embedding))
        else:
            return np.array(embedding)
    except:
        # If parsing fails, assume it's already a numpy array or list
        return np.array(embedding)


embeddings = np.vstack([parse_embedding(emb) for emb in ng_rows["embedding"]])
print(f"Embeddings shape: {embeddings.shape}")

cosine_sim_matrix = cosine_similarity(embeddings)
print(f"Cosine similarity matrix shape: {cosine_sim_matrix.shape}")
cosine_sim_matrix

Embeddings shape: (32, 1024)
Cosine similarity matrix shape: (32, 32)


array([[1.        , 0.29939434, 0.46121056, ..., 0.32919915, 0.46635306,
        0.53332405],
       [0.29939434, 1.        , 0.40520677, ..., 0.55595252, 0.5387129 ,
        0.53216024],
       [0.46121056, 0.40520677, 1.        , ..., 0.48274531, 0.49147017,
        0.45739367],
       ...,
       [0.32919915, 0.55595252, 0.48274531, ..., 1.        , 0.38668774,
        0.4015409 ],
       [0.46635306, 0.5387129 , 0.49147017, ..., 0.38668774, 1.        ,
        0.55390456],
       [0.53332405, 0.53216024, 0.45739367, ..., 0.4015409 , 0.55390456,
        1.        ]])

In [100]:
import pandas as pd

# Create a DataFrame that shows similarity between 2 labels
similarity_df = pd.DataFrame(
    cosine_sim_matrix,
    columns=ng_rows['label'].values,
    index=ng_rows['label'].values)
similarity_df

Unnamed: 0,"Nightingale, Florence, 1820-1910. Notes on nursing. Dutch.","Florence Nightingale Museum (London, England)",École Florence Nightingale de Bordeaux,National Florence Nightingale Memorial Committee of Great Britain and Northern Ireland. Dan Mason Nursing Research Committee,"Nightingale, Florence (1820-1910)","Nightingale, Florence (1820-1910).1","Florence Nightingale Hospital (London, England)","Florence Nightingale Hospital (London, England).1","Nightingale, Florence, 1820-1910","Nightingale, Florence, 1820-1910.1",...,Florence Nightingale International Foundation,Florence Nightingale International Foundation.,Florence Nightingale Museum,Florence Nightingale Museum.1,Florence Nightingale Museum.,Florence Nightingale Foundation (1934-),Florence Nightingale School of Nursing and Midwifery,Florence Nightingale International Foundation.1,FLORENCE NIGHTINGALE,"Nightingale, Florence, (former owner), 1820-1910."
"Nightingale, Florence, 1820-1910. Notes on nursing. Dutch.",1.0,0.299394,0.461211,0.501166,0.354736,0.356022,0.336255,0.335947,0.515018,0.515218,...,0.329199,0.270393,0.423367,0.423367,0.453109,0.493141,0.580352,0.329199,0.466353,0.533324
"Florence Nightingale Museum (London, England)",0.299394,1.0,0.405207,0.583109,0.354981,0.355222,0.835939,0.836827,0.368417,0.368358,...,0.555953,0.574445,0.469647,0.469647,0.594685,0.618386,0.568544,0.555953,0.538713,0.53216
École Florence Nightingale de Bordeaux,0.461211,0.405207,1.0,0.470691,0.335089,0.335743,0.47994,0.479285,0.497699,0.496923,...,0.482745,0.408447,0.519192,0.519192,0.54178,0.548379,0.556267,0.482745,0.49147,0.457394
National Florence Nightingale Memorial Committee of Great Britain and Northern Ireland. Dan Mason Nursing Research Committee,0.501166,0.583109,0.470691,1.0,0.451522,0.451797,0.585232,0.584612,0.443564,0.443677,...,0.561311,0.544117,0.4414,0.4414,0.512256,0.716284,0.627506,0.561311,0.471644,0.58897
"Nightingale, Florence (1820-1910)",0.354736,0.354981,0.335089,0.451522,1.0,0.999969,0.348016,0.347438,0.543553,0.543648,...,0.317541,0.3166,0.302966,0.302966,0.398599,0.479294,0.448854,0.317541,0.387252,0.546627
"Nightingale, Florence (1820-1910)",0.356022,0.355222,0.335743,0.451797,0.999969,1.0,0.348335,0.347758,0.545175,0.54527,...,0.317624,0.316555,0.304037,0.304037,0.399175,0.47974,0.449381,0.317624,0.387903,0.547147
"Florence Nightingale Hospital (London, England)",0.336255,0.835939,0.47994,0.585232,0.348016,0.348335,1.0,0.999972,0.415304,0.415097,...,0.556487,0.572789,0.471343,0.471343,0.617661,0.676102,0.608711,0.556487,0.595556,0.55801
"Florence Nightingale Hospital (London, England)",0.335947,0.836827,0.479285,0.584612,0.347438,0.347758,0.999972,1.0,0.415152,0.41494,...,0.556127,0.572159,0.471023,0.471023,0.617005,0.675552,0.608104,0.556127,0.595424,0.558058
"Nightingale, Florence, 1820-1910",0.515018,0.368417,0.497699,0.443564,0.543553,0.545175,0.415304,0.415152,1.0,0.999972,...,0.293557,0.306915,0.388291,0.388291,0.475951,0.523677,0.553616,0.293557,0.616393,0.608255
"Nightingale, Florence, 1820-1910",0.515218,0.368358,0.496923,0.443677,0.543648,0.54527,0.415097,0.41494,0.999972,1.0,...,0.293332,0.306862,0.387884,0.387884,0.475919,0.523257,0.553693,0.293332,0.616488,0.608412


# Post Embedding

## GPU embeds

In [49]:
name_rec_embeddings_tables_df = spark.sql("show tables").where(
    "tableName RLIKE 'name_rec_embeddings_[0-9]+$'").toPandas()
name_rec_embeddings_tables_df

Unnamed: 0,namespace,tableName,isTemporary
0,default,name_rec_embeddings_0,False
1,default,name_rec_embeddings_1,False
2,default,name_rec_embeddings_2,False
3,default,name_rec_embeddings_3,False
4,default,name_rec_embeddings_4,False
5,default,name_rec_embeddings_5,False
6,default,name_rec_embeddings_6,False
7,default,name_rec_embeddings_7,False


In [None]:
dedup_data.shape

(646786, 4)

In [57]:

name_rec_embeddings_df = None

for table in name_rec_embeddings_tables_df["tableName"]:
    print(f"Processing table: {table}")
    df = spark.table(table)

    if name_rec_embeddings_df is None:
        name_rec_embeddings_df = df
    else:
        name_rec_embeddings_df = name_rec_embeddings_df.union(df)

Processing table: name_rec_embeddings_0
Processing table: name_rec_embeddings_1
Processing table: name_rec_embeddings_2
Processing table: name_rec_embeddings_3
Processing table: name_rec_embeddings_4
Processing table: name_rec_embeddings_5
Processing table: name_rec_embeddings_6
Processing table: name_rec_embeddings_7


In [22]:
assert(name_rec_embeddings_df.count() == dedup_data.shape[0])

In [58]:
name_rec_embeddings_df.show()

+--------+--------------------+-----------+------------+--------------------+
|      id|          chunk_text|chunk_index|total_chunks|           embedding|
+--------+--------------------+-----------+------------+--------------------+
|wd9jjfvy|     Animal products|     544024|           2|[-0.038852077, 0....|
|ja4pa5ms|      Watson, David.|     232476|           1|[-0.004940854, 0....|
|urjn2gpa|      Medicine, Rome|     498345|           1|[-0.022078881, 0....|
|fupqdhft|McGill University...|     164605|           2|[-0.028227717, -0...|
|pzuwx5vu|           Tamoxifen|     365383|           1|[-0.047270138, -2...|
|nmadupcc|  Chun, Malcolm Nāea|     325738|           1|[0.0093300855, -0...|
|pm9bf7ds| March, C. G., 1941-|     353821|           1|[0.012294825, 0.0...|
|exmuh6yb|Witness bearing (...|     139394|           1|[-3.9720256E-4, 0...|
|cbt5xur8|Felgenhauer, Paul...|      65497|           1|[-0.011304881, 0....|
|xwwsebfd| Krackow, Kenneth A.|     587305|           1|[0.01581

In [59]:
# Find rows with chunk_text containing "Florence" and "Night"
from pyspark.sql.functions import col, lower

# Filter rows where chunk_text contains both "Florence" and "Night" (case-insensitive)
florence_night_df = name_rec_embeddings_df.filter(
    lower(col("chunk_text")).contains("florence") & 
    lower(col("chunk_text")).contains("night")
)

# Show the results
print(f"Found {florence_night_df.count()} rows containing both 'Florence' and 'Night'")
florence_night_df.select(
    "id",
    "chunk_text",
    "chunk_index").show(
        truncate=False)

Found 35 rows containing both 'Florence' and 'Night'
+--------+----------------------------------------------------------------------------------------------------------------------------+-----------+
|id      |chunk_text                                                                                                                  |chunk_index|
+--------+----------------------------------------------------------------------------------------------------------------------------+-----------+
|gk2eca5r|Nightingale, Florence, 1820-1910 Homes and haunts.                                                                          |184755     |
|gk2eca5r|Nightingale, Florence, 1820-1910.                                                                                           |184756     |
|qk29p79b|Florence Nightingale Hospital for Gentlewomen (London, England)                                                             |381010     |
|d9d45nkh|École Florence Nightingale de Bordeaux           

In [60]:
import numpy as np

florence_night_df_pd = florence_night_df.toPandas()
florence_night_df_pd

                                                                                

Unnamed: 0,id,chunk_text,chunk_index,total_chunks,embedding
0,gk2eca5r,"Nightingale, Florence, 1820-1910 Homes and hau...",184755,5,"[-0.0027343807741999626, 0.12512944638729095, ..."
1,gk2eca5r,"Nightingale, Florence, 1820-1910.",184756,5,"[0.015180406160652637, 0.03359244763851166, -0..."
2,qk29p79b,Florence Nightingale Hospital for Gentlewomen ...,381010,2,"[0.02689625322818756, 0.12099285423755646, -0...."
3,d9d45nkh,École Florence Nightingale de Bordeaux,91691,1,"[0.009793026372790337, -0.013638803735375404, ..."
4,x8pvu9wy,FLORENCE NIGHTINGALE,567908,1,"[0.07302630692720413, 0.037123311311006546, -0..."
5,qk29p79b,Florence Nightingale Hospital for Gentlewomen ...,381011,2,"[0.02677125483751297, 0.120817169547081, -0.00..."
6,gk2eca5r,"Nightingale, Florence, 1820-1910.",184757,5,"[0.015016713179647923, 0.033464398235082626, -..."
7,gk2eca5r,"Nightingale, Florence, 1820-1910",184754,5,"[0.015207616612315178, 0.03440776839852333, -0..."
8,euzka9gm,"Nightingale, Florence (1820-1910)",136957,2,"[-0.0021251621656119823, 0.07694815844297409, ..."
9,gr7m4ere,Florence Nightingale Commemoration Committee.,189427,1,"[-0.022971369326114655, -0.027769610285758972,..."


In [61]:
# Check the type and format of embeddings
print("Embedding column info:")
print(f"Type: {type(florence_night_df_pd['embedding'].iloc[0])}")
print(f"Sample embedding: {florence_night_df_pd['embedding'].iloc[0][:100]}...")  # First 100 chars
print(f"Number of rows: {len(florence_night_df_pd)}")

Embedding column info:
Type: <class 'list'>
Sample embedding: [-0.0027343807741999626, 0.12512944638729095, -0.010198501870036125, 0.057417649775743484, -0.06350371241569519, -0.010407166555523872, -0.11817394196987152, -0.03131713718175888, -0.049140606075525284, 0.041767776012420654, -0.030621588230133057, -0.036481596529483795, 0.08562219887971878, -0.00604693777859211, -0.04009845480322838, 0.081588014960289, -0.03199530020356178, 0.05077514797449112, 0.0023213981185108423, 0.00620343629270792, -0.034690555185079575, 0.009815949015319347, 0.07261541485786438, 0.03867257758975029, -0.019684063270688057, 0.003129974938929081, 0.01744960993528366, 0.014015331864356995, 0.0058165364898741245, -0.0061860475689172745, 0.009928976185619831, 0.046184517443180084, -0.0037733586505055428, -0.029178321361541748, -0.014458744786679745, -0.004153737332671881, 0.018501630052924156, 0.05338345840573311, 0.011320075951516628, 0.046184517443180084, -0.007968394085764885, -0.003968982025980949, -0.0

In [62]:
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Convert string representations of arrays to actual numpy arrays
def parse_embedding(embedding):
    try:
        # Try to parse as a list first
        if isinstance(embedding, str):
            return np.array(ast.literal_eval(embedding))
        else:
            return np.array(embedding)
    except:
        # If parsing fails, assume it's already a numpy array or list
        return np.array(embedding)

embeddings = np.vstack([parse_embedding(emb) for emb in florence_night_df_pd["embedding"]])
print(f"Embeddings shape: {embeddings.shape}")

cosine_sim_matrix = cosine_similarity(embeddings)
print(f"Cosine similarity matrix shape: {cosine_sim_matrix.shape}")
cosine_sim_matrix

Embeddings shape: (35, 1024)
Cosine similarity matrix shape: (35, 35)


array([[1.        , 0.755241  , 0.72153407, ..., 0.48712544, 0.47871777,
        0.58824446],
       [0.755241  , 1.        , 0.64019535, ..., 0.48507645, 0.51425339,
        0.62992338],
       [0.72153407, 0.64019535, 1.        , ..., 0.51724851, 0.56311921,
        0.69393391],
       ...,
       [0.48712544, 0.48507645, 0.51724851, ..., 1.        , 0.84690574,
        0.59366733],
       [0.47871777, 0.51425339, 0.56311921, ..., 0.84690574, 1.        ,
        0.63754622],
       [0.58824446, 0.62992338, 0.69393391, ..., 0.59366733, 0.63754622,
        1.        ]])

In [63]:
import pandas as pd

# Create a DataFrame that shows similarity between 2 labels
similarity_df = pd.DataFrame(
    cosine_sim_matrix,
    columns=florence_night_df_pd['chunk_text'].values,
    index=florence_night_df_pd['chunk_text'].values)
similarity_df

Unnamed: 0,"Nightingale, Florence, 1820-1910 Homes and haunts.","Nightingale, Florence, 1820-1910.","Florence Nightingale Hospital for Gentlewomen (London, England)",École Florence Nightingale de Bordeaux,FLORENCE NIGHTINGALE,"Florence Nightingale Hospital for Gentlewomen (London, England).1","Nightingale, Florence, 1820-1910..1","Nightingale, Florence, 1820-1910","Nightingale, Florence (1820-1910)",Florence Nightingale Commemoration Committee.,...,"Florence Nightingale Hospital (London, England)","Florence Nightingale Hospital (London, England).1",Florence Nightingale International Foundation,"Nightingale, Florence, 1820-1910. Notes on nursing.","Nightingale, Florence, 1820-1910..2",Florence Nightingale International Foundation.,Florence Nightingale Museum,Florence Nightingale Museum.,Florence Nightingale Museum.1,Florence Nightingale School of Nursing and Midwifery
"Nightingale, Florence, 1820-1910 Homes and haunts.",1.0,0.755241,0.721534,0.543137,0.585522,0.721362,0.75523,0.698889,0.667694,0.565393,...,0.681779,0.681779,0.666528,0.70654,0.754986,0.588776,0.479296,0.487125,0.478718,0.588244
"Nightingale, Florence, 1820-1910.",0.755241,1.0,0.640195,0.624907,0.614692,0.640156,0.999988,0.909452,0.866995,0.626476,...,0.643343,0.643343,0.653531,0.679596,0.999984,0.583067,0.51493,0.485076,0.514253,0.629923
"Florence Nightingale Hospital for Gentlewomen (London, England)",0.721534,0.640195,1.0,0.578257,0.608591,0.999984,0.639703,0.598763,0.585192,0.615287,...,0.886353,0.886353,0.715749,0.65583,0.639671,0.594685,0.563793,0.517249,0.563119,0.693934
École Florence Nightingale de Bordeaux,0.543137,0.624907,0.578257,1.0,0.644736,0.577523,0.625224,0.685661,0.578211,0.693848,...,0.661376,0.661376,0.721226,0.452152,0.625345,0.660038,0.565326,0.577608,0.564446,0.596392
FLORENCE NIGHTINGALE,0.585522,0.614692,0.608591,0.644736,1.0,0.608062,0.614742,0.608255,0.563659,0.605466,...,0.668025,0.668025,0.651552,0.468074,0.615047,0.574205,0.526773,0.514922,0.526248,0.573662
"Florence Nightingale Hospital for Gentlewomen (London, England)",0.721362,0.640156,0.999984,0.577523,0.608062,1.0,0.639664,0.598366,0.584982,0.614713,...,0.885725,0.885725,0.715417,0.656155,0.639632,0.593976,0.563396,0.516481,0.562728,0.694117
"Nightingale, Florence, 1820-1910.",0.75523,0.999988,0.639703,0.625224,0.614742,0.639664,1.0,0.909191,0.866611,0.62642,...,0.643027,0.643027,0.653407,0.678611,0.999987,0.583054,0.514741,0.485092,0.514067,0.629574
"Nightingale, Florence, 1820-1910",0.698889,0.909452,0.598763,0.685661,0.608255,0.598366,0.909191,1.0,0.851561,0.589629,...,0.637651,0.637651,0.646041,0.631493,0.909449,0.551533,0.505293,0.453351,0.504575,0.573909
"Nightingale, Florence (1820-1910)",0.667694,0.866995,0.585192,0.578211,0.563659,0.584982,0.866611,0.851561,1.0,0.54738,...,0.580134,0.580134,0.569601,0.63082,0.866841,0.513009,0.479451,0.430253,0.47859,0.577097
Florence Nightingale Commemoration Committee.,0.565393,0.626476,0.615287,0.693848,0.605466,0.614713,0.62642,0.589629,0.54738,1.0,...,0.642176,0.642176,0.742734,0.560679,0.626911,0.722634,0.561568,0.562731,0.559916,0.675032


## CPU embeds

In [217]:
import numpy as np
# embeddings = np.load("../data/name_rec_embeddings.npy")
embeddings = np.load("../data/name_rec_embeddings_no_prompt.npy")

In [218]:
embeddings.shape

(646786, 1024)

In [219]:
dedup_data["embedding"] = embeddings.tolist()

In [6]:
dedup_data

Unnamed: 0_level_0,label,id,type,source,embedding
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Children,null00000,missing,subject,"[-0.026071306318044662, 0.06877781450748444, -..."
1,Criminal law,null00001,missing,subject,"[0.010340402834117413, -0.03877517580986023, -..."
2,Gender roles,null00002,missing,subject,"[-0.016230687499046326, -0.03738464042544365, ..."
3,Law,null00003,missing,subject,"[-0.028498228639364243, -0.027923623099923134,..."
4,Police,null00004,missing,subject,"[0.07215391099452972, -0.08754857629537582, -0..."
...,...,...,...,...,...
646781,Pregnant teenagers - Great Britain - Diaries,zzzvtr3f,Concept,subject,"[0.008183090016245842, 0.05751565471291542, -0..."
646782,"Maddox, Lucy",zzzw65be,Person,contributor,"[-0.00245485408231616, 0.045995958149433136, -..."
646783,"Rockliffe, Richard William.",zzzwgbkn,Person,contributor,"[-0.005838651675730944, 0.020258137956261635, ..."
646784,Aneurysm - drug therapy,zzzygqw9,Concept,subject,"[0.05733156576752663, 0.043670717626810074, -0..."


## Test CPU embed F.N. cases

In [220]:
ng_rows = dedup_data[dedup_data['label'].str.contains(
    "florence", case=False, na=False) & dedup_data['label'].str.contains(
    "night", case=False, na=False)]
ng_rows

Unnamed: 0_level_0,label,id,type,source,embedding
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
87516,"Nightingale, Florence, 1820-1910.",d4rsp2n3,Person,contributor,"[0.002191835083067417, 0.08153051882982254, -0..."
87517,"Nightingale, Florence, 1820-1910. Notes on nur...",d4rsp2n3,Agent,contributor,"[-0.0026591746136546135, 0.06633958965539932, ..."
87518,"Nightingale, Florence, 1820-1910. Notes on nur...",d4rsp2n3,Person,subject,"[-0.0026591746136546135, 0.06633958965539932, ..."
87519,"Nightingale, Florence, 1820-1910. Notes on nur...",d4rsp2n3,Agent,contributor,"[-0.0023670648224651814, 0.06425203382968903, ..."
87900,"Florence Nightingale Museum (London, England)",d56fkgss,Organisation,subject,"[-0.035757485777139664, 0.07672874629497528, -..."
91691,École Florence Nightingale de Bordeaux,d9d45nkh,Organisation,contributor,"[-0.0012424219166859984, -0.014957122504711151..."
98931,National Florence Nightingale Memorial Committ...,dhayqztj,Organisation,contributor,"[-0.042052261531353, 0.03748699650168419, -0.0..."
136957,"Nightingale, Florence (1820-1910)",euzka9gm,Agent,contributor,"[0.013298776932060719, 0.0672890841960907, -0...."
136958,"Nightingale, Florence (1820-1910)",euzka9gm,Organisation,subject,"[0.013298776932060719, 0.0672890841960907, -0...."
160627,"Florence Nightingale Hospital (London, England)",fqb5m3hw,Organisation,subject,"[-0.009974309243261814, 0.06574500352144241, -..."


In [221]:
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Convert string representations of arrays to actual numpy arrays


def parse_embedding(embedding):
    try:
        # Try to parse as a list first
        if isinstance(embedding, str):
            return np.array(ast.literal_eval(embedding))
        else:
            return np.array(embedding)
    except BaseException:
        # If parsing fails, assume it's already a numpy array or list
        return np.array(embedding)

ng_embeds = np.vstack([parse_embedding(emb) for emb in ng_rows["embedding"]])

In [222]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(ng_embeds)

### Emedding WITHOUT Instruct

In [223]:
# Create a DataFrame that shows similarity between 2 labels
# similarities = model.similarity(ng_embeds, ng_embeds)
similarity_df = pd.DataFrame(
    similarities,
    columns=ng_rows['label'].values,
    index=ng_rows['label'].values)
similarity_df

Unnamed: 0,"Nightingale, Florence, 1820-1910.","Nightingale, Florence, 1820-1910. Notes on nursing.","Nightingale, Florence, 1820-1910. Notes on nursing..1","Nightingale, Florence, 1820-1910. Notes on nursing. Dutch.","Florence Nightingale Museum (London, England)",École Florence Nightingale de Bordeaux,National Florence Nightingale Memorial Committee of Great Britain and Northern Ireland. Dan Mason Nursing Research Committee,"Nightingale, Florence (1820-1910)","Nightingale, Florence (1820-1910).1","Florence Nightingale Hospital (London, England)",...,Florence Nightingale International Foundation,Florence Nightingale International Foundation.,Florence Nightingale Museum,Florence Nightingale Museum.1,Florence Nightingale Museum.,Florence Nightingale Foundation (1934-),Florence Nightingale School of Nursing and Midwifery,Florence Nightingale International Foundation.1,FLORENCE NIGHTINGALE,"Nightingale, Florence, (former owner), 1820-1910."
"Nightingale, Florence, 1820-1910.",1.0,0.87879,0.87879,0.846542,0.592273,0.558713,0.506615,0.809979,0.809979,0.602203,...,0.555087,0.611069,0.579427,0.579427,0.632476,0.625412,0.587437,0.555087,0.566876,0.907553
"Nightingale, Florence, 1820-1910. Notes on nursing.",0.87879,1.0,1.0,0.957,0.650182,0.604695,0.616772,0.81274,0.81274,0.668627,...,0.633919,0.674277,0.649361,0.649361,0.674953,0.68874,0.68889,0.633919,0.602078,0.835985
"Nightingale, Florence, 1820-1910. Notes on nursing.",0.87879,1.0,1.0,0.957,0.650182,0.604695,0.616772,0.81274,0.81274,0.668627,...,0.633919,0.674277,0.649361,0.649361,0.674953,0.68874,0.68889,0.633919,0.602078,0.835985
"Nightingale, Florence, 1820-1910. Notes on nursing. Dutch.",0.846542,0.957,0.957,1.0,0.611952,0.558677,0.589008,0.771277,0.771277,0.618846,...,0.583906,0.630195,0.599685,0.599685,0.634164,0.636175,0.633538,0.583906,0.551701,0.819782
"Florence Nightingale Museum (London, England)",0.592273,0.650182,0.650182,0.611952,1.0,0.796152,0.683043,0.750607,0.750607,0.921776,...,0.837673,0.831398,0.978361,0.978361,0.963744,0.822782,0.83064,0.837673,0.789211,0.566701
École Florence Nightingale de Bordeaux,0.558713,0.604695,0.604695,0.558677,0.796152,1.0,0.576516,0.73677,0.73677,0.833881,...,0.786577,0.76612,0.796377,0.796377,0.776883,0.761097,0.869114,0.786577,0.747062,0.515739
National Florence Nightingale Memorial Committee of Great Britain and Northern Ireland. Dan Mason Nursing Research Committee,0.506615,0.616772,0.616772,0.589008,0.683043,0.576516,1.0,0.596684,0.596684,0.665241,...,0.692236,0.713151,0.674504,0.674504,0.685344,0.713266,0.655946,0.692236,0.544391,0.505979
"Nightingale, Florence (1820-1910)",0.809979,0.81274,0.81274,0.771277,0.750607,0.73677,0.596684,1.0,1.0,0.760939,...,0.74228,0.749105,0.758382,0.758382,0.75139,0.767847,0.750491,0.74228,0.834029,0.807996
"Nightingale, Florence (1820-1910)",0.809979,0.81274,0.81274,0.771277,0.750607,0.73677,0.596684,1.0,1.0,0.760939,...,0.74228,0.749105,0.758382,0.758382,0.75139,0.767847,0.750491,0.74228,0.834029,0.807996
"Florence Nightingale Hospital (London, England)",0.602203,0.668627,0.668627,0.618846,0.921776,0.833881,0.665241,0.760939,0.760939,1.0,...,0.862596,0.863235,0.898374,0.898374,0.890234,0.859194,0.877587,0.862596,0.776327,0.574304


### Emedding with Instruct

In [None]:
# Create a DataFrame that shows similarity between 2 labels
# similarities = model.similarity(ng_embeds, ng_embeds)
similarity_df = pd.DataFrame(
    similarities,
    columns=ng_rows['label'].values,
    index=ng_rows['label'].values)
similarity_df

Unnamed: 0,"Nightingale, Florence, 1820-1910.","Nightingale, Florence, 1820-1910. Notes on nursing.","Nightingale, Florence, 1820-1910. Notes on nursing..1","Nightingale, Florence, 1820-1910. Notes on nursing. Dutch.","Florence Nightingale Museum (London, England)",École Florence Nightingale de Bordeaux,National Florence Nightingale Memorial Committee of Great Britain and Northern Ireland. Dan Mason Nursing Research Committee,"Nightingale, Florence (1820-1910)","Nightingale, Florence (1820-1910).1","Florence Nightingale Hospital (London, England)",...,Florence Nightingale International Foundation,Florence Nightingale International Foundation.,Florence Nightingale Museum,Florence Nightingale Museum.1,Florence Nightingale Museum.,Florence Nightingale Foundation (1934-),Florence Nightingale School of Nursing and Midwifery,Florence Nightingale International Foundation.1,FLORENCE NIGHTINGALE,"Nightingale, Florence, (former owner), 1820-1910."
"Nightingale, Florence, 1820-1910.",1.0,0.925263,0.925263,0.890149,0.693618,0.664413,0.512054,0.982165,0.982165,0.722673,...,0.653484,0.658004,0.689266,0.689266,0.69481,0.666157,0.683501,0.653484,0.742062,0.932857
"Nightingale, Florence, 1820-1910. Notes on nursing.",0.925263,1.0,1.0,0.946127,0.697374,0.697171,0.560204,0.914476,0.914476,0.775583,...,0.669717,0.678563,0.691334,0.691334,0.69902,0.683695,0.74464,0.669717,0.69059,0.859519
"Nightingale, Florence, 1820-1910. Notes on nursing.",0.925263,1.0,1.0,0.946127,0.697374,0.697171,0.560204,0.914476,0.914476,0.775583,...,0.669717,0.678563,0.691334,0.691334,0.69902,0.683695,0.74464,0.669717,0.69059,0.859519
"Nightingale, Florence, 1820-1910. Notes on nursing. Dutch.",0.890149,0.946127,0.946127,1.0,0.677562,0.668935,0.542938,0.880504,0.880504,0.740838,...,0.656662,0.66295,0.671826,0.671826,0.677907,0.659768,0.70469,0.656662,0.686907,0.835765
"Florence Nightingale Museum (London, England)",0.693618,0.697374,0.697374,0.677562,1.0,0.694329,0.641882,0.698719,0.698719,0.857168,...,0.727701,0.734381,0.97028,0.97028,0.971059,0.706168,0.708997,0.727701,0.652932,0.676436
École Florence Nightingale de Bordeaux,0.664413,0.697171,0.697171,0.668935,0.694329,1.0,0.545673,0.67344,0.67344,0.748602,...,0.663798,0.666488,0.716675,0.716675,0.713384,0.667457,0.764543,0.663798,0.639627,0.627374
National Florence Nightingale Memorial Committee of Great Britain and Northern Ireland. Dan Mason Nursing Research Committee,0.512054,0.560204,0.560204,0.542938,0.641882,0.545673,1.0,0.511321,0.511321,0.616559,...,0.617645,0.622067,0.63232,0.63232,0.632961,0.604971,0.553886,0.617645,0.493192,0.480599
"Nightingale, Florence (1820-1910)",0.982165,0.914476,0.914476,0.880504,0.698719,0.67344,0.511321,1.0,1.0,0.736177,...,0.6654,0.668257,0.696861,0.696861,0.699334,0.66192,0.688361,0.6654,0.77379,0.924203
"Nightingale, Florence (1820-1910)",0.982165,0.914476,0.914476,0.880504,0.698719,0.67344,0.511321,1.0,1.0,0.736177,...,0.6654,0.668257,0.696861,0.696861,0.699334,0.66192,0.688361,0.6654,0.77379,0.924203
"Florence Nightingale Hospital (London, England)",0.722673,0.775583,0.775583,0.740838,0.857168,0.748602,0.616559,0.736177,0.736177,1.0,...,0.762835,0.774491,0.812412,0.812412,0.80882,0.754154,0.780479,0.762835,0.659891,0.699336


# Faiss Indexing

In [9]:
# make to_index dataframe that only contains rows of type "Person"
to_index = dedup_data[dedup_data['type'].isin(["Person", "Agent"])].reset_index(drop=False)
to_index

Unnamed: 0,idx,label,id,type,source
0,9,"Nelson, Geoffrey B. (Geoffrey Brian)",a223f5a6,Person,contributor
1,10,"Wolff, G.",a2249bxm,Person,contributor
2,11,"Jones, John E",a224b9mp,Person,contributor
3,12,"Jones, John E.",a224b9mp,Person,contributor
4,13,"Hulverscheidt, Marion, 1970-",a224rx5x,Person,contributor
...,...,...,...,...,...
414716,646779,"Visser, H. K. A.",zzzrhwkq,Person,contributor
414717,646780,"Buchanan, Roger B.",zzzuuq8v,Person,contributor
414718,646782,"Maddox, Lucy",zzzw65be,Person,contributor
414719,646783,"Rockliffe, Richard William.",zzzwgbkn,Person,contributor


In [283]:
import faiss
import numpy as np

# Convert embeddings to numpy array if they're in list format
embeddings_array = np.array(to_index["embedding"].tolist()).astype('float32')
# faiss.normalize_L2(embeddings_array)

print(f"Embeddings shape: {embeddings_array.shape}")
print(f"Embedding dimension: {embeddings_array.shape[1]}")

# Create FAISS index - using IndexFlatIP for inner product (cosine similarity)
# For exact search, we can use IndexFlatIP or IndexFlatL2
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatIP(dimension)

# Normalize the embeddings for cosine similarity
faiss.normalize_L2(embeddings_array)

# Add embeddings to the index
index.add(embeddings_array)

print(f"Index created with {index.ntotal} vectors")
print(f"Index type: {type(index)}")

Embeddings shape: (414721, 1024)
Embedding dimension: 1024
Index created with 414721 vectors
Index type: <class 'faiss.swigfaiss_avx2.IndexFlatIP'>


# Load FAISS index

In [10]:
import faiss

# Save the FAISS index to disk for future use
index_path = "../data/name_rec_faiss.index"

# faiss.write_index(index, index_path)
# print(f"FAISS index saved to: {index_path}")

# To load the index later, you can use:
index = faiss.read_index(index_path)
# loaded_embeddings = np.load(embeddings_path)

In [11]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'document']


In [13]:
embed_prompt = "Instruct: Given a search query that contains a person's name, retrieve relevant passages that mention the same person.\nQuery: "

# Function to search for similar embeddings

def search_similar_embeddings(query_text, k=5):
    """
    Search for the k most similar embeddings to a given query text
    """
    # Find the query embedding by matching the text
    query_idx = to_index[to_index['label'] == query_text].index
    
    if len(query_idx) == 0:
        print(f"Query text '{query_text}' not found in dataset")
        return None
    
    # query_idx = query_idx[0]
    # query_embedding = embeddings_array[query_idx:query_idx+1]

    query_embedding = model.encode([query_text], prompt=embed_prompt)
    
    # Search for k most similar embeddings
    similarities, indices = index.search(query_embedding, k)
    
    results = []
    for i, (sim, idx) in enumerate(zip(similarities[0], indices[0])):
        results.append({
            'rank': i + 1,
            'similarity': sim,
            'index': idx,
            'label': to_index.iloc[idx]['label'],
            'id': to_index.iloc[idx]['id']
        })
    
    return results


# Test the search function
# test_query = to_index['label'].iloc[random.randint(0, len(to_index)-1)]  # Use random label as test query
# test_query = "Nightingale, Florence, 1820-1910."  # Use first label as test query
# test_query = "FLORENCE NIGHTINGALE"  # Use first label as test query
# test_query = "James, Susan, 1951-"
test_query = "Freud, Sigmund, 1856-1939"
print(f"Searching for similar embeddings to: '{test_query}'")
results = search_similar_embeddings(test_query, k=100)

if results:
    import pandas as pd
    results_df = pd.DataFrame(results)
    print("\nTop 10 similar embeddings:")
results_df

Searching for similar embeddings to: 'Freud, Sigmund, 1856-1939'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top 10 similar embeddings:


Unnamed: 0,rank,similarity,index,label,id
0,1,0.854170,373591,"Freud, Sigmund, 1856-1939.",xru69728
1,2,0.854170,362515,"Freud, Sigmund, 1856-1939.",x5s96gsv
2,3,0.854170,359059,"Freud, Sigmund, 1856-1939.",wxtz24sc
3,4,0.854170,357045,"Freud, Sigmund, 1856-1939.",wua25kwp
4,5,0.854170,349799,"Freud, Sigmund, 1856-1939.",weyszm42
...,...,...,...,...,...
95,96,0.574617,149992,"Freudenberg, Ernst, 1884-1967.",jceq6pvz
96,97,0.573546,126993,"Freudenberg, Dr.",h3jm35rp
97,98,0.573023,136149,"Freudenberg, Rudolf",hkm8vuf4
98,99,0.573023,83083,"Freudenberg, Rudolf",emuvj3rk


In [None]:
# Quick math to set an initial cutoff

0.8541701 * 0.7

0.59791907

# LLM Reconcile

## Pairwise Reconcile

In [3]:
from wc_simd.llm import ChatSession

def check_same(names):

    session = ChatSession(system_instruction="""You are to be given 2 names.
    If you are certain that they are the same person, respond YES.
    If you are not sure, respond NO.""")

    response = session.send("\n".join(names), stdout=True)

    return response[0]

In [4]:
check_same(["James, Sue, 1952-", "James, Susan, 1951-"])

Assistant: <reasoning>We need to determine if "James, Sue, 1952-" and "James, Susan, 1951-" refer to the same person. The names: "James, Sue" vs "James, Susan". Sue is often a nickname for Susan. The years: 1952- vs 1951-. Different birth years. Could be a discrepancy. The question: "You are to be given 2 names. If you are certain that they are the same person, respond YES. If you are not sure, respond NO." So we need to be certain. Here we have different birth years (1952 vs 1951). That suggests they are likely different individuals. However could be a typo? But we cannot be certain they are same. So answer NO.</reasoning>NO


'NO'

In [5]:
check_same(["Godard, Ernest, 1827-1862.", "Godard Ernest, 1827-1862."])

Assistant: <reasoning>The task: given two names: "Godell, Ernest, 1827-1862." and "Godard Ernest, 1827-1862." Determine if they refer to same person. The first name: "Godell, Ernest, 1827-1862." The second: "Godard Ernest, 1827-1862." The surnames differ: Godell vs Godard. Could be a typo? The years match. The first includes a comma after Godell, then Ernest. The second is "Godard Ernest". The question: Are they the same person? Likely they are the same person but with a spelling variation. However, we need to be certain. Let's think: Is there a known person named Ernest Godell? Not known. Ernest Godard? There is a French painter "Ernest Godard"? Not sure. The years 1827-1862 correspond to a French painter "Ernest Godard"? Actually there is a French painter "Ernest Godard" (1827-1862) maybe. Let's search memory: I recall "Ernest Godard" is a French painter of the 19th century. Not sure about "Godell". Could be a misreading. The question likely expects YES because they are same person w

'YES'

In [42]:
check_same([
    "Swaine, W. E. (William Edward), 1805-",
    "Swaine, William Edward, 1804-1868."
])

INFO:langchain_aws.llms.bedrock:Using Bedrock Invoke API to generate response


Assistant: <reasoning>We need to determine if the two names refer to the same person. Names:

1. "Swaine, W. E. (William Edward), 1805-"
2. "Swan e, William Edward, 1804-1868."

Actually second: "Swaine, William Edward, 1804-1868."

First: "Swaine, W. E. (William Edward), 1805-" (no death year given). Could be same person with slightly different birth year (1804 vs 1805) and missing death year. Both have same surname Swaine, same given names William Edward. Likely same person. The slight discrepancy in birth year could be due to source variation. So answer YES.</reasoning>YES


'YES'

## Batch reconcile

### Interesting cases:
- Freud, Sigmund, 1856-1939

In [3]:
# Function to search for similar embeddings
import random
from wc_simd.llm import ChatSession
import json

def get_similar_embeddings(query_text, k=5):
    """
    Search for the k most similar embeddings to a given query text
    """
    # Find the query embedding by matching the text
    query_idx = to_index[to_index['label'] == query_text].index

    if len(query_idx) == 0:
        print(f"Query text '{query_text}' not found in dataset")
        return None

    # query_idx = query_idx[0]
    # query_embedding = embeddings_array[query_idx:query_idx+1]

    query_embedding = model.encode([query_text], prompt=embed_prompt)

    # Search for k most similar embeddings
    similarities, indices = index.search(query_embedding, k)

    max_similarity = similarities[0][0] if len(similarities) > 0 else 0.0
    cutoff_similarity = max_similarity * 0.7  # Set a cutoff similarity threshold

    results = []
    for i, (sim, idx) in enumerate(zip(similarities[0], indices[0])):
        if sim >= cutoff_similarity:
            results.append({
                'label': to_index.iloc[idx]['label'],
            'index': f"{to_index.iloc[idx]['id']}_{to_index.iloc[idx]['idx']}"  
        })

    # To csv in string buffer
    csv_string = pd.DataFrame(results).to_csv(index=False, header=False, sep=",", encoding="utf-8")
    print(csv_string)

    chat = ChatSession(system_instruction="""# INSTRUCTIONS
You will be given a target name of a person and a CSV string with the following format:

name,index

## Goal
Identify all names in the CSV that can be reconciled to the target name and return their indices.

## Rules
- Take in account initials and dates to disambiguate.
- If there is a match for a full name but no date, we take it that is is disambiguated to a medium degree.
- If the match depends on initials it is disambiguated to a low degree unless there is a date match.
- If the name cannot be disambiguated to a medium degree, do not return the index.
- If the target name is too ambiguous, do not return any indices.

## Output Format
Return a JSON list with the following format:

[idx1, idx2, ...]
}
""")

    result = chat.send(f"Name: {query_text}\nCSV:\n{csv_string}", stdout=True)
    return json.loads(result[0])


# test_query = to_index['label'].iloc[random.randint(0, len(to_index) - 1)]  # Use random label as test query

# test_query = "Freud, Sigmund, 1856-1939"
# test_query = "John"
test_query = "Harris, Michael R., 1941-"
# test_query = "Creutz, Gustaf Filip, Greve, 1731-1785."
# test_query = "Jacobi, G., 1840-1906."

print(f"Searching for similar embeddings to: '{test_query}'")
results = get_similar_embeddings(test_query, k=100)
print(len(results), "results found")
results

Searching for similar embeddings to: 'Harris, Michael R., 1941-'


NameError: name 'model' is not defined

In [17]:
500000 * 3 / 60 / 60 / 24

17.36111111111111

## Alternative FAISS Index Types

For larger datasets, you might want to consider different index types:

- **IndexFlatL2**: Exact L2 distance search (good for smaller datasets)
- **IndexFlatIP**: Exact inner product search (good for cosine similarity)
- **IndexIVFFlat**: Inverted file index for faster approximate search
- **IndexHNSW**: Hierarchical Navigable Small World graph for very fast approximate search
- **IndexLSH**: Locality Sensitive Hashing for binary embeddings

The current implementation uses `IndexFlatIP` for exact cosine similarity search.

In [None]:
# Example: Create an approximate index for larger datasets
# Uncomment and modify as needed for your use case

# # Create IVF index for faster approximate search
# nlist = 100  # number of clusters
# quantizer = faiss.IndexFlatIP(dimension)
# ivf_index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
# 
# # Train the index (required for IVF)
# ivf_index.train(embeddings_array)
# ivf_index.add(embeddings_array)
# 
# # Set search parameters
# ivf_index.nprobe = 10  # number of clusters to search
# 
# print(f"IVF Index created with {ivf_index.ntotal} vectors")

# # Example: Create HNSW index for very fast approximate search
# hnsw_index = faiss.IndexHNSWFlat(dimension, 32)  # 32 is the number of connections
# hnsw_index.add(embeddings_array)
# print(f"HNSW Index created with {hnsw_index.ntotal} vectors")

# Evaluation Significance

In [5]:
# Sample size calculation for estimating a proportion with finite population correction
# Formula (initial, infinite population): n0 = (Z^2 * p * (1 - p)) / E^2
# Finite population correction (FPC): n = n0 / (1 + (n0 - 1)/N)
# Where:
#   Z = Z-score for desired confidence (e.g., 1.96 for 95%)
#   p = estimated proportion (use 0.5 for maximum variance / conservative sample size if unknown)
#   E = desired margin of error (absolute, e.g., 0.05 for ±5%)
#   N = population size (len(to_index))

from math import ceil, sqrt

try:
    from scipy.stats import norm  # Optional, fallback to lookup if not available
    _has_scipy = True
except ImportError:
    _has_scipy = False

CONFIDENCE_Z = {
    0.80: 1.2816,
    0.85: 1.4395,
    0.90: 1.6449,
    0.95: 1.96,
    0.975: 2.24,  # two-sided 95% per tail ~ 0.4875 each (rarely used)
    0.98: 2.3263,
    0.99: 2.5758,
    0.995: 2.8070,
    0.999: 3.2905,
}

def z_for_confidence(confidence_level: float) -> float:
    """Return Z-score for a given two-sided confidence level.
    Uses scipy if available for arbitrary CL; otherwise falls back to lookup dict.
    """
    if _has_scipy:
        # For two-sided interval, tail probability = (1 - CL)/2
        from scipy.stats import norm
        alpha = 1 - confidence_level
        return norm.ppf(1 - alpha/2)
    # Fallback: nearest key in dict
    closest = min(CONFIDENCE_Z.keys(), key=lambda k: abs(k - confidence_level))
    return CONFIDENCE_Z[closest]


def required_sample_size(
    population_size: int,
    confidence_level: float = 0.95,
    margin_of_error: float = 0.05,
    estimated_proportion: float = 0.5,  # p_hat, use 0.5 for the most conservative estimate
    design_effect: float = 1.0,
    min_samples: int = 30,
) -> dict:
    """Compute the required sample size with finite population correction.

    Returns a dict with details.
    - design_effect > 1 inflates size for clustering / complex sampling.
    - estimated_proportion (p) defaults to 0.5 for conservative estimate.
    """
    if not (0 < margin_of_error < 1):
        raise ValueError("margin_of_error should be between 0 and 1 (e.g., 0.05 for 5%)")
    if not (0 < estimated_proportion < 1):
        raise ValueError("estimated_proportion must be in (0,1)")
    if population_size <= 0:
        raise ValueError("population_size must be positive")

    Z = z_for_confidence(confidence_level)
    p = estimated_proportion
    E = margin_of_error

    # Initial (infinite population) sample size
    n0 = (Z**2 * p * (1 - p)) / (E**2)

    # Apply design effect
    n0 *= design_effect

    # Finite population correction
    n = n0 / (1 + (n0 - 1) / population_size)

    # Ensure integer and at least min_samples
    n_ceiled = max(min_samples, ceil(n))

    return {
        "population_size": population_size,
        "confidence_level": confidence_level,
        "margin_of_error": E,
        "estimated_proportion": p,
        "design_effect": design_effect,
        "z_score": Z,
        "n0_infinite": n0,
        "n_fpc": n,
        "required_sample_size": n_ceiled,
    }

# --- Usage ---
N = len(to_index)
scenarios = [
    {"confidence_level": 0.95, "margin_of_error": 0.05, "estimated_proportion": 0.5},
    {"confidence_level": 0.95, "margin_of_error": 0.03, "estimated_proportion": 0.5},
    {"confidence_level": 0.99, "margin_of_error": 0.05, "estimated_proportion": 0.5},
]

import pandas as pd
rows = []
for s in scenarios:
    res = required_sample_size(population_size=N, **s)
    rows.append({
        "confidence": s["confidence_level"],
        "margin_error": s["margin_of_error"],
        "p_est": s["estimated_proportion"],
        "sample_size": res["required_sample_size"],
        "n_fpc": round(res["n_fpc"], 2),
        "n0_infinite": round(res["n0_infinite"], 2),
    })

sample_size_df = pd.DataFrame(rows)
print(f"Population size (N) = {N}")
print("Suggested sample sizes (rounded):")
sample_size_df

Population size (N) = 414721
Suggested sample sizes (rounded):


Unnamed: 0,confidence,margin_error,p_est,sample_size,n_fpc,n0_infinite
0,0.95,0.05,0.5,384,383.79,384.15
1,0.95,0.03,0.5,1065,1064.34,1067.07
2,0.99,0.05,0.5,663,662.43,663.49


In [21]:
# Quick calculation for non trivial reconcilliations

experiment_params = sample_size_df.iloc[1]

# Current ratio
observed_ratio_of_non_full_text_reconcilliations = 145 / 400

target_n = ceil(experiment_params["sample_size"])

samples_needed_for_non_full_text_reconcillations = ceil(target_n / observed_ratio_of_non_full_text_reconcilliations)
print("observed_ratio_of_non_full_text_reconcilliations",
      observed_ratio_of_non_full_text_reconcilliations)
print("target_n", target_n)
print("samples_needed_for_non_full_text_reconcillations",
      samples_needed_for_non_full_text_reconcillations)

confidence = experiment_params["confidence"]
margin_of_error = experiment_params["margin_error"]
p_hat = 0.9

lower_p_hat = p_hat - margin_of_error
upper_p_hat = p_hat + margin_of_error

ci_statement = f"""
With n={target_n:.0f} and an observed p_hat={p_hat},
about 95% of future studies of the same size would produce p_hat between {lower_p_hat} and {upper_p_hat};
and a {(confidence * 100):.0f}% Wilson confidence interval for the true p from this study is [{lower_p_hat}, {upper_p_hat}]
"""

print(ci_statement)

observed_ratio_of_non_full_text_reconcilliations 0.3625
target_n 1065
samples_needed_for_non_full_text_reconcillations 2938

With n=1065 and an observed p_hat=0.9,
about 95% of future studies of the same size would produce p_hat between 0.87 and 0.93;
and a 95% Wilson confidence interval for the true p from this study is [0.87, 0.93]

