In [None]:
import re
import copy
import glob 
import math
import itertools
import jellyfish
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

# Prefix -> namespace-URI map handed to ElementTree find()/findall() so the
# XPath expressions in this notebook can use short prefixes (dflt:, xml:, ...).
ns = {
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'dflt': 'http://www.tei-c.org/ns/1.0',
    'frus': 'http://history.state.gov/frus/ns/1.0',
    'xi': 'http://www.w3.org/2001/XInclude',
}

In [None]:
def extract_person(item, file):
    """Append one entry of a volume's persons list to the global ``person_df``.

    Parameters
    ----------
    item : xml.etree.ElementTree.Element
        An element that is only processed when it has a descendant
        ``persName`` element carrying an ``xml:id`` attribute.
    file : str
        Path of the volume being parsed. ``file[8:-4]`` is used as the volume
        id (presumably this drops a ``volumes/`` prefix and a ``.xml`` suffix;
        confirm against the glob pattern used by the caller).

    Returns
    -------
    None
        A row (``id``, ``name``, ``description``) is concatenated onto the
        module-level ``person_df``; items without a tagged name are ignored.
    """
    global person_df

    tagged_name = item.find('.//dflt:persName[@xml:id]', ns)
    if tagged_name is None:
        return

    raw_name = "".join(tagged_name.itertext())
    xml_id = tagged_name.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # The description is whatever follows the name plus one more character
    # (the separator after the name) in the item's flattened text.
    item_text = "".join(item.itertext())
    description_start = item_text.find(raw_name) + len(raw_name) + 1
    description = " ".join(item_text[description_start:].split())

    # Reverse the ', '-separated parts ("Last, First" -> "First Last"),
    # drop any leftover commas and collapse whitespace.
    reordered = " ".join(reversed(raw_name.split(', ')))
    person_name = " ".join(reordered.replace(',', '').split())

    new_row = pd.DataFrame({'id': [file[8:-4] + '_' + xml_id],
                            'name': [person_name],
                            'description': [description]})
    person_df = pd.concat([person_df, new_row], ignore_index=True)
    return

In [None]:
# Inclusive range of volume start years to process
start_year, end_year = 1952, 1988

# Filled row by row by extract_person()
person_df = pd.DataFrame(columns=['id','name','description'])

for file in glob.glob('volumes/frus*'):
    # Assumes paths look like 'volumes/frusYYYY...': characters 12-15 are
    # read as the volume's start year.
    file_start_year = int(file[12:16])

    if start_year <= file_start_year <= end_year:

        tree = ET.parse(file)
        root = tree.getroot()
        persons_section = root.find("./dflt:text/dflt:front//dflt:div[@xml:id='persons']", ns)

        # Explicit None/length checks instead of `if persons_section:`.
        # Truth-testing an Element is deprecated (it currently means "has
        # children" and is slated to change), so spell the intent out:
        # the section must exist and must not be empty.
        if persons_section is not None and len(persons_section) > 0:
            # persName wrapped in <hi> inside the item
            for item in persons_section.findall('.//dflt:item/dflt:hi/dflt:persName[@xml:id]/../..', ns):
                extract_person(item,file)
            # persName directly under the item
            for item in persons_section.findall('.//dflt:item/dflt:persName[@xml:id]/..', ns):
                extract_person(item,file)
        else:
            print(f'No person annotation in {file}.')

#### step 1: reduce exactly matched names

In [None]:
# Maps each exact name string to {'id_list': [...], 'description_list': [...]}; filled by aux()
unified_person_dict = {}

In [None]:
def aux(row):
    """Accumulate one ``person_df`` row into the global ``unified_person_dict``.

    Rows sharing the same ``row['name']`` are grouped under that name; the
    row's ``id`` and ``description`` are appended to the group's ``id_list``
    and ``description_list``. Returns ``None``.
    """
    global unified_person_dict

    # Create the group on first sight of this name, then append in both cases
    group = unified_person_dict.setdefault(
        row['name'], {'id_list': [], 'description_list': []})
    group['id_list'].append(row['id'])
    group['description_list'].append(row['description'])

    return


In [None]:
# Row-wise pass whose only purpose is aux()'s side effect on unified_person_dict
person_df.apply(aux, axis=1)

In [None]:
# One row per exact name; the dict keys become the 'name' column
unified_person_df = (
    pd.DataFrame.from_dict(unified_person_dict, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'name'})
)

#### step 2: reduce names with exactly the same words in a different order

In [None]:
# Order-insensitive key: the name's whitespace-separated words, sorted and re-joined
unified_person_df['name_set'] = unified_person_df['name'].apply(
    lambda full_name: " ".join(sorted(full_name.split()))
)

In [None]:
# Maps each sorted-word key ('name_set') to the merged name/id/description lists; filled by aux2()
new_unified_person_dict = {}

def aux2(row):
    """Accumulate one ``unified_person_df`` row into ``new_unified_person_dict``.

    Rows sharing the same ``row['name_set']`` are merged: the row's ``name``
    is appended to the group's ``name_list`` and the row's ``id_list`` and
    ``description_list`` are concatenated onto the group's lists.

    The group's lists are always fresh objects. The row's own lists are never
    stored by reference, so merging later rows cannot extend the lists held
    by ``unified_person_df`` / ``unified_person_dict`` in place.

    Returns ``None``.
    """
    global new_unified_person_dict

    group = new_unified_person_dict.setdefault(
        row['name_set'],
        {'name_list': [], 'id_list': [], 'description_list': []})

    group['name_list'].append(row['name'])
    # extend() copies the elements, leaving the caller's lists untouched
    group['id_list'].extend(row['id_list'])
    group['description_list'].extend(row['description_list'])

    return

In [None]:
# Row-wise pass run for aux2()'s side effect on new_unified_person_dict
unified_person_df.apply(aux2, axis=1)

# One row per sorted-word key; the dict keys become the 'name_set' column
new_unified_person_df = (
    pd.DataFrame.from_dict(new_unified_person_dict, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'name_set'})
)

In [None]:
# Observation only: groups that collapsed exactly two distinct name spellings
has_two_names = new_unified_person_df['name_list'].apply(lambda names: len(names) == 2)
new_unified_person_df[has_two_names]

#### step 3: find and reduce near-duplicate names + obvious misspellings

In [None]:
# Matching strategy used below:
#   1. Near-duplicates: names sharing at least 2 words (counting only words of
#      length >= 3) that are also close by edit distance or Jaro-Winkler similarity.
#   2. Misspellings: names within an edit distance of 1, e.g. Ziegler vs Zeigler.

# Caution!!! Examples to watch out for when matching:
# Eliot Jr. L. Theodore, and D. Dwight Eisenhower
# Georges Guay R. vs George Guay R.
# Abrams Creighton General Major W.
# Aharon General Major Yariv

In [None]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer built on the pattern r'\w+'; used by compute_exact_word_overlap()
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Array of every sorted-word name key; find_matches() returns positions into this array
all_names = new_unified_person_df['name_set'].values

def compute_sim(s1,func,s2):
    """Apply the two-argument scoring function ``func`` to ``(s1, s2)`` and return its result."""
    score = func(s1, s2)
    return score

def compute_exact_word_overlap(s1,s2):
    """Count the distinct tokens of length >= 3 that ``s1`` and ``s2`` have in common.

    Both strings are split with the module-level ``tokenizer``; shorter
    tokens are ignored on both sides.
    """
    long_words_1 = {word for word in tokenizer.tokenize(s1) if len(word) >= 3}
    long_words_2 = {word for word in tokenizer.tokenize(s2) if len(word) >= 3}

    return len(long_words_1 & long_words_2)

def find_matches(s2):
    """Return the set of positions in ``all_names`` considered a match for ``s2``.

    A name matches when either
      * its Damerau-Levenshtein distance to ``s2`` is at most 1 (misspelling), or
      * it shares at least 2 words with ``s2`` (see ``compute_exact_word_overlap``)
        and has Jaro-Winkler similarity >= 0.9 or Damerau-Levenshtein distance <= 5.
    """
    overlap_counts = [compute_exact_word_overlap(candidate, s2) for candidate in all_names]
    edit_distances = [compute_sim(candidate, jellyfish.damerau_levenshtein_distance, s2)
                      for candidate in all_names]
    jaro_scores = [compute_sim(candidate, jellyfish.jaro_winkler_similarity, s2)
                   for candidate in all_names]

    scores = pd.DataFrame({'name_set': all_names,
                           'overlap_cnt': overlap_counts,
                           'dam_lev_dist': edit_distances,
                           'jaro_sim': jaro_scores})

    # addition to original matching criteria: near-identical strings,
    # regardless of how many words they share
    is_misspelling = scores['dam_lev_dist'] <= 1
    misspelling_idx = set(scores[is_misspelling].index.values)

    # original criteria: enough shared words AND a close string score
    shared = scores[scores['overlap_cnt'] >= 2]
    is_close = (shared['jaro_sim'] >= 0.9) | (shared['dam_lev_dist'] <= 5)
    match_idx = set(shared[is_close].index.values)

    return match_idx | misspelling_idx

In [None]:
# t[i] = set of positions in all_names that match all_names[i]
t = {}
for idx, name in enumerate(tqdm(all_names)):
    t[idx] = find_matches(name)

In [None]:
# Collapse the pairwise match sets in `t` into merged groups: whenever a
# surviving key lists another surviving key among its matches, it absorbs that
# key's match set and the absorbed key is dropped. Passes repeat until one
# full pass merges nothing.
scratch_t = copy.deepcopy(t)
changed_flag = True

while changed_flag:

    changed_flag = False

    for key in t:
        
        for matched_idx in t[key]:

            if key != matched_idx:
                # Merge only while both entries are still present (and non-empty) in the scratch copy
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    # union() rebinds t[key] to a new set; the inner loop keeps iterating the old one
                    t[key] = t[key].union(t[matched_idx])
                    scratch_t.pop(matched_idx, None)
        
    # Keys popped from the scratch copy were absorbed by another key during this pass
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    # Fresh snapshot of the survivors for the next pass
    scratch_t = copy.deepcopy(t)
    print('---')
    

In [None]:
# Fold every merged group into its surviving row, then keep only the surviving rows
merged_columns = ('name_list', 'id_list', 'description_list')

for surviving_idx, member_idxs in t.items():

    # the indices stored in t are looked up positionally
    members = new_unified_person_df.iloc[list(member_idxs)]

    # flatten each list-valued column across all members of the group
    combined = {column: list(itertools.chain.from_iterable(members[column].values))
                for column in merged_columns}

    for column in merged_columns:
        new_unified_person_df.at[surviving_idx, column] = combined[column]

new_unified_person_df = new_unified_person_df.loc[t.keys()]

In [None]:
# Spot-check ten random merged entries that ended up with two or more name variants
has_variants = new_unified_person_df['name_list'].apply(lambda names: len(names) >= 2)
new_unified_person_df.loc[has_variants, 'name_list'].sample(10).values

In [None]:
# Save the unified person table (result of steps 1-3); step 4 reloads it from this path
new_unified_person_df.to_parquet('tables/tables_52_88/new_unified_person_df.parquet')

#### step 4: find each person's wikidata entity

In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
# Registers progress_apply() on pandas objects; used by the cells below
tqdm.pandas()

import ssl
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. TLS certificate verification is switched off for HTTPS
# requests that rely on the default context. Confirm this workaround is really
# needed; fixing the local CA bundle would be safer.
ssl._create_default_https_context = ssl._create_unverified_context


# NOTE(review): this looks like a placeholder User-Agent (example.org); presumably it
# should identify the real operator. Verify against the Wikimedia User-Agent policy.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

# Shared Wikidata SPARQL client; responses are converted from JSON
sparqlwd = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

In [2]:
def find_wiki_entity(name):
    """Search Wikidata for humans (``wdt:P31 wd:Q5``) matching ``name``.

    The name is sent to the ``EntitySearch`` MediaWiki API through the
    module-level ``sparqlwd`` client.

    Parameters
    ----------
    name : str
        Name to search for. Backslashes, single quotes and line breaks are
        escaped before the name is embedded in the single-quoted SPARQL
        literal, so names such as "O'Brien" produce a valid query instead of
        a syntax error that would be reported as "no results".

    Returns
    -------
    dict
        The converted SPARQL JSON response. On any error the name and the
        error message are printed and an empty result
        (``{'results': {'bindings': []}}`` plus a ``head``) is returned, so a
        long batch run is not aborted by a single failed request.
    """

    try:
        # Backslash first, so the escapes added afterwards are not doubled
        safe_name = (name.replace('\\', '\\\\')
                         .replace("'", "\\'")
                         .replace('\n', '\\n')
                         .replace('\r', '\\r'))

        query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  \'"""+safe_name+"""\';
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?item wdt:P31 wd:Q5
        }
        """
        
        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        # Deliberate best-effort: log and fall back to "no candidates"
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_name_list(row):
    """Collect the distinct Wikidata item URIs found for any name in ``row['name_list']``.

    Every name variant is looked up with ``find_wiki_entity``; the ``item``
    values of all returned bindings are de-duplicated and returned as a list
    (possibly empty).
    """
    wiki_tag = {
        binding['item']['value']
        for name in row['name_list']
        for binding in find_wiki_entity(name)['results']['bindings']
    }

    return list(wiki_tag)

In [3]:
new_unified_person_df = pd.read_parquet('tables/tables_52_88/new_unified_person_df.parquet')

# One Wikidata search per name variant of every person (network-bound, slow)
wiki_col = new_unified_person_df.progress_apply(process_name_list, axis=1)

new_unified_person_df = new_unified_person_df.assign(wiki_col=wiki_col)
new_unified_person_df.to_parquet('tables/tables_52_88/new_unified_person_df_wikicol.parquet')

  0%|          | 53/13317 [00:54<4:09:10,  1.13s/it]

name: George C. Denny
error message: Remote end closed connection without response
name: Jr. George C. Denny
error message: <urlopen error [Errno 54] Connection reset by peer>


 87%|████████▋ | 11537/13317 [1:26:52<09:17,  3.19it/s] 

name: David Loving
error message: Remote end closed connection without response


 87%|████████▋ | 11538/13317 [1:26:53<10:08,  2.93it/s]

name: Italo A. Luder
error message: <urlopen error [Errno 54] Connection reset by peer>


100%|██████████| 13317/13317 [1:36:08<00:00,  2.31it/s]


#### step 5: for each person with multiple candidate wikidata entities, select a single one using sbert

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer,util

# Sentence-embedding model used by process_wiki_col() to compare FRUS and Wikidata descriptions
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# helpers for using sbert for deciding among wikidata entries
def get_entity_descp(Q):
    """Fetch the English ``schema:description`` of the Wikidata entity ``Q``.

    ``Q`` is the bare entity id (e.g. ``'Q42'``). The converted response of
    the module-level ``sparqlwd`` client is returned; on any error the id and
    the error message are printed and an empty result is returned instead.
    """
    empty_result = {'head': {'vars': ['item']}, 'results': {'bindings': []}}

    try:
        query = """
        SELECT ?descp
        WHERE 
        {
        wd:""" + Q + """ schema:description ?descp.
        FILTER ( lang(?descp) = "en" )
        }"""

        sparqlwd.setQuery(query)
        response = sparqlwd.query().convert()

    except Exception as e:
        print(f'name: {Q}')
        print(f'error message: {e}')
        return empty_result

    return response


def process_candidate_entities(row):
    """Return the Wikidata descriptions for the candidate entities in ``row['wiki_col']``.

    Each candidate URI is reduced to its last path segment and looked up with
    ``get_entity_descp``. Every returned description value is appended; a
    candidate with no bindings contributes an empty string.
    """
    wiki_descp = []

    for entity_uri in row['wiki_col']:

        entity_id = entity_uri.split('/')[-1]
        bindings = get_entity_descp(entity_id)['results']['bindings']

        if len(bindings) == 0:
            # keep a placeholder for candidates without a description
            wiki_descp.append('')
            continue

        wiki_descp.extend(binding['descp']['value'] for binding in bindings)

    return wiki_descp

In [10]:
def process_wiki_col(row):
    """Pick a single Wikidata entity for a person from ``row['wiki_col']``.

    * no candidates  -> ``None``
    * one candidate  -> that candidate
    * several        -> the candidate whose Wikidata description embedding has
      the highest cosine similarity to the mean embedding of the person's
      ``row['description_list']`` (embeddings come from the module-level ``model``).
    """
    candidates = row['wiki_col']

    if len(candidates) == 0:
        return None

    if len(candidates) == 1:
        return candidates[0]

    # One vector for the person: average over all of their descriptions
    frus_embedding = np.mean(model.encode(row['description_list']), axis=0)

    # One vector per candidate description
    wiki_embeddings = model.encode(process_candidate_entities(row))

    similarity = util.cos_sim(frus_embedding, wiki_embeddings)
    best_idx = np.argmax(similarity, axis=1)[0]

    return row["wiki_col"][best_idx]

In [11]:
new_unified_person_df_wikicol = pd.read_parquet('tables/tables_52_88/new_unified_person_df_wikicol.parquet')

# Work on the frame that was just loaded rather than on whatever
# `new_unified_person_df` happens to be in the kernel. Previously the loaded
# frame was never used, so this cell only worked if the step-4 cell had been
# run in the same session.
new_unified_person_df = new_unified_person_df_wikicol.copy()

# Choose one Wikidata entity per person (network- and model-bound, slow)
selected_wiki_entity = new_unified_person_df.progress_apply(process_wiki_col, axis=1)

new_unified_person_df['selected_wiki_entity'] = selected_wiki_entity
new_unified_person_df.to_parquet('tables/tables_52_88/new_unified_person_df_sbert.parquet')

 18%|█▊        | 2413/13317 [27:23<1:20:16,  2.26it/s] 

name: Q2707822
error message: [Errno 54] Connection reset by peer
name: Q96246951
error message: <urlopen error [Errno 54] Connection reset by peer>


 45%|████▍     | 5965/13317 [49:31<08:09, 15.02it/s]  

name: Q959580
error message: HTTP Error 504: Gateway Timeout


 46%|████▌     | 6153/13317 [51:21<36:37,  3.26it/s]  

name: Q76165936
error message: Remote end closed connection without response


 49%|████▉     | 6536/13317 [54:31<07:14, 15.60it/s]  

name: Q105692850
error message: HTTP Error 504: Gateway Timeout


 49%|████▉     | 6537/13317 [55:22<1:42:11,  1.11it/s]

name: Q110022064
error message: <urlopen error [Errno 54] Connection reset by peer>


 50%|█████     | 6688/13317 [55:34<09:45, 11.32it/s]  

name: Q99693179
error message: Remote end closed connection without response
name: Q102138869
error message: <urlopen error [Errno 54] Connection reset by peer>


 73%|███████▎  | 9741/13317 [1:08:15<12:59,  4.58it/s]  

name: Q18151892
error message: [Errno 54] Connection reset by peer
name: Q213550
error message: <urlopen error [Errno 54] Connection reset by peer>


 73%|███████▎  | 9747/13317 [1:08:41<1:08:58,  1.16s/it]

name: Q96207190
error message: HTTP Error 504: Gateway Timeout


 74%|███████▍  | 9841/13317 [1:12:30<2:15:00,  2.33s/it] 

name: Q24060518
error message: HTTP Error 504: Gateway Timeout


 77%|███████▋  | 10258/13317 [1:13:59<17:59,  2.83it/s] 

name: Q55769789
error message: HTTP Error 504: Gateway Timeout


 80%|███████▉  | 10645/13317 [1:17:02<09:32,  4.67it/s]  

name: Q2958645
error message: HTTP Error 504: Gateway Timeout


 89%|████████▉ | 11904/13317 [1:22:32<03:27,  6.81it/s]  

name: Q55979684
error message: HTTP Error 504: Gateway Timeout


100%|██████████| 13317/13317 [1:30:15<00:00,  2.46it/s]


#### step 6: reduce names that resolve to exactly the same wikidata entity

In [12]:
# Reload the step-5 output (one selected Wikidata entity per person, where available)
new_unified_person_df = pd.read_parquet('tables/tables_52_88/new_unified_person_df_sbert.parquet')

In [13]:
# t[label] = set of index labels that share this row's selected Wikidata
# entity; rows without an entity form a singleton group.
entity_by_row = new_unified_person_df['selected_wiki_entity']

def _has_no_entity(ent):
    # Missing values may come back from parquet as None or as NaN depending on
    # the pandas/pyarrow string dtype; `not ent` alone treats NaN as present.
    return pd.isna(ent) or not ent

# Single pass: entity -> labels of all rows that selected it
# (replaces re-filtering the whole frame once per row)
rows_by_entity = {}
for idx, ent in entity_by_row.items():
    if not _has_no_entity(ent):
        rows_by_entity.setdefault(ent, set()).add(idx)

t = {}

for idx, ent in entity_by_row.items():

    if _has_no_entity(ent):
        t[idx] = set([idx])
    else:
        # independent copy per row, as the later merge step rebinds/merges sets
        t[idx] = set(rows_by_entity[ent])

In [14]:
import copy
import itertools

# Collapse the per-row groups in `t` into merged groups: whenever a surviving
# key lists another surviving key in its set, it absorbs that key's set and
# the absorbed key is dropped. Passes repeat until one full pass merges nothing.
scratch_t = copy.deepcopy(t)
changed_flag = True

while changed_flag:

    changed_flag = False

    for key in t:
        
        for matched_idx in t[key]:

            if key != matched_idx:
                # Merge only while both entries are still present (and non-empty) in the scratch copy
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    # union() rebinds t[key] to a new set; the inner loop keeps iterating the old one
                    t[key] = t[key].union(t[matched_idx])
                    scratch_t.pop(matched_idx, None)
        
    # Keys popped from the scratch copy were absorbed by another key during this pass
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    # Fresh snapshot of the survivors for the next pass
    scratch_t = copy.deepcopy(t)
    print('---')

removing 241 keys.
---
removing 0 keys.
---


In [15]:
# Fold every entity group into its surviving row, then keep only the surviving rows
merged_columns = ('name_list', 'id_list', 'description_list')

for surviving_label, member_labels in t.items():

    # t holds index labels of new_unified_person_df, hence label-based .loc
    members = new_unified_person_df.loc[list(member_labels)]

    # flatten each list-valued column across all members of the group
    combined = {column: list(itertools.chain.from_iterable(members[column].values))
                for column in merged_columns}

    for column in merged_columns:
        new_unified_person_df.at[surviving_label, column] = combined[column]

new_unified_person_df = new_unified_person_df.loc[t.keys()]

In [16]:
# Save the final person table, in which rows sharing a selected Wikidata entity have been merged
new_unified_person_df.to_parquet('tables/tables_52_88/new_unified_person_df_final.parquet')