In [1]:
import re
import copy
import glob 
import math
import itertools
import jellyfish
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET
import copy
import itertools
from nltk.tokenize import RegexpTokenizer
import ray

# Shared tokenizer built from the pattern r'\w+'; compute_exact_word_overlap uses it
# to split name strings into word tokens.
tokenizer = RegexpTokenizer(r'\w+')

# namespace prefixes used in the ElementTree path expressions over the FRUS volumes
ns = dict(
    xml='http://www.w3.org/XML/1998/namespace',
    dflt='http://www.tei-c.org/ns/1.0',
    frus='http://history.state.gov/frus/ns/1.0',
    xi='http://www.w3.org/2001/XInclude',
)

# path prefix under which the intermediate and final tables are written
tables_path = 'tables/tables_52_88_demo/'

# first and last volume start year to process (both bounds are included)
start_year = 1952
end_year = 1988

In [2]:
def extract_person(item, file):
    """Build one person record from an ``item`` element of a volume's persons list.

    Parameters
    ----------
    item : xml.etree.ElementTree.Element
        Element containing a ``persName`` descendant that carries an ``xml:id``.
    file : str
        Path of the volume file. Its base name without extension becomes the
        volume prefix of the returned id.

    Returns
    -------
    dict
        ``{'id': '<volume>_<xml:id>', 'name': <name>, 'description': <text>}``.

    Raises
    ------
    ValueError
        If ``item`` has no ``persName`` descendant with an ``xml:id``.
    """
    import os

    persName_item = item.find('.//dflt:persName[@xml:id]', ns)
    if persName_item is None:
        # fail with a clear message instead of an UnboundLocalError at the return
        raise ValueError(f'No persName element with an xml:id found in an item of {file}.')

    # 'volumes/frus1952-54v01p2.xml' -> 'frus1952-54v01p2', independent of the directory prefix
    volume = os.path.splitext(os.path.basename(file))[0]

    persName_text = "".join(persName_item.itertext())
    person_id = persName_item.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # The description is whatever follows the name inside the item. The extra +1 skips the
    # single character right after the name (presumably the separating comma; verify).
    all_text = "".join(item.itertext())
    end_idx = all_text.find(persName_text) + len(persName_text) + 1
    person_descp = " ".join(all_text[end_idx:].split())

    # Reverse the ', '-separated parts ("Last, First" -> "First Last"), drop any remaining
    # commas and collapse whitespace.
    reordered = " ".join(persName_text.split(', ')[::-1])
    person_name = " ".join(reordered.replace(',', '').split())

    return {'id': volume + '_' + person_id,
            'name': person_name,
            'description': person_descp}

In [3]:
# Collect one record per annotated person from every volume in the configured year range.
global_person_list = []

# number of in-range volumes without a usable persons section
no_annotation_cnt = 0

# Two layouts are queried, in this order: persName wrapped in <hi> inside the item,
# and persName as a direct child of the item. Each path selects the enclosing item.
item_xpaths = ('.//dflt:item/dflt:hi/dflt:persName[@xml:id]/../..',
               './/dflt:item/dflt:persName[@xml:id]/..')

for file in tqdm(glob.glob('volumes/frus*')):
    # characters 12-15 of 'volumes/frusYYYY...' are read as the volume's start year
    file_start_year = int(file[12:16])

    if start_year <= file_start_year <= end_year:

        tree = ET.parse(file)
        root = tree.getroot()
        persons_section = root.find("./dflt:text/dflt:front//dflt:div[@xml:id='persons']", ns)

        # Explicit test instead of Element truthiness (deprecated): the section must
        # exist and have at least one child, which is what the truth test used to mean.
        if persons_section is not None and len(persons_section):
            for xpath in item_xpaths:
                for item in persons_section.findall(xpath, ns):
                    global_person_list.append(extract_person(item, file))
        else:
            print(f'No person annotation in {file}.')
            no_annotation_cnt += 1

# explicit columns keep the frame well-formed even when no person was found
person_df = pd.DataFrame(global_person_list, columns=['id', 'name', 'description'])
print(f'Not annotated volume count: {no_annotation_cnt}')
print(f'Row count: {len(person_df)}')

  3%|▎         | 14/543 [00:00<00:16, 31.35it/s]

No person annotation in volumes/frus1952-54v01p2.xml.


  9%|▉         | 49/543 [00:03<00:21, 22.63it/s]

No person annotation in volumes/frus1952-54v01p1.xml.


 35%|███▍      | 190/543 [00:12<00:12, 27.70it/s]

No person annotation in volumes/frus1977-80v09.xml.


 79%|███████▉  | 428/543 [00:26<00:08, 13.29it/s]

No person annotation in volumes/frus1952-54v12p1.xml.


 84%|████████▍ | 455/543 [00:29<00:07, 11.36it/s]

No person annotation in volumes/frus1952-54v12p2.xml.


100%|██████████| 543/543 [00:37<00:00, 14.63it/s]

Not annotated volume count: 5
Row count: 48363





#### step 1: reduce exactly matched names

In [4]:
# exact name string -> ids and descriptions of every record carrying that name
unified_person_dict = {}

def aux(row):
    """Fold one person record into ``unified_person_dict`` under its exact name.

    ``row`` must support ``row['name']``, ``row['id']`` and ``row['description']``.
    The module-level dictionary is updated in place; nothing is returned.
    """
    entry = unified_person_dict.setdefault(
        row['name'], {'id_list': [], 'description_list': []}
    )
    entry['id_list'].append(row['id'])
    entry['description_list'].append(row['description'])

# feed every extracted record through aux, then turn the mapping into one row per exact name
person_df.apply(aux, axis=1)

unified_person_df = (
    pd.DataFrame.from_dict(unified_person_dict, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'name'})
)

print(f'Row count: {len(unified_person_df)}')
print('Step 1 finished.')

Row count: 19352
Step 1 finished.


#### step 2: reduce names with exactly the same words but in a different order

In [5]:
# order-independent key for each name: its whitespace-separated words, sorted and re-joined
unified_person_df['name_set'] = unified_person_df['name'].apply(
    lambda full_name: " ".join(sorted(full_name.split()))
)

# sorted-word key -> name variants, ids and descriptions of all rows sharing that key
new_unified_person_dict = {}

def aux2(row):
    """Fold one step-1 row into ``new_unified_person_dict`` under its ``name_set`` key.

    ``row`` must support ``row['name_set']``, ``row['name']``, ``row['id_list']`` and
    ``row['description_list']``. The module-level dictionary is updated in place;
    nothing is returned.

    The entry always owns fresh lists. Storing the first row's lists by reference and
    extending them later would silently grow the lists held by the input table.
    """
    entry = new_unified_person_dict.setdefault(
        row['name_set'], {'name_list': [], 'id_list': [], 'description_list': []}
    )
    entry['name_list'].append(row['name'])
    entry['id_list'].extend(row['id_list'])
    entry['description_list'].extend(row['description_list'])

# feed every step-1 row through aux2, then turn the mapping into one row per sorted-word key
unified_person_df.apply(aux2, axis=1)

new_unified_person_df = (
    pd.DataFrame.from_dict(new_unified_person_dict, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'name_set'})
)

print(f'Row count: {len(new_unified_person_df)}')
print('Step 2 finished.')

Row count: 17633
Step 2 finished.


In [6]:
# just for observation: rows whose key collected exactly two name variants
has_two_variants = new_unified_person_df['name_list'].apply(lambda variants: len(variants) == 2)
new_unified_person_df[has_two_variants]

Unnamed: 0,name_set,name_list,id_list,description_list
0,Acheson Dean,"[Dean Acheson, Acheson Dean]","[frus1964-68v03_p_AD1, frus1969-76v38p1_p_AD_1...","[Secretary of State from 1949 until 1953, Secr..."
9,Boggs Hale,"[Hale Boggs, Boggs Hale]","[frus1964-68v03_p_BH1, frus1964-68v02_p_BH1, f...","[Democratic Representative from Louisiana, Dem..."
14,Brown Harold,"[Harold Brown, Brown Harold]","[frus1964-68v03_p_BH2, frus1964-68v02_p_BH2, f...","[Director, Defense Research and Engineering, D..."
17,Bui Diem,"[Bui Diem, Diem Bui]","[frus1964-68v03_p_BD1, frus1969-76v14_p_BD5, f...",[Vietnamese Chief of Staff in the Quat governm...
20,Bundy McGeorge,"[Bundy McGeorge, McGeorge Bundy]","[frus1964-68v03_p_BMG1, frus1961-63v11_p_BMG2,...",[President’s Special Assistant for National Se...
...,...,...,...,...
16876,Aidit Dipa Nusantara,"[Aidit Dipa Nusantara, Dipa Nusantara Aidit]","[frus1961-63v23_p_ADN1, frus1964-68v26_p_ADN1]",[leader of the PKI (Partai Komunis Indonesia/I...
16996,Cater Douglass Jr. S.,"[S. Douglass Jr. Cater, Jr. S. Douglass Cater]","[frus1964-68v31_p_CSDJ1, frus1964-68v32_p_CSDJ1]",[Special Assistant to the President July 1965–...
17040,J. Jr. Miguel Moreno,"[Jr. Miguel J. Moreno, Miguel J. Jr. Moreno]","[frus1964-68v31_p_MMJJ1, frus1958-60v05_p_MMJJ1]",[Panamanian Representative to the Council of t...
17198,Henderson Loy,"[Henderson Loy, Loy Henderson]","[frus1958-60v11_p_HLW1, frus1958-60v05_p_HL1]",[Deputy Under Secretary of State for Administr...


#### step 3: find and reduce near-duplicate names + obvious misspellings

In [6]:
# array of the sorted-word name keys; find_matches addresses names by position in it
all_names = new_unified_person_df.name_set.values

def compute_sim(s1,func,s2):
    """Return ``func(s1, s2)``: apply a two-argument scoring function to a pair of values."""
    score = func(s1, s2)
    return score

def compute_exact_word_overlap(s1,s2):
    """Count the distinct tokens of length >= 3 that appear in both ``s1`` and ``s2``."""
    def long_tokens(text):
        # distinct tokens of the text, ignoring those shorter than three characters
        return {token for token in tokenizer.tokenize(text) if len(token) >= 3}

    return len(long_tokens(s1) & long_tokens(s2))

@ray.remote
def find_matches(idx):
    """Return the positions in ``all_names`` considered the same person as ``all_names[idx]``.

    A position qualifies when either
    * its Damerau-Levenshtein distance to the target is at most 1 (misspelling check), or
    * it shares at least 2 long tokens with the target and has Jaro-Winkler similarity
      >= 0.9 or Damerau-Levenshtein distance <= 5 (near-duplicate check).

    The thresholds are hand-set hyperparameters. The result is a set of index values.
    """
    target = all_names[idx]

    scores = pd.DataFrame({
        'name_set': all_names,
        'overlap_cnt': [compute_exact_word_overlap(candidate, target) for candidate in all_names],
        'dam_lev_dist': [compute_sim(candidate, jellyfish.damerau_levenshtein_distance, target)
                         for candidate in all_names],
        'jaro_sim': [compute_sim(candidate, jellyfish.jaro_winkler_similarity, target)
                     for candidate in all_names],
    })

    # misspelling check - hyperparameter
    is_misspelling = scores['dam_lev_dist'] <= 1

    # near-duplication check - hyperparameter
    shares_words = scores['overlap_cnt'] >= 2
    is_similar = (scores['jaro_sim'] >= 0.9) | (scores['dam_lev_dist'] <= 5)
    is_near_duplicate = shares_words & is_similar

    return set(scores[is_near_duplicate | is_misspelling].index.values)

In [7]:
# name : matched names dict
# One Ray task per name. The worker count is fixed at 13; adjust it to the machine.
ray.init(num_cpus=13)
try:
    futures = [find_matches.remote(idx) for idx in range(len(all_names))]
    result_tuple_list = ray.get(futures)
finally:
    # Always release the Ray runtime. Otherwise a failed task leaves it initialised
    # and the next ray.init() in this kernel raises.
    ray.shutdown()

# position of a name in all_names -> set of positions matched to it
t = dict(enumerate(result_tuple_list))


# Merge match sets transitively: if a matches b and b matches c, all three end up in one group.
# t maps a key to the set of indices matched to it; after the loop only one key per merged
# group remains in t.
# scratch_t is a copy of t from which keys are popped as soon as they are absorbed during a pass.
scratch_t = copy.deepcopy(t)
changed_flag = True

# repeat full passes until a pass performs no merge
while changed_flag:

    changed_flag = False

    for key in t:
        
        # t[key] is rebound to a new set below, so this loop keeps iterating the set it
        # started with; members added by a union are visited in a later pass
        for matched_idx in t[key]:

            if key != matched_idx:
                # merge only while both sides are still in scratch_t (and their sets are non-empty)
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    # mark matched_idx as absorbed; it can no longer merge in this pass
                    scratch_t.pop(matched_idx, None)
        
    # keys absorbed during this pass are dropped from t before the next pass
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    scratch_t = copy.deepcopy(t)
    print('---')
    
# reduce matched names into single entry: the surviving key's row receives the
# concatenated lists of every row in its group
merged_columns = ('name_list', 'id_list', 'description_list')

for group_key, member_positions in t.items():

    members = new_unified_person_df.iloc[list(member_positions)]

    # flatten all three columns first, then write them back
    flattened = [list(itertools.chain.from_iterable(members[column].values))
                 for column in merged_columns]

    for column, merged_values in zip(merged_columns, flattened):
        new_unified_person_df.at[group_key, column] = merged_values

# keep only the surviving row of each group
new_unified_person_df = new_unified_person_df.loc[t.keys()]

# save unified person table
new_unified_person_df.to_parquet(tables_path+'unified_person_df_step3.parquet')
print(f'Row count: {len(new_unified_person_df)}')
print('Step 3 finished.')

In [13]:
# just for observation: ten random groups that hold at least two name variants
has_multiple_variants = new_unified_person_df['name_list'].apply(lambda variants: len(variants) >= 2)
new_unified_person_df[has_multiple_variants]['name_list'].sample(10).values

array([list(['McKisson Robert', 'Robert M. McKisson']),
       list(['Luigi Einaudi', 'Einaudi Luigi', 'Luigi R. Einaudi', 'Luigi R. Einaui']),
       list(['Harry Bergold', 'Jr. Harry E Bergold']),
       list(['Nuseibeh Anwar', 'Anwar Nuseibeh']),
       list(['Armand Berard', 'Berard Armand', 'Bérard Armand', 'Armand Bérard']),
       list(['Dennis B. Ross', 'Dennis Ross']),
       list(['Ali Mohammad', 'Mohammad Ali', 'Ali Mohammad Khan', 'Mohammed Ali', 'Ali Nasser Muhammad', 'Mohammad Ali Wardhana', 'Muhammad Ali', 'Mohammed Ali Rajai', 'Sher Ali Mohammad']),
       list(['Ilter Turkmen', 'Ilter Türkmen']),
       list(['Leslie H. Brown', 'Brown Leslie']),
       list(['Franco Mario Malfatti', 'Franco M. Malfatti'])],
      dtype=object)

#### step 4: find each person's wikidata entity

In [30]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
tqdm.pandas()

import ssl
# NOTE(review): this replaces ssl's default HTTPS context factory with the unverified one,
# so code that relies on that default no longer verifies TLS certificates in this process.
# Presumably a workaround for a local certificate-store problem; prefer fixing the
# certificates and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

# NOTE(review): this User-Agent points at example.org and looks like a placeholder;
# presumably it should name the real project and a contact address. Confirm before
# running against the public endpoint.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

# Shared client for the Wikidata SPARQL endpoint, configured to return JSON.
# find_wiki_entity and get_entity_descp both send their queries through it.
sparqlwd = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

In [31]:
def find_wiki_entity(name, max_retries=3, backoff_seconds=2.0):
    """Search Wikidata for entities matching ``name`` that are instances of ``wd:Q5``.

    The query is sent through the module-level ``sparqlwd`` client.

    Parameters
    ----------
    name : str
        Name to search for via the ``EntitySearch`` API.
    max_retries : int, optional
        How many times to retry after an HTTP 429 response.
    backoff_seconds : float, optional
        Wait before the first retry; doubled for each further retry.

    Returns
    -------
    dict
        The converted query result. On any error other than a retried 429, or once
        the retries are used up, the error is printed and an empty result
        (``results.bindings == []``) is returned, so callers never see an exception.
    """
    import time

    empty_result = {'head': {'vars': ['item']}, 'results': {'bindings': []}}

    for attempt in range(max_retries + 1):
        try:
            # Keep the name inside the single-quoted SPARQL literal: escape backslashes
            # and single quotes (e.g. O'Brien) and flatten line breaks.
            safe_name = (name.replace('\\', '\\\\')
                             .replace("'", "\\'")
                             .replace('\r', ' ')
                             .replace('\n', ' '))

            query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  '""" + safe_name + """';
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?item wdt:P31 wd:Q5
        }
        """

            sparqlwd.setQuery(query)

            return sparqlwd.query().convert()

        except Exception as e:
            # Rate limiting would otherwise be recorded as "no entity found". Wait and retry.
            # Assumes the raised error exposes the HTTP status as `.code`, as
            # urllib.error.HTTPError does; TODO confirm for the installed SPARQLWrapper.
            if getattr(e, 'code', None) == 429 and attempt < max_retries:
                time.sleep(backoff_seconds * (2 ** attempt))
                continue

            print(f'name: {name}')
            print(f'error message: {e}')
            return empty_result

    # only reached when max_retries is negative and no attempt was made
    return empty_result

@ray.remote
def process_name_list(name_list):
    """Return the distinct Wikidata item URIs found for any name in ``name_list``.

    Every name is looked up with ``find_wiki_entity``; the ``item`` values of all
    result bindings are pooled and returned as a list without duplicates.
    """
    # Applied again inside the task; presumably because Ray runs it in a separate
    # worker process where the notebook-level override is not in effect (verify).
    ssl._create_default_https_context = ssl._create_unverified_context

    candidate_uris = set()

    for person_name in name_list:
        bindings = find_wiki_entity(person_name)['results']['bindings']
        candidate_uris.update(binding['item']['value'] for binding in bindings)

    return list(candidate_uris)

In [32]:
# One Ray task per person group: look up Wikidata candidates for all of its name variants.
ray.init(num_cpus=13)
try:
    new_unified_person_df = pd.read_parquet(tables_path+'unified_person_df_step3.parquet')
    futures = [process_name_list.remote(row) for row in new_unified_person_df['name_list'].values]
    wiki_col = ray.get(futures)
finally:
    # Always release the Ray runtime. Otherwise a failed task leaves it initialised
    # and the next ray.init() in this kernel raises.
    ray.shutdown()

# candidate entity URIs per row, in the same order as the rows that produced the tasks
new_unified_person_df['wiki_col'] = wiki_col
new_unified_person_df.to_parquet(tables_path+'unified_person_df_step4.parquet')
print(f'Row count: {len(new_unified_person_df)}')
print('Step 4 finished.')

[2m[36m(process_name_list pid=4380)[0m name: Lucius D. Clay
[2m[36m(process_name_list pid=4380)[0m error message: HTTP Error 429: Too Many Requests
[2m[36m(process_name_list pid=4384)[0m name: Washington LaRae
[2m[36m(process_name_list pid=4384)[0m error message: HTTP Error 429: Too Many Requests
[2m[36m(process_name_list pid=4376)[0m name: Sirimavo Bandaranaike
[2m[36m(process_name_list pid=4376)[0m error message: HTTP Error 429: Too Many Requests
[2m[36m(process_name_list pid=4374)[0m name: Hernandez Acosta Valentin
[2m[36m(process_name_list pid=4374)[0m error message: HTTP Error 429: Too Many Requests
[2m[36m(process_name_list pid=4381)[0m name: Beryl W. Sprinkel
[2m[36m(process_name_list pid=4381)[0m error message: HTTP Error 429: Too Many Requests
[2m[36m(process_name_list pid=4380)[0m name: Basilio Lami Dozo
[2m[36m(process_name_list pid=4380)[0m error message: HTTP Error 429: Too Many Requests
[2m[36m(process_name_list pid=4382)[0m name: Bo

#### step 5: for each person with multiple candidate wikidata entities, select a single one using sbert

In [57]:
import numpy as np
from sentence_transformers import SentenceTransformer,util

# Sentence-embedding model loaded by name ('all-MiniLM-L6-v2'); process_wiki_col calls
# model.encode on FRUS descriptions and on Wikidata descriptions to compare them.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [58]:
# helpers for using sbert for deciding among wikidata entries
def get_entity_descp(Q, max_retries=3, backoff_seconds=2.0):
    """Fetch the English ``schema:description`` of the Wikidata entity ``Q``.

    The query is sent through the module-level ``sparqlwd`` client.

    Parameters
    ----------
    Q : str
        Entity id that is placed after the ``wd:`` prefix in the query (e.g. ``'Q42'``).
    max_retries : int, optional
        How many times to retry after an HTTP 429 response.
    backoff_seconds : float, optional
        Wait before the first retry; doubled for each further retry.

    Returns
    -------
    dict
        The converted query result. On any error other than a retried 429, or once
        the retries are used up, the error is printed and an empty result
        (``results.bindings == []``) is returned, so callers never see an exception.
    """
    import time

    # the fallback names the variable this query selects (?descp)
    empty_result = {'head': {'vars': ['descp']}, 'results': {'bindings': []}}

    for attempt in range(max_retries + 1):
        try:
            query = """
        SELECT ?descp
        WHERE 
        {
        wd:"""+Q+""" schema:description ?descp.
        FILTER ( lang(?descp) = "en" )
        }"""

            sparqlwd.setQuery(query)

            return sparqlwd.query().convert()

        except Exception as e:
            # Rate limiting would otherwise be recorded as "no description". Wait and retry.
            # Assumes the raised error exposes the HTTP status as `.code`, as
            # urllib.error.HTTPError does; TODO confirm for the installed SPARQLWrapper.
            if getattr(e, 'code', None) == 429 and attempt < max_retries:
                time.sleep(backoff_seconds * (2 ** attempt))
                continue

            print(f'name: {Q}')
            print(f'error message: {e}')
            return empty_result

    # only reached when max_retries is negative and no attempt was made
    return empty_result


def process_candidate_entities(row):
    """Return one English Wikidata description per candidate in ``row['wiki_col']``.

    Parameters
    ----------
    row : mapping
        Must provide ``row['wiki_col']``, an iterable of entity URIs whose last
        ``/``-separated segment is the entity id.

    Returns
    -------
    list of str
        Descriptions in the same order and of the same length as ``row['wiki_col']``;
        ``''`` where no description was returned.
    """
    wiki_descp = []

    for q in row['wiki_col']:

        bindings = get_entity_descp(q.split('/')[-1])['results']['bindings']

        # Exactly one entry per candidate: the caller picks a candidate by the position of
        # the best-scoring description, so an extra binding must not shift later positions.
        wiki_descp.append(bindings[0]['descp']['value'] if bindings else '')

    return wiki_descp

In [59]:
@ray.remote
def process_wiki_col(row):
    """Choose a single Wikidata entity for one person row.

    Returns ``None`` when ``row['wiki_col']`` is empty and the only candidate when
    there is exactly one. Otherwise the mean embedding of ``row['description_list']``
    is compared by cosine similarity with the embedding of each candidate's Wikidata
    description, and the candidate with the highest similarity is returned.
    """
    # Applied again inside the task; presumably because Ray runs it in a separate
    # worker process where the notebook-level override is not in effect (verify).
    ssl._create_default_https_context = ssl._create_unverified_context

    candidates = row['wiki_col']
    candidate_count = len(candidates)

    if candidate_count == 0:
        return None

    if candidate_count == 1:
        return candidates[0]

    # one vector for the person: the average over all FRUS descriptions
    frus_embedding = np.mean(model.encode(row['description_list']), axis=0)

    # one vector per candidate, in candidate order
    wiki_embeddings = model.encode(process_candidate_entities(row))

    cos_sim = util.cos_sim(frus_embedding, wiki_embeddings)
    selected_idx = np.argmax(cos_sim,axis=1)[0]

    return row["wiki_col"][selected_idx]

In [60]:
# One Ray task per person row: pick a single Wikidata entity among its candidates.
ray.init(num_cpus=8)
try:
    new_unified_person_df_wikicol = pd.read_parquet(tables_path+'unified_person_df_step4.parquet')

    futures = [process_wiki_col.remote(row) for _,row in new_unified_person_df_wikicol.iterrows()]
    selected_wiki_entity = ray.get(futures)
finally:
    # Always release the Ray runtime. Otherwise a failed task leaves it initialised
    # and the next ray.init() in this kernel raises.
    ray.shutdown()

# Attach the results to the frame they were computed from. Writing them to whatever
# new_unified_person_df an earlier cell left in the kernel only works if that cell ran
# in this session and its rows line up.
new_unified_person_df_wikicol['selected_wiki_entity'] = selected_wiki_entity

# keep the name used by the surrounding cells pointing at the step-5 table
new_unified_person_df = new_unified_person_df_wikicol
new_unified_person_df.to_parquet(tables_path+'unified_person_df_step5.parquet')
print(f'Row count: {len(new_unified_person_df)}')
print('Step 5 finished.')



[2m[36m(process_wiki_col pid=4599)[0m name: Q1465644
[2m[36m(process_wiki_col pid=4599)[0m error message: HTTP Error 429: Too Many Requests


#### step 6: reduce names with exactly the same wikidata entity

In [65]:
new_unified_person_df = pd.read_parquet(tables_path+'unified_person_df_step5.parquet')

# Group the rows once (entity -> index labels of the rows that selected it) instead of
# filtering the whole table again for every row. Rows without an entity are not grouped.
rows_by_entity = new_unified_person_df.groupby('selected_wiki_entity', sort=False).groups

# index label -> set of index labels that share its selected entity
t = {}

for idx, ent in new_unified_person_df['selected_wiki_entity'].items():

    if not ent:
        # no entity selected: the row forms a group of its own
        t[idx] = {idx}
    else:
        # a fresh set per row, so groups are never shared between keys
        t[idx] = set(rows_by_entity.get(ent, ()))


# Merge the groups in t transitively so that only one key per group remains in t.
# scratch_t is a copy of t from which keys are popped as soon as they are absorbed during a pass.
scratch_t = copy.deepcopy(t)
changed_flag = True

# repeat full passes until a pass performs no merge
while changed_flag:

    changed_flag = False

    for key in t:
        
        # t[key] is rebound to a new set below, so this loop keeps iterating the set it
        # started with; members added by a union are visited in a later pass
        for matched_idx in t[key]:

            if key != matched_idx:
                # merge only while both sides are still in scratch_t (and their sets are non-empty)
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    # mark matched_idx as absorbed; it can no longer merge in this pass
                    scratch_t.pop(matched_idx, None)
        
    # keys absorbed during this pass are dropped from t before the next pass
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    scratch_t = copy.deepcopy(t)
    print('---')

# the surviving key's row receives the concatenated lists of every row in its group
merged_columns = ('name_list', 'id_list', 'description_list')

for group_key, member_labels in t.items():

    members = new_unified_person_df.loc[list(member_labels)]

    # flatten all three columns first, then write them back
    flattened = [list(itertools.chain.from_iterable(members[column].values))
                 for column in merged_columns]

    for column, merged_values in zip(merged_columns, flattened):
        new_unified_person_df.at[group_key, column] = merged_values

# keep only the surviving row of each group
new_unified_person_df = new_unified_person_df.loc[t.keys()]

new_unified_person_df.to_parquet(tables_path+'unified_person_df_final.parquet')
print(f'Row count: {len(new_unified_person_df)}')
print('Step 6 finished.')

removing 241 keys.
---
removing 0 keys.
---
Row count: 13076
Step 6 finished.
