In [1]:
import re
import copy 
import math
import itertools
import jellyfish
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

ns = {'xml': 'http://www.w3.org/XML/1998/namespace',
      'dflt': 'http://www.tei-c.org/ns/1.0',
      'frus':'http://history.state.gov/frus/ns/1.0',
      'xi':'http://www.w3.org/2001/XInclude'
      }

In [2]:
def extract_person(item, file):
    volume = file[8:-4]

    persName_item = item.find('.//dflt:persName[@xml:id]', ns)

    if persName_item is not None:

        persName_text = "".join(persName_item.itertext())
        person_id = persName_item.attrib['{http://www.w3.org/XML/1998/namespace}id']

        all_text = "".join(item.itertext())
        end_idx = all_text.find(persName_text) + len(persName_text+',')
        person_descp = " ".join(all_text[end_idx:].split())

        person_name = " ".join(re.sub(',',''," ".join(persName_text.split(', ')[::-1])).split())

        person_id = volume + '_' + person_id

        global person_df
        person_df = pd.concat((person_df, pd.DataFrame({'id':[person_id],
                                                    'name':[person_name],
                                                    'description':[person_descp]})),ignore_index=True)
    return

In [3]:
import glob
volume_root = 'frus1969-76'

person_df = pd.DataFrame(columns=['id','name','description'])


for file in glob.glob('volumes/'+volume_root+'*'):
#for file in glob.glob('volumes/frus1969-76v30.xml'):

    tree = ET.parse(file)
    root = tree.getroot()
    persons_section = root.find("./dflt:text/dflt:front//dflt:div[@xml:id='persons']", ns)
    print(file)
    for item in persons_section.findall('.//dflt:item/dflt:hi/dflt:persName[@xml:id]/../..', ns):
        extract_person(item,file)
    for item in persons_section.findall('.//dflt:item/dflt:persName[@xml:id]/..', ns):
        extract_person(item,file)
    print('---')


volumes/frus1969-76v14.xml
---
volumes/frus1969-76v28.xml
---
volumes/frus1969-76v29.xml
---
volumes/frus1969-76v15.xml
---
volumes/frus1969-76v01.xml
---
volumes/frus1969-76v38p1.xml
---
volumes/frus1969-76v17.xml
---
volumes/frus1969-76v03.xml
---
volumes/frus1969-76v02.xml
---
volumes/frus1969-76v16.xml
---
volumes/frus1969-76v38p2.xml
---
volumes/frus1969-76ve15p2Ed2.xml
---
volumes/frus1969-76v12.xml
---
volumes/frus1969-76v06.xml
---
volumes/frus1969-76v07.xml
---
volumes/frus1969-76v13.xml
---
volumes/frus1969-76v39.xml
---
volumes/frus1969-76v05.xml
---
volumes/frus1969-76v11.xml
---
volumes/frus1969-76v10.xml
---
volumes/frus1969-76v04.xml
---
volumes/frus1969-76ve05p2.xml
---
volumes/frus1969-76v19p1.xml
---
volumes/frus1969-76ve05p1.xml
---
volumes/frus1969-76v19p2.xml
---
volumes/frus1969-76ve08.xml
---
volumes/frus1969-76ve09p1.xml
---
volumes/frus1969-76v42.xml
---
volumes/frus1969-76ve04.xml
---
volumes/frus1969-76ve10.xml
---
volumes/frus1969-76ve06.xml
---
volumes/frus

#### step 1: reduce exactly matched names

In [4]:
unified_person_dict = {}

In [5]:
def aux(row):
    global unified_person_dict

    if row['name'] in unified_person_dict:
      
      temp_dict = unified_person_dict[row['name']]

      temp_dict['id_list'].append(row['id'])
      temp_dict['description_list'].append(row['description'])
    
    else:
      unified_person_dict[row['name']]= {'id_list':[row['id']],
                                        'description_list':[row['description']]}

    return


In [6]:
person_df.apply(lambda x:aux(x), axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
14035    None
14036    None
14037    None
14038    None
14039    None
Length: 14040, dtype: object

In [7]:
unified_person_df = pd.DataFrame.from_dict(unified_person_dict,orient='index').reset_index(drop=False)
unified_person_df.rename(columns={'index':'name'}, inplace=True)

#### step 2: reduce names with exactly same words but different combinations

In [8]:
unified_person_df['name_set'] = unified_person_df.name.apply(lambda x: " ".join(sorted(x.split())))

In [9]:
new_unified_person_dict = {}

def aux2(row):
    global new_unified_person_dict

    if row['name_set'] in new_unified_person_dict:
      
        temp_dict = new_unified_person_dict[row['name_set']]

        temp_dict['name_list'].append(row['name'])
        temp_dict['id_list'] += row['id_list']
        temp_dict['description_list'] += row['description_list']
    
    else:
        new_unified_person_dict[row['name_set']]= {'name_list':[row['name']],
                                                    'id_list':row['id_list'],
                                                    'description_list':row['description_list']}

    return

In [10]:
unified_person_df.apply(lambda x:aux2(x), axis=1)

new_unified_person_df = pd.DataFrame.from_dict(new_unified_person_dict,orient='index').reset_index(drop=False)
new_unified_person_df.rename(columns={'index':'name'}, inplace=True)

In [None]:
#pd.set_option('display.max_colwidth', None)

In [11]:
new_unified_person_df[new_unified_person_df['name_list'].apply(lambda x: len(x)==2)]

Unnamed: 0,name,name_list,id_list,description_list
13,Bui Diem,"[Bui Diem, Diem Bui]","[frus1969-76v14_p_BD5, frus1969-76v06_p_BD1, f...",[South Vietnamese Ambassador to the United Sta...
14,Bunker Ellsworth,"[Ellsworth Bunker, Bunker Ellsworth]","[frus1969-76v14_p_BE6, frus1969-76v38p2_p_BE_2...","[Ambassador to South Vietnam, Ambassador to th..."
18,Castro Fidel Ruz,"[Fidel Castro Ruz, Castro Ruz Fidel]","[frus1969-76v14_p_CRF1, frus1969-76v38p1_p_CRF...","[Premier of Cuba, Premier of Cuba, Premier of ..."
19,Chancellor John,"[John Chancellor, Chancellor John]","[frus1969-76v14_p_CJ8, frus1969-76v13_p_CJ1]","[anchor on the NBC Nightly News, anchor on NBC..."
23,B. Connally John Jr.,"[Jr. John B. Connally, John B. Jr. Connally]","[frus1969-76v14_p_CJBJ1, frus1969-76v28_p_CJB_...","[Secretary of the Treasury until May 16, 1972,..."
...,...,...,...,...
5393,Wendell Wyatt,"[Wendell Wyatt, Wyatt Wendell]","[frus1969-76v37_p_WW_1, frus1969-76v27_p_WW_1]","[member, U.S. House of Representatives (R–Oreg..."
5588,(Korniyenko) Georgi Kornienko M.,"[Kornienko (Korniyenko) Georgi M., Georgi M. K...","[frus1969-76v33_p_KGM_1, frus1969-76v32_p_KGM1]","[Director, United States of America Department..."
5599,Aleksandr Shchukin,"[Shchukin Aleksandr, Aleksandr Shchukin]","[frus1969-76v33_p_SA_1, frus1969-76v32_p_SA1]","[member of the Soviet SALT Delegation, Soviet ..."
5655,Niehuss Rosemary,"[Niehuss Rosemary, Rosemary Niehuss]","[frus1969-76v27_p_NR_1, frus1969-76v30_p_NR5]","[member, National Security Council Staff, Memb..."


#### step 3: find and reduce near-duplicate names

In [None]:
# step one: (match len>=2 and each word len>=3)
# step two: for remaning unmatched, allow edit distance of 1 or 2 for misspellings

# caution!!!
# Eliot Jr. L. Theodore, and D. Dwight Eisenhower
# Georges Guay R. vs George Guay R.
# Abrams Creighton General Major W.
# Aharon General Major Yariv

In [12]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')



In [13]:
all_names = new_unified_person_df['name'].values

def compute_sim(s1,func,s2):
    return func(s1,s2)

def compute_exact_word_overlap(s1,s2):
    l1 = set([x for x in list(set(tokenizer.tokenize(s1))) if len(x)>=3])
    l2 = set([x for x in list(set(tokenizer.tokenize(s2))) if len(x)>=3])

    return len(l1.intersection(l2))

def find_matches(s2):

    spiro_dist_df = pd.DataFrame({'name':all_names,
                                'overlap_cnt':[compute_exact_word_overlap(x,s2) for x in all_names],
                                'dam_lev_dist':[compute_sim(x, jellyfish.damerau_levenshtein_distance,s2) for x in all_names],
                                'jaro_sim':[compute_sim(x, jellyfish.jaro_winkler_similarity,s2) for x in all_names]})
    
    spiro_dist_df = spiro_dist_df[spiro_dist_df['overlap_cnt']>=2]
    match_idx = set(spiro_dist_df[(spiro_dist_df['jaro_sim'] >= 0.9) | (spiro_dist_df['dam_lev_dist'] <=5)].index.values)

    return match_idx

In [14]:
t = {}
for idx in tqdm(range(len(all_names))):
    name = all_names[idx]
    t[idx]=find_matches(name)

100%|██████████| 6042/6042 [04:52<00:00, 20.63it/s]


In [15]:
scratch_t = copy.deepcopy(t)
changed_flag = True

while changed_flag:

    changed_flag = False

    for key in t:
        
        for matched_idx in t[key]:

            if key != matched_idx:
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    scratch_t.pop(matched_idx, None)
        
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    scratch_t = copy.deepcopy(t)
    print('---')
    

removing 1047 keys.
---
removing 90 keys.
---
removing 0 keys.
---


In [16]:
for temp_key in t:
    
    te_df = new_unified_person_df.iloc[list(t[temp_key])]

    name_list = list(itertools.chain.from_iterable(te_df['name_list'].values))
    id_list = list(itertools.chain.from_iterable(te_df['id_list'].values))
    description_list = list(itertools.chain.from_iterable(te_df['description_list'].values))

    new_unified_person_df.at[temp_key, 'name_list'] = name_list
    new_unified_person_df.at[temp_key, 'id_list'] = id_list
    new_unified_person_df.at[temp_key, 'description_list'] = description_list

new_unified_person_df = new_unified_person_df.loc[t.keys()]

In [None]:
new_unified_person_df.sample(10)