In [1]:
import re
import copy
import glob 
import math
import itertools
import jellyfish
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

# Prefix -> namespace URI map passed to the ElementTree find()/findall() calls
# in this notebook so path expressions can use 'dflt:' and 'xml:' prefixes.
ns = {'xml': 'http://www.w3.org/XML/1998/namespace',
      'dflt': 'http://www.tei-c.org/ns/1.0',
      'frus':'http://history.state.gov/frus/ns/1.0',
      'xi':'http://www.w3.org/2001/XInclude'
      }

In [2]:
def extract_institution(item, file):
    """Append one glossary entry found in ``item`` to the global ``institution_df``.

    Parameters
    ----------
    item : xml.etree.ElementTree.Element
        Element searched for its first descendant ``term`` carrying an
        ``xml:id`` attribute. Nothing happens when no such term exists.
    file : str
        Path of the volume being processed; ``file[8:-4]`` is used as the
        volume name (presumably strips ``'volumes/'`` and ``'.xml'`` -- verify
        against the caller's glob pattern).

    Side effects
    ------------
    Rebinds the module-level ``institution_df`` to a new frame with one extra
    row holding ``id`` (``<volume>_<xml:id>``), ``name`` and lower-cased
    ``description``.
    """
    global institution_df

    volume = file[8:-4]

    term_item = item.find('.//dflt:term[@xml:id]', ns)
    if term_item is None:
        return

    raw_term = "".join(term_item.itertext())
    local_id = term_item.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # The description is whatever follows the term text plus one extra
    # character (nominally the comma after the term), whitespace-normalised.
    full_text = "".join(item.itertext())
    descp_start = full_text.find(raw_term) + len(raw_term) + 1
    description = " ".join(full_text[descp_start:].split())

    # Reverse the ', '-separated parts of the term, drop any remaining commas
    # and collapse runs of whitespace.
    reordered = " ".join(reversed(raw_term.split(', ')))
    name = " ".join(reordered.replace(',', '').split())

    new_row = pd.DataFrame({'id': [volume + '_' + local_id],
                            'name': [name],
                            'description': [description.lower()]})
    institution_df = pd.concat((institution_df, new_row), ignore_index=True)
    return

In [3]:
# Inclusive range of volume start years to process.
start_year, end_year = 1952, 1988

# Filled row by row through extract_institution().
institution_df = pd.DataFrame(columns=['id','name','description'])

for file in tqdm(glob.glob('volumes/frus*')):

    # Characters 12..15 of the path are parsed as the volume's start year
    # (assumes the 'volumes/frusYYYY...' naming matched by the glob above).
    file_start_year = int(file[12:16])

    if start_year <= file_start_year <= end_year:

        tree = ET.parse(file)
        root = tree.getroot()
        terms_section = root.find("./dflt:text/dflt:front//dflt:div[@xml:id='terms']", ns)

        # Compare with None explicitly: an Element without direct children is
        # falsy, and truth-testing an Element is deprecated, so a bare
        # `if terms_section:` could report a volume as un-annotated even
        # though its terms div was found.
        if terms_section is not None:
            # Items whose term is wrapped in <hi>, then items holding the term directly.
            for item in terms_section.findall('.//dflt:item/dflt:hi/dflt:term[@xml:id]/../..', ns):
                extract_institution(item,file)
            for item in terms_section.findall('.//dflt:item/dflt:term[@xml:id]/..', ns):
                extract_institution(item,file)
        else:
            print(f'No institution annotation in {file}.')

 35%|███▍      | 190/543 [00:18<00:19, 18.50it/s]

No institution annotation in volumes/frus1977-80v09.xml.


100%|██████████| 543/543 [01:24<00:00,  6.40it/s]


In [4]:
# Spot-check five random rows of the extracted glossary entries.
institution_df.sample(5)

Unnamed: 0,id,name,description
18005,frus1969-76ve05p2_t_NLC1,NLC,national liberation council (ghana)
36495,frus1977-80v09Ed2_t_SCC_1,SCC,special coordination committee
2766,frus1961-63v10_t_WT1,W/T,wireless transmitter
41746,frus1952-54v12p2_t_SEAP1,SEAP,letters standing for “southeast asia pact” wer...
6831,frus1964-68v11_t_OAS1,OAS,organization of american states


#### step 1: reduce exactly matched institution descriptions

In [5]:
# description string -> {'id_list': [...], 'name_list': [...]}; filled by aux().
unified_term_dict = {}

In [6]:
def aux(row):
    """Record ``row`` in ``unified_term_dict`` under its exact description.

    ``row`` must support ``row['description']``, ``row['id']`` and
    ``row['name']``. Rows sharing the same description string accumulate
    their ids and names in that description's ``id_list`` / ``name_list``.
    Returns ``None``; the function is used for its side effect only.
    """
    entry = unified_term_dict.setdefault(row['description'],
                                         {'id_list': [], 'name_list': []})
    entry['id_list'].append(row['id'])
    entry['name_list'].append(row['name'])
    return

In [7]:
# Called for its side effect (fills unified_term_dict); the result is a Series of None.
institution_df.apply(aux, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
50987    None
50988    None
50989    None
50990    None
50991    None
Length: 50992, dtype: object

In [8]:
# One row per distinct description; the dict keys become the 'description' column.
unified_institution_df = (
    pd.DataFrame.from_dict(unified_term_dict, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'description'})
)

In [16]:
# Spot-check five random rows of the de-duplicated table.
unified_institution_df.sample(5)

Unnamed: 0,description,id_list,name_list,description_set
6753,section française d’internationale ouvrière,[frus1955-57v13_t_SFIO1],[SFIO],d’internationale française ouvrière section
1242,united nations commission on narcotic drugs,"[frus1969-76v29_t_UNCD1, frus1969-76ve07_t_CND...","[UNCD, CND, CND, CND]",commission drugs narcotic nations on united
11842,"commander-in-chief, united states army, pacific","[frus1964-68v29p2_t_CINCUSARPAC1, frus1964-68v...","[CINCUSARPAC, CINCUSARPAC]","army, commander-in-chief, pacific states united"
5010,dissemination and extraction of information co...,"[frus1977-80v23_t_Orcon_1, frus1969-76ve12_t_o...","[Orcon, Orcon]",(dissemination abbreviation) and by control co...
5810,general services office,[frus1977-80v19_t_GSO_1],[GSO],general office services


#### step 2: merge descriptions that contain exactly the same words in a different order

In [15]:
# Order-insensitive key: the description's whitespace-separated words, sorted and re-joined.
unified_institution_df['description_set'] = unified_institution_df['description'].apply(
    lambda text: " ".join(sorted(text.split()))
)

In [17]:
# description_set -> {'description_list': [...], 'id_list': [...], 'name_list': [...]}
new_unified_institution_dict = {}

def aux2(row):
    """Merge ``row`` into ``new_unified_institution_dict`` by ``description_set``.

    ``row`` must provide ``description_set``, ``description``, ``id_list`` and
    ``name_list``. Rows with the same ``description_set`` are merged: their
    description is appended to ``description_list`` and their ids/names are
    concatenated. Returns ``None``.

    The first row seen for a key has its lists *copied* into the dict. Storing
    the row's own list objects and later extending them in place would also
    modify the lists held by the source DataFrame.
    """
    entry = new_unified_institution_dict.get(row['description_set'])

    if entry is None:
        new_unified_institution_dict[row['description_set']] = {
            'description_list': [row['description']],
            'id_list': list(row['id_list']),
            'name_list': list(row['name_list']),
        }
    else:
        entry['description_list'].append(row['description'])
        entry['id_list'].extend(row['id_list'])
        entry['name_list'].extend(row['name_list'])

    return

In [18]:
# Fill new_unified_institution_dict (side effect of aux2), then tabulate it.
unified_institution_df.apply(aux2, axis=1)

new_unified_institution_df = (
    pd.DataFrame.from_dict(new_unified_institution_dict, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'description_set'})
)

In [25]:
# Inspection only: rows whose description_list holds exactly two descriptions.
new_unified_institution_df[new_unified_institution_df['description_list'].apply(lambda x: len(x)==2)]

Unnamed: 0,description_set,description_list,id_list,name_list
63,(north) democratic of republic vietnam,"[democratic republic of (north) vietnam, democ...","[frus1964-68v03_t_DRV1, frus1964-68v02_t_DRV1,...","[DRV, DRV, DRV or DRVN, DRV (also DRVN), DRV, ..."
83,(south) government of vietnam,"[government of (south) vietnam, government of ...","[frus1964-68v03_t_GVN1, frus1969-76v14_t_GVN1,...","[GVN, GVN, GVN, GVN, GVN, GVN, GVN, GVN, GVN, ..."
99,"armed forces general joint staff, vietnamese","[joint general staff, vietnamese armed forces,...","[frus1964-68v03_t_JGS1, frus1964-68v02_t_JGS1,...","[JGS, JGS, JGS, JGS, JGS, JGS, JGS]"
242,assistance council economic for mutual,"[council for mutual economic assistance, counc...","[frus1964-68v17_t_CEMA1, frus1964-68v17_t_COME...","[CEMA, COMECON, CEMA, CEMA, CEMA, CMEA, COMECO..."
306,"affairs, affairs, bureau department european o...","[office of soviet union affairs, bureau of eur...","[frus1964-68v17_t_SOV1, frus1961-63v05_t_EURSO...","[SOV, EUR/SOV, EUR/SOV, EUR/SOV, EUR/SOV, EUR/..."
...,...,...,...,...
11930,abroad administration cooperation for from ind...,[series indicator for telegrams to the interna...,"[frus1958-60v15_t_Toica1, frus1958-60v13_t_Toi...","[Toica, Toica, Icato]"
12114,department of outgoing state telegram,"[outgoing department of state telegram, depart...","[frus1964-68v21_t_Deptel1, frus1964-68v18_t_De...","[Deptel, Deptel, Deptel]"
12633,"department executive of of office secretariat,...","[executive secretariat, office of the secretar...","[frus1969-76v37_t_SS_1, frus1969-76v24_t_SS1]","[S/S, S/S]"
13297,group group; review review senior special,"[senior review group; special review group, sp...","[frus1969-76v26_t_SRG_1, frus1969-76v24_t_SRG1]","[SRG, SRG]"


#### step 3: find and merge near-duplicate descriptions (obvious misspellings)

In [None]:
# step one: (match len>=2 and each word len>=3), edit distance based matching
# step two: find misspelling matches with edit distance of 1 e.g. for Ziegler vs Zeigler

# caution!!!
# Eliot Jr. L. Theodore, and D. Dwight Eisenhower
# Georges Guay R. vs George Guay R.
# Abrams Creighton General Major W.
# Aharon General Major Yariv

In [30]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer built from the pattern \w+; used by compute_exact_word_overlap().
tokenizer = RegexpTokenizer(r'\w+')

In [36]:
# Array of every description_set key; positions in it are the ids used by find_matches().
all_descriptions = new_unified_institution_df['description_set'].values

def compute_sim(s1, func, s2):
    """Return ``func(s1, s2)``: apply a two-argument scorer to the pair, in that order."""
    score = func(s1, s2)
    return score

def compute_exact_word_overlap(s1,s2):
    """Count distinct tokens of length >= 3 that occur in both ``s1`` and ``s2``.

    Tokens come from the module-level ``tokenizer``.
    """
    def long_tokens(text):
        # Distinct tokens with at least three characters.
        return {tok for tok in tokenizer.tokenize(text) if len(tok) >= 3}

    return len(long_tokens(s1) & long_tokens(s2))

def find_matches(s2):
    """Return the positions in ``all_descriptions`` that are likely misspellings of ``s2``.

    A description matches when its Damerau-Levenshtein distance to ``s2`` is
    at most 1 (so ``s2`` itself, at distance 0, is always included when it is
    in ``all_descriptions``).

    Returns
    -------
    set of int
        Positions into ``all_descriptions``.

    Notes
    -----
    A broader criterion (word overlap >= 2 combined with Jaro-Winkler >= 0.9
    or distance <= 5) was previously drafted here and left disabled. Only the
    distance <= 1 rule is applied, so only that distance is computed.
    """
    target_len = len(s2)
    misspelling_idx = set()

    for pos, candidate in enumerate(all_descriptions):
        # The edit distance is never smaller than the difference in length,
        # so strings differing by more than one character in length cannot
        # match; skip them without running the distance computation.
        if abs(len(candidate) - target_len) > 1:
            continue
        if jellyfish.damerau_levenshtein_distance(candidate, s2) <= 1:
            misspelling_idx.add(pos)

    return misspelling_idx

In [37]:
# position of each description -> set of positions find_matches() reports for it
t = {}
for idx, description in enumerate(tqdm(all_descriptions)):
    t[idx] = find_matches(description)

100%|██████████| 14157/14157 [50:35<00:00,  4.66it/s] 


In [38]:
# Collapse the pairwise match sets in `t` into clusters: whenever key A lists
# another key B as a match, B's set is folded into A's and B is dropped.
# `scratch_t` is a snapshot used to track which keys are still alive during a
# pass, so that a key absorbed earlier in the pass is not merged again.
scratch_t = copy.deepcopy(t)
changed_flag = True

# Repeat full passes until a pass performs no merge.
while changed_flag:

    changed_flag = False

    for key in t:
        
        # Iterates over the set object bound to t[key] when this loop starts;
        # rebinding t[key] below does not affect the ongoing iteration.
        for matched_idx in t[key]:

            if key != matched_idx:
                # Merge only if both keys are still present in the snapshot.
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    scratch_t.pop(matched_idx, None)
        
    # Keys popped from the snapshot during this pass were absorbed; drop them from t.
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    # Fresh snapshot for the next pass.
    scratch_t = copy.deepcopy(t)
    print('---')
    

removing 736 keys.
---
removing 36 keys.
---
removing 0 keys.
---


In [39]:
# For every surviving key, pool the list columns of all rows in its cluster
# into that key's row, then keep only the surviving rows.
merged_columns = ('name_list', 'id_list', 'description_list')

for temp_key in t:

    # Cluster members are selected by position (iloc); the target row is addressed by label (at).
    cluster_rows = new_unified_institution_df.iloc[list(t[temp_key])]

    # Compute all merged lists before writing any of them back.
    merged = {col: list(itertools.chain.from_iterable(cluster_rows[col].values))
              for col in merged_columns}

    for col in merged_columns:
        new_unified_institution_df.at[temp_key, col] = merged[col]

new_unified_institution_df = new_unified_institution_df.loc[t.keys()]

In [42]:
# Inspection only: ten random rows that hold two or more descriptions.
new_unified_institution_df[new_unified_institution_df['description_list'].apply(lambda x: len(x)>=2)].sample(10)

Unnamed: 0,description_set,description_list,id_list,name_list
6856,group nuclear suppliers,"[nuclear suppliers group, nuclear suppliers’ g...","[frus1977-80v26_t_NSG_1, frus1977-80v01_t_NSG_...","[NSG, NSG, NSG]"
4263,"council, nations trusteeship united","[united nations trusteeship council, trusteesh...","[frus1955-57v22_t_TC1, frus1969-76v05_t_TC1, f...","[TC, TC, TC, TC]"
942,a. henry kissinger,"[henry a. kissingrer, henry a. kissinger]","[frus1969-76v27_t_HAK_1, frus1969-76v28_t_HAK_...","[HAK, HAK, HAK, HAK, HAK, HAK, HAK, HAK, HAK, ..."
2325,memorandum secretaries’ under,"[under secretaries memorandum, under secretari...","[frus1969-76v02_t_USM1, frus1969-76ve12_t_u-s-...","[U/SM, U–S/M, U/SM, U/SM, U–S/M, U–S/M, U–S/M,..."
13021,american cooperation cooperative everywhere-me...,[cooperative for american remittances to every...,"[frus1961-63v21_t_CAREMEDICO1, frus1964-68v24_...","[CARE-MEDICO, CARE–MEDICO, CARE–MEDICO]"
1203,(turkey) party peopleʼs republican,"[republican people’s party (turkey), republica...","[frus1964-68v16_t_RPP1, frus1977-80v20_t_RPP_1...","[RPP, RPP, RPP, RPP, RPP, RPP]"
5861,action et french party political républicaine ...,"[action répubicaine et sociale, french politic...","[frus1952-54v13p2_t_ARS1, frus1952-54v06p1_t_A...","[ARS, ARS, ARS, ARS]"
6096,(democratic de democratica mocambique mozambiq...,[uniao democratica nacional de mocambique (dem...,"[frus1969-76ve05p2_t_UDENAMO1, frus1969-76ve05...","[UDENAMO, UDENAMO, UDENAMO, UDENAMO]"
181,missile surface-to-air,"[surface-to-air missile, surface-to-air missiles]","[frus1964-68v03_t_SAM1, frus1961-63v11_t_SAM1,...","[SAM, SAM, SAM, SAM, SAM, SAM, SAM, SAM, SAM, ..."
1341,addressee,"[addressee, addressees]","[frus1961-63v04_t_ADDEE1, frus1961-63v03_t_ADD...","[addee, addee, addee, addee, addees]"


In [47]:
# save unified institution table
new_unified_institution_df.to_parquet('tables/tables_52_88/new_unified_institution_df.parquet')

In [62]:
# Display the table (execution count 62: run after the later cell that adds wiki_col).
new_unified_institution_df

Unnamed: 0,description_set,description_list,id_list,name_list,wiki_col
0,anti-aircraft,[anti-aircraft],"[frus1964-68v03_t_AA1, frus1964-68v02_t_AA1, f...","[AA, AA, A/A, AA, AA, AA, AA, AA, AA, AA, AA, ...","[http://www.wikidata.org/entity/Q669716, http:..."
1,anti-aircraft artillery,"[anti-aircraft artillery, antiaircraft artillery]","[frus1964-68v03_t_AAA1, frus1961-63v11_t_AAA1,...","[AAA, AAA, AAA, AAA, AAA, AAA, AAA, AAA, AAA, ...","[http://www.wikidata.org/entity/Q7325635, http..."
2,action americans democratic for,[americans for democratic action],"[frus1964-68v03_t_ADA1, frus1964-68v02_t_ADA1,...","[ADA, ADA, ADA, ADA, ADA, ADA, ADA]","[http://www.wikidata.org/entity/Q63910401, htt..."
3,armed council forces,[armed forces council],"[frus1964-68v03_t_AFC1, frus1964-68v02_t_AFC1,...","[AFC, AFC, AFC, AFC, AFC]",[http://www.wikidata.org/entity/Q2993815]
4,american committee friends service,[american friends service committee],"[frus1964-68v03_t_AFSC1, frus1964-68v02_t_AFSC...","[AFSC, AFSC, AFSC, AFSC, AFSC, AFSC, AFSC, AFSC]",[http://www.wikidata.org/entity/Q464677]
...,...,...,...,...,...
14152,(of democratic malaysia) party united,[united democratic party (of malaysia)],[frus1964-68v26_t_UDP1],[UDP],[]
14153,malay national organization united,[united malay national organization],[frus1964-68v26_t_UMNO1],[UMNO],[]
14154,"affairs affairs, bureau international nation o...","[office of united nation political affairs, bu...",[frus1964-68v26_t_UNP1],[UNP],[]
14155,for from indicator missions series telegrams t...,[series indicator for usis telegrams from the ...,[frus1964-68v26_t_USITO1],[USITO],[]


### DONE HERE. DELETE BELOW!!!

#### step 4: find each institution's candidate Wikidata entities

In [48]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
# Registers DataFrame.progress_apply, which the cells below call.
tqdm.pandas()

import ssl
# WARNING (security): this swaps Python's default HTTPS context factory for the
# unverified one, i.e. TLS certificate checks are skipped for code in this
# process that relies on that default.
# NOTE(review): presumably a workaround for a local certificate problem --
# confirm it is still needed and prefer fixing the CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context


# Agent string handed to SPARQLWrapper for the requests it sends.
# NOTE(review): this looks like a placeholder (example.org) -- replace with a
# real tool name and contact before running against the public endpoint.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

# Shared client for the Wikidata query service; results are requested as JSON.
sparqlwd = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

In [49]:
def find_wiki_entity(name):
    """Search Wikidata for entities matching ``name``.

    Runs an ``EntitySearch`` call through the query service's ``wikibase:mwapi``
    SERVICE using the module-level ``sparqlwd`` client.

    Parameters
    ----------
    name : str
        Search text. It is escaped before being placed inside the
        single-quoted SPARQL string literal.

    Returns
    -------
    dict
        The converted query result. On any exception the name and error are
        printed and an empty result (no bindings) is returned, so a failed
        lookup behaves like "no candidates" (deliberate best effort).
    """
    try:
        # Escape characters that are not allowed raw inside a single-quoted
        # SPARQL literal. Without this, an apostrophe or backslash in the
        # search text would end the literal early and break or alter the query.
        safe_name = (name.replace('\\', '\\\\')
                         .replace("'", "\\'")
                         .replace('\n', '\\n')
                         .replace('\r', '\\r'))

        query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  \'"""+safe_name+"""\';
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        }
        """

        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_description_list(row):
    """Collect the distinct Wikidata entity URIs found for a row's descriptions.

    Every string in ``row['description_list']`` is looked up with
    ``find_wiki_entity`` and the ``item`` value of each returned binding is
    gathered. Returns the de-duplicated URIs as a list.
    """
    candidate_uris = set()

    for description in row['description_list']:
        response = find_wiki_entity(description)
        candidate_uris.update(hit['item']['value']
                              for hit in response['results']['bindings'])

    return list(candidate_uris)

In [50]:
# Reload the saved table, look up candidate Wikidata entities for every row
# (one or more network requests per row) and save the result.
new_unified_institution_df = pd.read_parquet('tables/tables_52_88/new_unified_institution_df.parquet')
wiki_col = new_unified_institution_df.progress_apply(process_description_list, axis=1)
new_unified_institution_df['wiki_col'] = wiki_col
new_unified_institution_df.to_parquet('tables/tables_52_88/new_unified_institution_df_wikicol.parquet')

 38%|███▊      | 5073/13385 [28:51<4:14:08,  1.83s/it]

name: moro national liberation front, philippine political organization
error message: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=\n        SELECT ?item WHERE {\n        SERVICE wikibase:mwapi {\n            bd:serviceParam wikibase:endpoint "www.wikidata.org";\n                            wikibase:api "EntitySearch";\n                            mwapi:search  \'moro national liberation front, philippine political organization\';\n                            mwapi:language "en".\n            ?item wikibase:apiOutputItem mwapi:item.\n            ?num wikibase:apiOrdinal true.\n        }\n        }\n        \njava.util.concurrent.ExecutionException: java.util.concurrent.ExecutionException: org.openrdf.query.QueryEvaluationException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.Exception: task=ChunkTask{query=47d20a1e-5c26-4742-95b9-d3b3deb1a3f9,bopId=1,partitionId=-1,sinkId=2,altSinkId=null

 86%|████████▌ | 11537/13385 [1:00:44<2:46:54,  5.42s/it]

name: subject indicator for telegrams concerning aid coordination
error message: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=\n        SELECT ?item WHERE {\n        SERVICE wikibase:mwapi {\n            bd:serviceParam wikibase:endpoint "www.wikidata.org";\n                            wikibase:api "EntitySearch";\n                            mwapi:search  \'subject indicator for telegrams concerning aid coordination\';\n                            mwapi:language "en".\n            ?item wikibase:apiOutputItem mwapi:item.\n            ?num wikibase:apiOrdinal true.\n        }\n        }\n        \njava.util.concurrent.ExecutionException: java.util.concurrent.ExecutionException: org.openrdf.query.QueryEvaluationException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.Exception: task=ChunkTask{query=16f48282-70c6-46b5-b678-2bbe2b19ba1f,bopId=1,partitionId=-1,sinkId=2,altSinkId=null}, cause=jav

100%|██████████| 13385/13385 [1:09:24<00:00,  3.21it/s]  


In [60]:
# Inspection only: rows with more than 50 candidate Wikidata entities.
has_many_candidates = new_unified_institution_df['wiki_col'].apply(lambda x: len(x) > 50)
new_unified_institution_df.loc[has_many_candidates, ['description_list', 'wiki_col']]

Unnamed: 0,description_list,wiki_col
17,"[buildings, building]","[http://www.wikidata.org/entity/Q107330069, ht..."
82,"[government, governments]","[http://www.wikidata.org/entity/Q8022, http://..."
140,"[operation plan, operations plan]","[http://www.wikidata.org/entity/Q115341251, ht..."
155,"[prime ministers, prime minister]","[http://www.wikidata.org/entity/Q4970706, http..."
158,"[prisoner of war, prisoners of war]","[http://www.wikidata.org/entity/Q65978582, htt..."
163,"[representatives, representative]","[http://www.wikidata.org/entity/Q21855073, htt..."
256,"[east-west, east/west, east–west]","[http://www.wikidata.org/entity/Q110561075, ht..."
387,"[resolution, resolutions]","[http://www.wikidata.org/entity/Q105204184, ht..."
446,"[latin americans, latin american, latin america]","[http://www.wikidata.org/entity/Q15756060, htt..."
461,"[permanent representatives, permanent represen...","[http://www.wikidata.org/entity/Q57956129, htt..."


#### step 5: for each person, reduce multiple candidate Wikidata entities to a single one using SBERT, if any exist

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer,util

# Sentence-embedding model used by process_wiki_col() to encode descriptions.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# helpers for using sbert for deciding among wikidata entries
def get_entity_descp(Q):
    """Fetch the English ``schema:description`` of the Wikidata entity ``Q``.

    ``Q`` is the bare entity id appended to the ``wd:`` prefix in the query.
    Returns the converted result from the module-level ``sparqlwd`` client.
    On any exception the id and error are printed and an empty result
    (no bindings) is returned.
    """
    empty_result = {'head': {'vars': ['item']}, 'results': {'bindings': []}}

    try:
        sparqlwd.setQuery("""
        SELECT ?descp
        WHERE 
        {
        wd:"""+Q+""" schema:description ?descp.
        FILTER ( lang(?descp) = "en" )
        }""")
        response = sparqlwd.query().convert()
    except Exception as e:
        print(f'name: {Q}')
        print(f'error message: {e}')
        return empty_result

    return response


def process_candidate_entities(row):
    """Return one English description per candidate entity in ``row['wiki_col']``.

    Each candidate URI is reduced to its last path segment and looked up with
    ``get_entity_descp``. The result has exactly one string per candidate, in
    the same order: the first description returned, or ``''`` when none came
    back.

    Keeping a strict one-to-one correspondence matters because
    ``process_wiki_col`` picks a position in this list and uses it to index
    ``wiki_col``. Appending every returned binding would shift later
    positions if a lookup ever yielded more than one description.
    """
    descriptions = []

    for uri in row['wiki_col']:
        bindings = get_entity_descp(uri.split('/')[-1])['results']['bindings']
        descriptions.append(bindings[0]['descp']['value'] if bindings else '')

    return descriptions

In [None]:
def process_wiki_col(row):
    """Pick a single Wikidata entity for ``row`` from ``row['wiki_col']``.

    Returns ``None`` when there are no candidates and the sole candidate when
    there is exactly one. Otherwise the mean embedding of
    ``row['description_list']`` is compared (cosine similarity) with the
    embedding of each candidate's Wikidata description, and the candidate at
    the arg-max position is returned.
    """
    candidates = row['wiki_col']

    if len(candidates) == 0:
        return None

    if len(candidates) == 1:
        return candidates[0]

    # Average the embeddings of all descriptions recorded for this row.
    frus_embedding = np.mean(model.encode(row['description_list']), axis=0)

    # One description per candidate is assumed, so positions line up with wiki_col.
    candidate_embeddings = model.encode(process_candidate_entities(row))

    similarity = util.cos_sim(frus_embedding, candidate_embeddings)
    best_idx = np.argmax(similarity, axis=1)[0]

    return row["wiki_col"][best_idx]

In [None]:
# Load the table with candidate entities and choose one entity per row.
# The frame is bound to `new_unified_person_df`, the name every following
# statement uses; binding it to a different name made this cell fail with a
# NameError on a fresh kernel.
# NOTE(review): this cell reads and writes the *person* tables although the
# notebook builds the institution table -- confirm the intended file names.
new_unified_person_df = pd.read_parquet('tables/tables_52_88/new_unified_person_df_wikicol.parquet')
selected_wiki_entity = new_unified_person_df.progress_apply(lambda x: process_wiki_col(x),axis=1)

new_unified_person_df['selected_wiki_entity'] = selected_wiki_entity
new_unified_person_df.to_parquet('tables/tables_52_88/new_unified_person_df_sbert.parquet')

#### step 6: merge entries that share exactly the same Wikidata entity

In [None]:
# Reload the table that carries the selected_wiki_entity column.
new_unified_person_df = pd.read_parquet('tables/tables_52_88/new_unified_person_df_sbert.parquet')

In [None]:
# Row label -> set of row labels that share the same selected Wikidata entity.
# Rows without an entity (None, NaN or empty) match only themselves.
entity_col = new_unified_person_df['selected_wiki_entity']

# Group the labels by entity in one pass rather than re-scanning the whole
# frame for every row.
rows_by_entity = {}
for idx, ent in entity_col.items():
    if pd.notna(ent) and ent:
        rows_by_entity.setdefault(ent, set()).add(idx)

t = {}

for idx, ent in entity_col.items():

    # NaN is truthy, so test for it explicitly; otherwise a missing entity
    # would be compared with `==`, match nothing and produce an empty set.
    if pd.isna(ent) or not ent:
        t[idx] = {idx}
    else:
        # Copy so every key owns an independent set.
        t[idx] = set(rows_by_entity[ent])

In [None]:
import copy
import itertools

# Collapse the match sets in `t` into clusters: whenever key A lists another
# key B, B's set is folded into A's and B is dropped. `scratch_t` is a
# snapshot that tracks which keys are still alive during a pass.
scratch_t = copy.deepcopy(t)
changed_flag = True

# Repeat full passes until a pass performs no merge.
while changed_flag:

    changed_flag = False

    for key in t:
        
        # Iterates over the set bound to t[key] when the loop starts;
        # rebinding t[key] below does not affect the ongoing iteration.
        for matched_idx in t[key]:

            if key != matched_idx:
                # Merge only if both keys are still present in the snapshot.
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    scratch_t.pop(matched_idx, None)
        
    # Keys popped from the snapshot during this pass were absorbed; drop them from t.
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    # Fresh snapshot for the next pass.
    scratch_t = copy.deepcopy(t)
    print('---')

In [None]:
# For every surviving key, pool the list columns of all rows in its cluster
# into that key's row, then keep only the surviving rows.
merged_columns = ('name_list', 'id_list', 'description_list')

for temp_key in t:

    cluster_rows = new_unified_person_df.loc[list(t[temp_key])]

    # Compute all merged lists before writing any of them back.
    merged = {col: list(itertools.chain.from_iterable(cluster_rows[col].values))
              for col in merged_columns}

    for col in merged_columns:
        new_unified_person_df.at[temp_key, col] = merged[col]

new_unified_person_df = new_unified_person_df.loc[t.keys()]

In [None]:
# Persist the table after merging rows that share a Wikidata entity.
new_unified_person_df.to_parquet('tables/tables_52_88/new_unified_person_df_final.parquet')