In [1]:
import re
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime

import pandas as pd

In [2]:
# Prefix -> namespace URI table passed to ElementTree find()/findall() so that
# path expressions can use short prefixes such as 'dflt:' and 'frus:'.
ns = dict(
    xml='http://www.w3.org/XML/1998/namespace',
    dflt='http://www.tei-c.org/ns/1.0',
    frus='http://history.state.gov/frus/ns/1.0',
    xi='http://www.w3.org/2001/XInclude',
)

In [3]:
# File name of the volume being processed. The parse path is derived from it so
# that `volume` and the file actually loaded cannot drift apart.
volume = 'frus1969-76v30.xml'
tree = ET.parse(f'volumes/{volume}')
root = tree.getroot()

In [None]:
# year: frus:doc-dateTime-max
# sent from: <placeName>New York</placeName>
# volume from file name
# from: <gloss type="from"> or <persName type="from">
# to: <gloss type="to"> or <persName type="to">
# source: <... type="source" ..>

# ignore all <note/>

In [20]:
# Accumulates one row per entry of the volume's persons list.
person_df = pd.DataFrame(columns=['id','name','description'])

def extract_person(person_item):
    """Parse one persons-list ``<item>`` and append it to the global ``person_df``.

    The name comes from the first ``<persName>`` descendant and is rewritten from
    "Last, First" order to "First Last". The description is the item text that
    follows the name, with whitespace collapsed.

    Args:
        person_item: ElementTree element for a single persons-list item. It must
            contain a ``<persName>`` that carries an ``xml:id`` attribute.

    Returns:
        None. One row (id, name, description) is appended to ``person_df``.

    Raises:
        AttributeError: if the item has no ``<persName>`` descendant.
        KeyError: if the ``<persName>`` has no ``xml:id`` attribute.
    """
    persName_item = person_item.find('.//dflt:persName', ns)
    person_id = persName_item.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # itertext() instead of .text: .text is None or truncated as soon as the
    # name contains child markup, which would corrupt both name and description.
    raw_name = "".join(persName_item.itertext())
    all_text = "".join(person_item.itertext())

    # The description is whatever follows the name. Separators between the two
    # (comma, stray whitespace/newlines) are stripped instead of assuming that
    # exactly one comma character follows the name.
    trailing = all_text.partition(raw_name)[2] if raw_name else all_text
    person_descp = " ".join(trailing.split()).lstrip(',;: ')

    # Split on commas with any surrounding whitespace, so "Last,\n First" and
    # "Last,First" are reordered just like "Last, First".
    name_parts = [part for part in re.split(r'\s*,\s*', raw_name.strip()) if part]
    person_name = " ".join(" ".join(reversed(name_parts)).split())

    # TODO: optionally make ids unique across volumes:
    # person_id = volume[:-4] + '_' + person_id

    global person_df
    person_df = pd.concat((person_df, pd.DataFrame({'id':[person_id],
                                                'name':[person_name],
                                                'description':[person_descp]})),ignore_index=True)
    return


# Locate the persons list in the front matter, then feed each of its entries to
# extract_person(), which appends a row to person_df.
persons_section = root.find(
    "./dflt:text/dflt:front//dflt:div[@xml:id='persons']",
    ns,
)
for item in persons_section.iterfind('.//dflt:item', ns):
    extract_person(item)

In [21]:
# Display the persons table (columns: id, name, description) filled by extract_person().
person_df

Unnamed: 0,id,name,description
0,p_AST1,Spiro T. Agnew,"Vice President until October 10, 1973"
1,p_AC8,Carl Albert,Democratic Congressman from Oklahoma; Speaker ...
2,p_AM4,Menelaos Alexandrakis,Greek Ambassador to the United States from Aug...
3,p_ARB3,Robert Anderson,Department of State spokesman
4,p_AA3,Adamantios Androutsopoulos,Greek Prime Minister from November 1973 until ...
...,...,...,...
147,p_WCW2,Charles W. Whalen,Republican Congressman from Ohio
148,p_WJH2,Harold Wilson,British Prime Minister until 976
149,p_XPC1,Xanthopoulos-Palamas Christos,Greek Foreign Minister from January until Nove...
150,p_ZGD1,Gen. Dimitrios Zagorianakos,Commander-in-Chief of the Hellenic Armed Force...


## done above ^
## construction below 

In [None]:
# this function handles file's sender and receiver
def process_from_to_types(from_list):
    """Convert sender/recipient elements into a flat list of identifiers or names.

    Args:
        from_list: iterable of ElementTree elements, each a ``persName`` or
            ``gloss`` (with or without an XML namespace on the tag).

    Returns:
        list: one entry per element, in input order. For a ``persName`` with a
        ``corresp`` attribute the entry is that attribute value; otherwise it is
        the element's ``.text`` (which may be None).

    Raises:
        NotImplementedError: if an element is neither ``persName`` nor ``gloss``.
    """
    processed_list = []

    for item in from_list:
        # ElementTree reports namespaced tags as '{namespace-uri}local'. Compare
        # on the local part so TEI-namespaced elements are recognised as well as
        # un-namespaced ones.
        tag = item.tag
        local_tag = tag.rsplit('}', 1)[-1] if isinstance(tag, str) else tag

        if local_tag == 'persName' and 'corresp' in item.attrib:
            # NOTE(review): the value is stored verbatim; if it carries a leading
            # '#', it will not equal the ids stored in person_df - confirm.
            processed_list.append(item.attrib['corresp'])
        elif local_tag in ('persName', 'gloss'):
            processed_list.append(item.text)
        else:
            raise NotImplementedError('from type tag unidentified')

    return processed_list

In [None]:
# Volume-level identifiers read from the TEI header. Unprefixed path steps only
# match elements without a namespace, so each step carries the 'dflt' (TEI)
# prefix resolved through `ns`, like the persons lookup earlier in the notebook.
subseries_id = root.find('dflt:teiHeader/dflt:fileDesc/dflt:titleStmt/dflt:title[@type="sub-series"]', ns).text
volume_id = root.find('dflt:teiHeader/dflt:fileDesc/dflt:titleStmt/dflt:title[@type="volume-number"]', ns).text

# Maps "<subseries>-<volume>-<doc_id>" to the metadata dict built by process_document().
doc_dict = {}

def process_document(doc_elem):
    """Extract metadata from one document ``<div>`` and store it in ``doc_dict``.

    The entry is keyed by "<subseries_id>-<volume_id>-<doc_id>" and holds the
    keys 'subseries', 'volume', 'doc_id', 'date', 'place', 'from', 'to' and
    'source'.

    Args:
        doc_elem: ElementTree element of a document div; it must carry an
            ``xml:id`` attribute.

    Returns:
        None. The result is written into the global ``doc_dict``.

    Raises:
        KeyError: if ``doc_elem`` has no ``xml:id`` attribute.
        ValueError: if the date part of ``frus:doc-dateTime-max`` is not
            formatted as YYYY-MM-DD.
    """
    doc_id = doc_elem.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # The date attribute is read with .get() so a document without it yields
    # date=None instead of aborting the whole run with a KeyError.
    raw_date = doc_elem.attrib.get("{http://history.state.gov/frus/ns/1.0}doc-dateTime-max")
    doc_date = datetime.strptime(raw_date.split('T')[0], '%Y-%m-%d') if raw_date else None

    # Element names need the 'dflt' prefix: unprefixed names never match
    # elements that live in the TEI namespace.
    # NOTE(review): this takes the first placeName anywhere in the document,
    # including ones inside notes - confirm that is intended.
    doc_sent_place = doc_elem.find('.//dflt:placeName', ns)
    if doc_sent_place is not None:
        doc_sent_place = doc_sent_place.text

    # '*' matches elements of any tag, so only the 'head' step needs the prefix.
    from_list = doc_elem.findall('dflt:head//*[@type="from"]', ns)
    to_list = doc_elem.findall('dflt:head//*[@type="to"]', ns)

    doc_sent_from = process_from_to_types(from_list)
    doc_sent_to = process_from_to_types(to_list)

    doc_source = doc_elem.find('.//*[@type="source"]')
    if doc_source is not None:
        # Flatten nested markup and collapse runs of whitespace.
        doc_source = " ".join("".join(doc_source.itertext()).split())

    doc_dict["-".join([subseries_id,volume_id,doc_id])] = {'subseries':subseries_id, 'volume':volume_id, 'doc_id':doc_id,'date':doc_date, 'place':doc_sent_place, 'from':doc_sent_from, 'to':doc_sent_to, 'source':doc_source}

    return

# Visit every document div nested below a top-level body div. The path steps
# carry the 'dflt' prefix because unprefixed names do not match elements in the
# TEI namespace, which would leave doc_dict empty.
for doc_elem in root.findall('dflt:text/dflt:body/dflt:div//dflt:div[@type="document"]', ns):
    process_document(doc_elem)

In [None]:
# Inspect the per-document metadata collected by process_document(),
# keyed by "<subseries>-<volume>-<doc_id>".
doc_dict

In [None]:
# Flatten the 'to' list of every document into one list of recipients.
place = [recipient for record in doc_dict.values() for recipient in record['to']]

In [None]:
# Tally how often each recipient occurs across all documents.
# (Counter is imported in the notebook's top import cell.)
Counter(place)