In [1]:
import re
from datetime import datetime
import xml.etree.ElementTree as ET

In [2]:
tree = ET.parse('frus1969-76v30.xml')
root = tree.getroot()

In [None]:
# remove namespace
# year: frus:doc-dateTime-max
# sent from: <placeName>New York</placeName>
# volume from file name
# from: <gloss type="from"> or <persName type="from">
# to: <gloss type="to"> or <persName type="to">
# source: <... type="source" ..>

# ignore all <note/>

In [3]:
person_item_list = root.findall("text/front/div/[@{http://www.w3.org/XML/1998/namespace}id='persons']")[0].findall('.//item')
person_dict = {}

def extract_person(person_item):
    persName_item = person_item.find('.//persName')
    person_name = persName_item.text
    person_id = persName_item.attrib['{http://www.w3.org/XML/1998/namespace}id']

    all_text = "".join(person_item.itertext())
    end_idx = all_text.find(person_name) + len(person_name+',')
    person_descp = " ".join(all_text[end_idx:].split())

    person_name = " ".join(re.sub(',',''," ".join(person_name.split(', ')[::-1])).split())

    person_dict[person_id] = {'name':person_name, 'descp':person_descp}
    return

for item in root.findall("text/front/div/[@{http://www.w3.org/XML/1998/namespace}id='persons']")[0].findall('.//item'):
    extract_person(item)

In [8]:
person_dict

{'p_AST1': {'name': 'Spiro T. Agnew',
  'descp': 'Vice President until October 10, 1973'},
 'p_AC8': {'name': 'Carl Albert',
  'descp': 'Democratic Congressman from Oklahoma; Speaker of the House'},
 'p_AM4': {'name': 'Menelaos Alexandrakis',
  'descp': 'Greek Ambassador to the United States from August 1974'},
 'p_ARB3': {'name': 'Robert Anderson',
  'descp': 'Department of State spokesman'},
 'p_AA3': {'name': 'Adamantios Androutsopoulos',
  'descp': 'Greek Prime Minister from November 1973 until July 1974'},
 'p_AGO2': {'name': 'Gen. Odysseus Angelis',
  'descp': 'Chief of Staff, Hellenic Armed Forces until 1973; Greek Vice President until July 1973'},
 'p_AWH1': {'name': 'Walter H. Annenberg',
  'descp': 'Ambassador to United Kingdom until October 1974'},
 'p_AALRJ1': {'name': 'Jr. Alfred L. Atherton',
  'descp': 'Deputy Assistant Secretary of State for Near Eastern and South Asian Affairs until April 1974'},
 'p_ATE1': {'name': 'Evangelos Averoff-Tositsas',
  'descp': 'Greek Minis

## done above ^
## construction below 

In [4]:
# this function handles file's sender and receiver
def process_from_to_types(from_list):

    processed_list = []

    for item in from_list:
        
        if item.tag == 'persName' and 'corresp' in item.attrib.keys():
            person_id = item.attrib['corresp']
            processed_list.append(person_id)
        elif item.tag == 'persName':
            processed_list.append(item.text)
        elif item.tag == 'gloss':
            processed_list.append(item.text)
        else:
            raise NotImplementedError('from type tag unidentified')
    
    return processed_list

In [5]:
subseries_id = root.find('teiHeader/fileDesc/titleStmt/title[@type="sub-series"]').text
volume_id = root.find('teiHeader/fileDesc/titleStmt/title[@type="volume-number"]').text

doc_dict = {}

def process_document(doc_elem):

    doc_id = doc_elem.attrib['{http://www.w3.org/XML/1998/namespace}id']
    doc_date = doc_elem.attrib["{http://history.state.gov/frus/ns/1.0}doc-dateTime-max"].split('T')[0]
    doc_date = datetime.strptime(doc_date, '%Y-%m-%d')

    doc_sent_place = doc_elem.find('.//placeName')
    if doc_sent_place is not None:
        doc_sent_place = doc_sent_place.text

    from_list = doc_elem.findall('head//*[@type="from"]')
    to_list = doc_elem.findall('head//*[@type="to"]')

    doc_sent_from = process_from_to_types(from_list)
    doc_sent_to = process_from_to_types(to_list)

    doc_source = doc_elem.find('.//*[@type="source"]')
    if doc_source is not None:
        doc_source = " ".join("".join(doc_source.itertext()).split())

    doc_dict["-".join([subseries_id,volume_id,doc_id])] = {'subseries':subseries_id, 'volume':volume_id, 'doc_id':doc_id,'date':doc_date, 'place':doc_sent_place, 'from':doc_sent_from, 'to':doc_sent_to, 'source':doc_source}

    return

for doc_elem in root.findall('text/body/div//div[@type="document"]'):
    process_document(doc_elem)

In [9]:
doc_dict

{'1973–1976-Volume XXX-d1': {'subseries': '1973–1976',
  'volume': 'Volume XXX',
  'doc_id': 'd1',
  'date': datetime.datetime(1973, 3, 30, 0, 0),
  'place': 'Washington',
  'from': ['#p_KHA1'],
  'to': ['#p_NRM1'],
  'source': 'Source: National Archives, Nixon Presidential Materials, NSC Files, Box 754, Presidential Correspondence File, Greece (Papadopoulos). Secret. Sent for action.'},
 '1973–1976-Volume XXX-d2': {'subseries': '1973–1976',
  'volume': 'Volume XXX',
  'doc_id': 'd2',
  'date': datetime.datetime(1973, 4, 21, 0, 0),
  'place': 'Athens',
  'from': ['the Embassy in Greece'],
  'to': ['the Department of State'],
  'source': 'Source: National Archives, RG 59, Central Files 1970–73, POL 15–1 GREECE. Secret; Exdis.'},
 '1973–1976-Volume XXX-d3': {'subseries': '1973–1976',
  'volume': 'Volume XXX',
  'doc_id': 'd3',
  'date': datetime.datetime(1973, 6, 12, 0, 0),
  'place': 'Washington',
  'from': ['#p_RKW1'],
  'to': ['#p_NRM1'],
  'source': 'Source: Ford Library, National Se

In [6]:
place = []

for k,v in doc_dict.items():
    place+=(v['to'])

In [7]:
from collections import Counter

Counter(place)

Counter({'#p_NRM1': 2,
         'the Department of State': 35,
         '#p_SJR1': 2,
         '#p_KHA1': 25,
         'the Department of\n                                State': 1,
         'the Mission to the North Atlantic Treaty\n                                Organization': 1,
         'the Departments of\n                                State and Defense': 2,
         'Department of State': 5,
         '#p_SB3': 9,
         '#p_FGR1': 18,
         'the Mission to the North Atlantic Treaty\n                                Organization and the Embassy in Greece': 2,
         'the Embassy in Greece': 2,
         'the Embassies in Greece and Cyprus': 1,
         '#p_DAF1': 2,
         '#p_WK5': 1,
         '#p_CLJ1': 3,
         'Secretary of Defense Schlesinger': 1,
         'Large McCloskey': 1,
         '#p_CWE1': 1,
         '#p_IRS1': 2,
         '#p_BWB1': 1,
         '#p_EB1': 3,
         '#p_HBMJ1': 2,
         '#p_CKC1': 1,
         'the Embassy in Turkey': 1,
         '#p_