In [1]:
import copy
import math
import re
import xml.etree.ElementTree as ET
from datetime import datetime

import numpy as np
import pandas as pd

In [2]:
# Prefix -> namespace URI map handed to the ElementTree find/findall calls in this notebook.
ns = dict(
    xml='http://www.w3.org/XML/1998/namespace',
    dflt='http://www.tei-c.org/ns/1.0',
    frus='http://history.state.gov/frus/ns/1.0',
    xi='http://www.w3.org/2001/XInclude',
)

In [3]:
# File name of the volume being processed; `volume[:-4]` (the name without
# its 4-character extension) is used later as the prefix of document ids.
volume = 'frus1969-76v30.xml'
# Build the path from `volume` so the parsed file and the id prefix can never
# drift apart when the volume name is changed.
tree = ET.parse('volumes/' + volume)
root = tree.getroot()

In [None]:
# year: frus:doc-dateTime-max
# sent from: <placeName>New York</placeName>
# volume from file name
# from: <gloss type="from"> or <persName type="from">
# to: <gloss type="to"> or <persName type="to">
# source: <... type="source" ..>

# ignore all <note/>

In [None]:
# ENTITY("PERSON")

# Accumulator table: extract_person appends one row per parsed person.
person_df = pd.DataFrame(columns=['id','name','description'])

def extract_person(person_item):
    """Append the person described by one ``<item>`` element to the global ``person_df``.

    The first ``persName`` descendant supplies the ``xml:id`` (stored as ``id``)
    and the listed name. The name is split on ``', '``, the pieces are reversed
    and re-joined with spaces, and remaining commas are dropped
    (``"Kissinger, Henry A."`` -> ``"Henry A. Kissinger"``). Everything in the
    item after the name, minus one optional separating comma, becomes the
    whitespace-normalised ``description``.

    Parameters
    ----------
    person_item : xml.etree.ElementTree.Element
        The ``<item>`` element to read.

    Returns
    -------
    None
        The row is added by rebinding the global ``person_df``. Items without
        a ``persName`` descendant are skipped and add no row.

    Raises
    ------
    KeyError
        If the ``persName`` element has no ``xml:id`` attribute.
    """
    global person_df

    persName_item = person_item.find('.//dflt:persName', ns)
    if persName_item is None:
        # Without a persName there is no xml:id to key the row on.
        return

    person_id = persName_item.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # itertext() instead of .text: .text is None (or only a fragment) when the
    # name is wrapped in a child element such as <hi>.
    listed_name = "".join(persName_item.itertext())

    all_text = "".join(person_item.itertext())
    after_name = all_text[all_text.find(listed_name) + len(listed_name):].lstrip()
    # Drop the separating comma only when it is really there, rather than
    # blindly skipping one character after the name.
    if after_name.startswith(','):
        after_name = after_name[1:]
    person_descp = " ".join(after_name.split())

    reordered = " ".join(listed_name.split(', ')[::-1])
    person_name = " ".join(reordered.replace(',', '').split())

    #person_id = volume[:-4] + '_' + person_id

    person_df = pd.concat((person_df, pd.DataFrame({'id':[person_id],
                                                'name':[person_name],
                                                'description':[person_descp]})),ignore_index=True)
    return


# Find the <div xml:id="persons"> under the volume's front matter and feed
# each <item> below it to extract_person.
persons_section = root.find(
    "./dflt:text/dflt:front//dflt:div[@xml:id='persons']",
    ns,
)
person_items = persons_section.findall('.//dflt:item', ns)
for item in person_items:
    extract_person(item)

# Write the accumulated person table to disk.
person_df.to_csv('tables/person_single_volume.csv')

In [None]:
# ENTITY("INSTITUTION")

In [None]:
# ENTITY("YEAR")

# One row per year from 1861 through 1981 (np.arange excludes the stop value).
year_df = pd.DataFrame(np.arange(1861, 1982), columns=['year'])
year_df.to_csv(path_or_buf='tables/year.csv')

In [4]:
# Load the era table and turn its 'YYYY-MM-DD' start/end strings into datetimes
# so they can be compared against document dates.
era_df = pd.read_csv('tables/era.csv').assign(
    startDate=lambda frame: frame['startDate'].apply(
        lambda day: datetime.strptime(day, '%Y-%m-%d')),
    endDate=lambda frame: frame['endDate'].apply(
        lambda day: datetime.strptime(day, '%Y-%m-%d')),
)

In [5]:
# ENTITY("DOCUMENT")
# ENTITY("PERSON_SENTBY")
# ENTITY("PERSON_SENTTO")
# ENTITY("PERSON_MENTIONED")
# ENTITY("INST_SENTBY")
# ENTITY("INST_SENTTO")
# ENTITY("INST_MENTIONED")

def _remove_keeping_tail(parent, child):
    """Detach ``child`` from ``parent`` without losing the text that follows it.

    ElementTree keeps the text after an element in that element's ``tail``,
    so a plain ``parent.remove(child)`` drops that text too. The tail is moved
    onto the previous sibling (or the parent's text) before removal.
    """
    if child.tail:
        siblings = list(parent)
        position = siblings.index(child)
        if position == 0:
            parent.text = (parent.text or '') + child.tail
        else:
            previous = siblings[position - 1]
            previous.tail = (previous.tail or '') + child.tail
    parent.remove(child)


def _flat_text(element):
    """Return the text inside ``element`` (its tail excluded) with whitespace collapsed."""
    return " ".join("".join(element.itertext()).split())


def _person_ref(pers_tag):
    """Return ``(value, is_id)`` for a persName element.

    When the element has a ``corresp`` attribute, ``value`` is that attribute
    without its first character and ``is_id`` is True. Otherwise ``value`` is
    the element's text pieces joined by spaces, whitespace collapsed, and
    ``is_id`` is False.
    """
    if 'corresp' in pers_tag.attrib:
        return pers_tag.attrib['corresp'][1:], True
    return " ".join(" ".join(pers_tag.itertext()).split()), False


def _append_rows(frame, columns, rows):
    """Return ``frame`` with ``rows`` (a list of tuples) appended by a single concat."""
    if not rows:
        return frame
    return pd.concat((frame, pd.DataFrame(rows, columns=columns)), ignore_index=True)


def extract_document(doc):
    """Record one document ``<div>`` in the global document and relation tables.

    Appends one row to ``doc_df`` and zero or more rows to
    ``person_sentby_df``, ``person_sentto_df``, ``person_mentioned_df`` and
    ``instution_mentioned_df`` (all rebound as globals).

    Extracted fields:

    * ``id``: ``volume`` without its last four characters + ``'_'`` + the
      element's ``xml:id``.
    * ``subtype``: the ``subtype`` attribute (None when absent).
    * ``date`` / ``year`` / ``era``: taken from ``frus:doc-dateTime-max`` for
      every subtype except ``'editorial-note'``; ``era`` is the ``president``
      of the first ``era_df`` row with ``startDate <= date < endDate``. Each is
      None when the attribute is missing or no era matches.
    * ``source``: text of the first ``note[@type="source"]``.
    * ``title``: text of the direct ``<head>`` child with its direct
      ``<note>`` children removed (None when there is no head).
    * ``city``: ``.text`` of the first ``placeName``.
    * senders / recipients: ``persName[@type="from"|"to"]`` plus the first
      ``persName`` under ``<signed>``; ``gloss[@type="from"|"to"]`` for
      institutions.
    * mentions: every ``persName[@corresp]`` and ``gloss[@target]`` left once
      all ``<note>`` elements have been removed; one relation row is written
      per occurrence.

    All note stripping happens on a deep copy, so ``doc`` itself is left
    untouched and the function gives the same result when called again on the
    same element.

    Parameters
    ----------
    doc : xml.etree.ElementTree.Element
        The document element to read.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``doc`` has no ``xml:id`` attribute.
    """
    global doc_df
    global person_sentby_df
    global person_sentto_df
    global person_mentioned_df

    #global instution_sentby_df
    #global instution_sentto_df
    global instution_mentioned_df

    # Notes are stripped below; do that on a private copy so the shared tree
    # (and therefore any re-run of this cell) still sees them.
    work = copy.deepcopy(doc)

    # id (named doc_id so the builtin id() is not shadowed)
    doc_id = volume[:-4] + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # subtype
    subtype = doc.attrib.get('subtype')

    # date and year and era
    date = None
    year = None
    era = None
    if subtype != 'editorial-note':
        stamp = doc.attrib.get('{http://history.state.gov/frus/ns/1.0}doc-dateTime-max')
        if stamp:
            date = datetime.strptime(stamp.split('T')[0], '%Y-%m-%d')
            year = date.year
            in_era = (era_df['startDate'] <= date) & (era_df['endDate'] > date)
            presidents = era_df.loc[in_era, 'president'].values
            if len(presidents):
                era = presidents[0]

    # source -- read before any note is removed. itertext() (via _flat_text)
    # keeps the text that follows the note out of the source string.
    source_tag = work.find('.//dflt:note[@type="source"]', ns)
    source = _flat_text(source_tag) if source_tag is not None else None

    # title -- head text without its direct note children; text following a
    # note is kept.
    head_tag = work.find('./dflt:head', ns)
    if head_tag is not None:
        for note_tag in head_tag.findall('./dflt:note', ns):
            _remove_keeping_tail(head_tag, note_tag)
        title = _flat_text(head_tag)
    else:
        title = None

    # city
    place_tag = work.find('.//dflt:placeName', ns)
    city = place_tag.text if place_tag is not None else None

    # person_sentby: typed "from" names first, then the signer (if any)
    sender_tags = work.findall('.//dflt:persName[@type="from"]', ns)
    signed_person_tag = work.find('.//dflt:signed//dflt:persName', ns)
    if signed_person_tag is not None:
        sender_tags.append(signed_person_tag)

    person_sentby = []
    sentby_rows = []
    for pers_tag in sender_tags:
        value, is_id = _person_ref(pers_tag)
        person_sentby.append(value)
        if is_id:
            sentby_rows.append((value, doc_id))

    # person_sentto
    person_sentto = []
    sentto_rows = []
    for pers_tag in work.findall('.//dflt:persName[@type="to"]', ns):
        value, is_id = _person_ref(pers_tag)
        person_sentto.append(value)
        if is_id:
            sentto_rows.append((value, doc_id))

    # inst_sentby / inst_sentto
    inst_sentby = [" ".join(" ".join(gloss_tag.itertext()).split())
                   for gloss_tag in work.findall('.//dflt:gloss[@type="from"]', ns)]
    inst_sentto = [" ".join(" ".join(gloss_tag.itertext()).split())
                   for gloss_tag in work.findall('.//dflt:gloss[@type="to"]', ns)]

    # Remove every remaining note so names that only occur in notes are not
    # counted as mentions.
    for parent_tag in work.findall('.//dflt:note/..', ns):
        for note_tag in parent_tag.findall('./dflt:note', ns):
            _remove_keeping_tail(parent_tag, note_tag)

    # person_mentioned: one relation row per occurrence, distinct ids in the set
    mentioned_ids = [tag.attrib['corresp'][1:]
                     for tag in work.findall('.//dflt:persName[@corresp]', ns)]
    person_mentioned = set(mentioned_ids)

    # inst_mentioned
    inst_ids = [tag.attrib['target'][1:]
                for tag in work.findall('.//dflt:gloss[@target]', ns)]
    instution_mentioned = set(inst_ids)

    # One concat per table per document instead of one per mention.
    person_sentby_df = _append_rows(person_sentby_df, ['person_id', 'sent'], sentby_rows)
    person_sentto_df = _append_rows(person_sentto_df, ['person_id', 'received'], sentto_rows)
    person_mentioned_df = _append_rows(person_mentioned_df, ['person_id', 'mentioned_in'],
                                       [(pid, doc_id) for pid in mentioned_ids])
    instution_mentioned_df = _append_rows(instution_mentioned_df, ['instution_id', 'mentioned_in'],
                                          [(iid, doc_id) for iid in inst_ids])

    doc_row = pd.DataFrame({'id': [doc_id], 'volume': [volume[:-4]], 'subtype': [subtype],
                            'date': [date], 'year': [year], 'title': [title],
                            'source': [source], 'person_sentby': [person_sentby],
                            'person_sentto': [person_sentto],
                            'city': [city], 'era': [era], 'inst_sentby': [inst_sentby],
                            'inst_sentto': [inst_sentto], 'person_mentioned': [person_mentioned],
                            'inst_mentioned': [instution_mentioned]})
    doc_df = pd.concat((doc_df, doc_row), ignore_index=True)

    return

# Empty accumulator tables; extract_document rebinds these globals as it
# appends rows to them.
doc_df = pd.DataFrame(columns=[
    'id', 'volume', 'subtype', 'date', 'year', 'title', 'source',
    'person_sentby', 'person_sentto', 'city', 'era',
    'inst_sentby', 'inst_sentto',
    'person_mentioned', 'inst_mentioned',
])

person_sentby_df = pd.DataFrame(columns=['person_id', 'sent'])
person_sentto_df = pd.DataFrame(columns=['person_id', 'received'])
person_mentioned_df = pd.DataFrame(columns=['person_id', 'mentioned_in'])

#instution_sentby_df = pd.DataFrame(columns=['instution_id','sent'])
#instution_sentto_df = pd.DataFrame(columns=['instution_id','received'])
instution_mentioned_df = pd.DataFrame(columns=['instution_id', 'mentioned_in'])


# Process every <div type="document"> found under the volume body.
docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)
for doc in docs:
    extract_document(doc)

# Write the document table and the three person relation tables.
# NOTE(review): instution_mentioned_df is filled by extract_document but is
# not written to disk here -- confirm whether that is intended.
doc_df.to_csv('tables/doc_single_volume.csv')
person_sentby_df.to_csv('tables/person_sentby_single_volume.csv')
person_sentto_df.to_csv('tables/person_sentto_single_volume.csv')
person_mentioned_df.to_csv('tables/person_mentioned_single_volume.csv')

In [7]:
doc_df

Unnamed: 0,id,volume,subtype,date,year,title,source,person_sentby,person_sentto,city,era,inst_sentby,inst_sentto,person_mentioned,inst_mentioned
0,frus1969-76v30_d1,frus1969-76v30,historical-document,1973-03-30,1973,1. Memorandum From the President’s Assistant f...,"Source: National Archives, Nixon Presidential ...",[p_KHA1],[p_NRM1],Washington,Richard M. Nixon,[],[],"{p_NRM1, p_PCG1, p_KHA1}","{t_NATO1, t_FY1}"
1,frus1969-76v30_d2,frus1969-76v30,historical-document,1973-04-21,1973,2. Telegram From the Embassy in Greece to the ...,"Source: National Archives, RG 59, Central File...",[p_THJ1],[],Athens,Richard M. Nixon,[the Embassy in Greece],[the Department of State],"{p_GG5, p_TC5, p_AST1, p_PCG1, p_BWHF1, p_IBGD...","{t_GOG1, t_FRG1, t_HMG1, t_PM1}"
2,frus1969-76v30_d3,frus1969-76v30,historical-document,1973-06-12,1973,3. Memorandum From Acting Secretary of State R...,"Source: Ford Library, National Security Advise...","[p_RKW1, p_RKW1]",[p_NRM1],Washington,Richard M. Nixon,[],[],"{p_NRM1, p_PCG1, p_RKW1, p_THJ1}","{t_NSSM1, t_NATO1}"
3,frus1969-76v30_d4,frus1969-76v30,historical-document,1973-06-26,1973,4. Memorandum From the Chief of the Near East ...,"[Source: Central Intelligence Agency, Executiv...",[Waller],[p_SJR1],Washington,Richard M. Nixon,[],[],{p_SJR1},{}
4,frus1969-76v30_d5,frus1969-76v30,historical-document,1973-07-19,1973,5. National Intelligence Estimate,"Source: Central Intelligence Agency, NIC Files...",[],[],Washington,Richard M. Nixon,[],[],"{p_PCG1, p_IBGD1, p_CKC1, p_AGO2}","{t_NATO1, t_NIE1}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,frus1969-76v30_d243,frus1969-76v30,historical-document,1976-07-29,1976,243. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],Washington,Gerald R. Ford,[],[],"{p_WK5, p_DS1, p_EM2, p_SB3, p_EB1, p_MIII1, p...","{t_ICJ1, t_NATO1, t_CIA1, t_UN1}"
243,frus1969-76v30_d244,frus1969-76v30,historical-document,1976-07-29,1976,244. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],Washington,Gerald R. Ford,[],[],"{p_BD6, p_EM2, p_EB1, p_CG3, p_HAA4, p_MIII1, ...","{t_EURSE1, t_RPP1}"
244,frus1969-76v30_d245,frus1969-76v30,historical-document,1976-08-14,1976,245. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],New York,Gerald R. Ford,[],[],"{p_PA9, p_BD6, p_DS1, p_EM2, p_JUA1, p_EB1, p_...","{t_UK1, t_ICJ1, t_DCA1}"
245,frus1969-76v30_d246,frus1969-76v30,historical-document,1976-09-29,1976,246. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],New York,Gerald R. Ford,[],[],"{p_BD6, p_EM2, p_HAA4, p_HPC1, p_GAA1, p_MWBJ1...","{t_ICJ1, t_EC1, t_UK1, t_UN1}"


In [None]:
# ENTITY("COUNTRY")