In [1]:
import re
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

In [2]:
# Prefix -> namespace URI map handed to every ElementTree find/findall call below.
ns = dict(
    xml='http://www.w3.org/XML/1998/namespace',
    dflt='http://www.tei-c.org/ns/1.0',
    frus='http://history.state.gov/frus/ns/1.0',
    xi='http://www.w3.org/2001/XInclude',
)

In [3]:
# File name of the volume being processed; its stem (volume[:-4]) prefixes document ids.
volume = 'frus1969-76v30.xml'
# Derive the path from `volume` so the parsed file and the id prefix cannot drift apart.
tree = ET.parse('volumes/' + volume)
root = tree.getroot()

In [None]:
# Where each document field comes from in the TEI markup:
#   year:      the frus:doc-dateTime-max attribute
#   sent from: <placeName>, e.g. <placeName>New York</placeName>
#   volume:    taken from the file name
#   from:      <gloss type="from"> or <persName type="from">
#   to:        <gloss type="to"> or <persName type="to">
#   source:    <... type="source" ..>

# Ignore all <note/> elements.

In [None]:
# ENTITY("PERSON")

# Accumulator for the PERSON table; extract_person() appends one row per person to it.
person_df = pd.DataFrame(columns=['id','name','description'])

def extract_person(person_item):
    """Append one entry of the volume's persons list to the global ``person_df``.

    Parameters
    ----------
    person_item : xml.etree.ElementTree.Element
        An ``<item>`` of the persons list. Its first ``<persName>`` supplies the
        name (written "Last, First") and the ``xml:id``; the text of the item
        that follows the name is stored as the description.

    Returns
    -------
    None
        A row ``(id, name, description)`` is appended to the module-level
        ``person_df``. Items that contain no ``<persName>`` are skipped.

    Raises
    ------
    KeyError
        If the ``<persName>`` carries no ``xml:id`` attribute.
    """
    global person_df

    persName_item = person_item.find('.//dflt:persName', ns)
    if persName_item is None:
        # Nothing to record: the item does not name a person.
        return

    # itertext() (unlike .text) also yields names wrapped in nested markup,
    # e.g. <persName><hi>Last, First</hi></persName>, where .text is None.
    raw_name = "".join(persName_item.itertext())
    person_id = persName_item.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # The description is whatever follows the name inside the item. Drop the
    # separating comma only when it is really there instead of blindly skipping
    # one character.
    all_text = "".join(person_item.itertext())
    name_end = all_text.find(raw_name) + len(raw_name)
    remainder = all_text[name_end:].lstrip()
    if remainder.startswith(','):
        remainder = remainder[1:]
    person_descp = " ".join(remainder.split())

    # "Last, First" -> "First Last". Whitespace is collapsed first so a line
    # break after the comma does not prevent the split.
    flat_name = " ".join(raw_name.split())
    person_name = " ".join(re.sub(',', '', " ".join(flat_name.split(', ')[::-1])).split())

    #person_id = volume[:-4] + '_' + person_id

    person_df = pd.concat((person_df, pd.DataFrame({'id': [person_id],
                                                    'name': [person_name],
                                                    'description': [person_descp]})),
                          ignore_index=True)
    return


# Walk the "persons" list in the volume's front matter and record every entry.
persons_section = root.find("./dflt:text/dflt:front//dflt:div[@xml:id='persons']", ns)
for item in persons_section.iterfind('.//dflt:item', ns):
    extract_person(item)

# Persist the person table of this volume.
person_df.to_csv(path_or_buf='tables/person_single_volume.csv')

In [None]:
# ENTITY("YEAR")

# One row per calendar year from 1861 through 1981 (the stop value 1982 is exclusive).
year_df = pd.DataFrame(data={'year': np.arange(1861, 1982)})
year_df.to_csv(path_or_buf='tables/year.csv')

In [4]:
# ENTITY("DOCUMENT")

def _element_text(elem, sep=""):
    """Return the whitespace-collapsed text found inside ``elem``.

    Only text inside the element is used; the tail that follows its closing tag
    is not included. ``sep`` is placed between the individual text fragments.
    """
    return " ".join(sep.join(elem.itertext()).split())


def _remove_child_notes(parent):
    """Remove the direct ``<note>`` children of ``parent`` but keep the text after them."""
    note_tag = '{%s}note' % ns['dflt']
    previous = None
    for child in list(parent):
        if child.tag != note_tag:
            previous = child
            continue
        # ElementTree keeps the text that follows an element in its ``tail``;
        # hand it to the preceding node so it survives the removal.
        if child.tail:
            if previous is None:
                parent.text = (parent.text or '') + child.tail
            else:
                previous.tail = (previous.tail or '') + child.tail
        parent.remove(child)


def _person_refs(pers_tags):
    """Return the ``corresp`` value of each persName, or its text when it has none."""
    refs = []
    for pers_tag in pers_tags:
        if 'corresp' in pers_tag.attrib:
            refs.append(pers_tag.attrib['corresp'])
        else:
            refs.append(_element_text(pers_tag, sep=" "))
    return refs


def extract_document(doc):
    """Append the metadata of one document ``<div>`` to the global ``doc_df``.

    Parameters
    ----------
    doc : xml.etree.ElementTree.Element
        A ``<div type="document">`` element. It is not modified: all note
        stripping happens on a private deep copy, so the function can be re-run
        on the same tree and other cells still see the ``<note>`` elements.

    Returns
    -------
    None
        One row is appended to the module-level ``doc_df`` with the columns
        id, volume, subtype, date, year, title, source, person_sentby,
        person_sentto, city, era, inst_sentby, inst_sentto, person_mentioned
        and inst_mentioned. ``date``/``year`` are None for editorial notes and
        for documents without a ``frus:doc-dateTime-max`` attribute; ``title``
        is None when the document has no ``<head>``; ``era`` is always None.

    Raises
    ------
    KeyError
        If the document has no ``xml:id`` attribute.
    """
    import copy  # only needed here; keeps the notebook's import cell untouched

    global doc_df

    work = copy.deepcopy(doc)

    # id
    doc_id = volume[:-4] + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # subtype
    subtype = doc.attrib.get('subtype')

    # date and year
    date = None
    year = None
    stamp = doc.attrib.get('{http://history.state.gov/frus/ns/1.0}doc-dateTime-max')
    if subtype != 'editorial-note' and stamp:
        date = datetime.strptime(stamp.split('T')[0], '%Y-%m-%d')
        year = date.year

    # source: text inside the source note only (not the text that follows it)
    source_tag = work.find('.//dflt:note[@type="source"]', ns)
    source = _element_text(source_tag) if source_tag is not None else None

    # title: the head without its footnotes, keeping the words after a footnote marker
    head_tag = work.find('./dflt:head', ns)
    if head_tag is not None:
        _remove_child_notes(head_tag)
        title = _element_text(head_tag)
    else:
        title = None

    # city
    place_tag = work.find('.//dflt:placeName', ns)
    city = place_tag.text if place_tag is not None else None

    # era
    era = None

    # person_sentby: persNames marked type="from" plus the first signatory
    person_sentby = _person_refs(work.findall('.//dflt:persName[@type="from"]', ns))

    #docs[0].findall('.//dflt:list',ns)[0].attrib #list -not included yet-

    signed_person_tag = work.find('.//dflt:signed//dflt:persName', ns)
    if signed_person_tag is not None:
        person_sentby.extend(_person_refs([signed_person_tag]))

    # person_sentto
    person_sentto = _person_refs(work.findall('.//dflt:persName[@type="to"]', ns))

    #docs[0].findall('.//dflt:list[@type="to"]',ns)[0].attrib # list -not included yet-

    # inst_sentby / inst_sentto
    inst_sentby = [_element_text(gloss_tag, sep=" ")
                   for gloss_tag in work.findall('.//dflt:gloss[@type="from"]', ns)]
    inst_sentto = [_element_text(gloss_tag, sep=" ")
                   for gloss_tag in work.findall('.//dflt:gloss[@type="to"]', ns)]

    # Mentions are collected from the running text only, so drop every remaining note.
    for parent_tag in work.findall('.//dflt:note/..', ns):
        _remove_child_notes(parent_tag)

    # person_mentioned
    person_mentioned = {pers_tag.attrib['corresp']
                        for pers_tag in work.findall('.//dflt:persName[@corresp]', ns)}

    # inst_mentioned
    instution_mentioned = {gloss_tag.attrib['target']
                           for gloss_tag in work.findall('.//dflt:gloss[@target]', ns)}

    row = pd.DataFrame({'id': [doc_id], 'volume': [volume[:-4]], 'subtype': [subtype],
                        'date': [date], 'year': [year], 'title': [title],
                        'source': [source], 'person_sentby': [person_sentby],
                        'person_sentto': [person_sentto],
                        'city': [city], 'era': [era], 'inst_sentby': [inst_sentby],
                        'inst_sentto': [inst_sentto], 'person_mentioned': [person_mentioned],
                        'inst_mentioned': [instution_mentioned]})
    doc_df = pd.concat((doc_df, row), ignore_index=True)

    return

# Empty DOCUMENT table; extract_document() appends one row per document to it.
doc_df = pd.DataFrame(columns=[
    'id', 'volume', 'subtype', 'date', 'year', 'title', 'source',
    'person_sentby', 'person_sentto', 'city', 'era',
    'inst_sentby', 'inst_sentto',
    'person_mentioned', 'inst_mentioned',
])

# Every <div type="document"> below the body is handled as one document.
docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)
for doc in docs:
    extract_document(doc)

# Persist the document table of this volume.
doc_df.to_csv(path_or_buf='tables/doc_single_volume.csv')

In [11]:
doc_df['person_sentto']

0      [#p_NRM1]
1             []
2      [#p_NRM1]
3      [#p_SJR1]
4             []
         ...    
242           []
243           []
244           []
245           []
246           []
Name: person_sentto, Length: 247, dtype: object

In [None]:
# ENTITY("ERA")
# ENTITY("COUNTRY")

## Everything above this cell is done.
## The cells below are still under construction.

In [None]:
# this function handles file's sender and receiver
def process_from_to_types(from_list):
    """Turn sender/receiver elements into a list of person ids or plain names.

    Parameters
    ----------
    from_list : iterable of xml.etree.ElementTree.Element
        ``persName`` and/or ``gloss`` elements, with or without a namespace.

    Returns
    -------
    list of str
        For a ``persName`` with a ``corresp`` attribute, that attribute value;
        for any other ``persName`` or ``gloss``, the whitespace-collapsed text
        of the element including nested markup.

    Raises
    ------
    NotImplementedError
        If an element is neither a ``persName`` nor a ``gloss``.
    """
    processed_list = []

    for item in from_list:

        # ElementTree spells namespaced tags as '{uri}local', so compare on the
        # local part; a bare 'persName' / 'gloss' keeps working too.
        tag = item.tag
        local_name = tag.rsplit('}', 1)[-1] if isinstance(tag, str) else tag

        if local_name == 'persName' and 'corresp' in item.attrib:
            processed_list.append(item.attrib['corresp'])
        elif local_name in ('persName', 'gloss'):
            # itertext() also covers names wrapped in nested markup, where .text is None.
            processed_list.append(" ".join("".join(item.itertext()).split()))
        else:
            raise NotImplementedError('from type tag unidentified')

    return processed_list

In [None]:
# Sub-series and volume number are read from the TEI header. The elements live in
# the TEI namespace, so every path step needs the ``dflt:`` prefix from ``ns``;
# unprefixed steps match nothing and ``.find`` would return None.
subseries_id = root.find('dflt:teiHeader/dflt:fileDesc/dflt:titleStmt/dflt:title[@type="sub-series"]', ns).text
volume_id = root.find('dflt:teiHeader/dflt:fileDesc/dflt:titleStmt/dflt:title[@type="volume-number"]', ns).text

# Maps "<subseries>-<volume>-<doc id>" to the metadata dict of one document.
doc_dict = {}

def process_document(doc_elem):
    """Store the metadata of one document ``<div>`` in the global ``doc_dict``.

    Parameters
    ----------
    doc_elem : xml.etree.ElementTree.Element
        A document ``<div>`` in the TEI namespace.

    Returns
    -------
    None
        An entry keyed ``"<subseries_id>-<volume_id>-<doc id>"`` is written to
        ``doc_dict`` with the keys subseries, volume, doc_id, date, place, from,
        to and source. ``date``, ``place`` and ``source`` are None when the
        document does not provide them.

    Raises
    ------
    KeyError
        If the document has no ``xml:id`` attribute.
    """
    doc_id = doc_elem.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # The date attribute may be absent (e.g. on editorial notes); keep None
    # instead of failing on the lookup.
    doc_date = doc_elem.attrib.get("{http://history.state.gov/frus/ns/1.0}doc-dateTime-max")
    if doc_date is not None:
        doc_date = datetime.strptime(doc_date.split('T')[0], '%Y-%m-%d')

    # Element names must carry the TEI prefix; an unprefixed 'placeName' or
    # 'head' never matches a namespaced tree.
    doc_sent_place = doc_elem.find('.//dflt:placeName', ns)
    if doc_sent_place is not None:
        doc_sent_place = doc_sent_place.text

    from_list = doc_elem.findall('dflt:head//*[@type="from"]', ns)
    to_list = doc_elem.findall('dflt:head//*[@type="to"]', ns)

    doc_sent_from = process_from_to_types(from_list)
    doc_sent_to = process_from_to_types(to_list)

    # '*' matches any element name, so this lookup needs no prefix.
    doc_source = doc_elem.find('.//*[@type="source"]')
    if doc_source is not None:
        doc_source = " ".join("".join(doc_source.itertext()).split())

    doc_dict["-".join([subseries_id,volume_id,doc_id])] = {'subseries':subseries_id, 'volume':volume_id, 'doc_id':doc_id,'date':doc_date, 'place':doc_sent_place, 'from':doc_sent_from, 'to':doc_sent_to, 'source':doc_source}

    return

# The tree is in the TEI namespace, so each step of the path needs the ``dflt:``
# prefix; without it findall() returns an empty list and no document is processed.
for doc_elem in root.findall('dflt:text/dflt:body/dflt:div//dflt:div[@type="document"]', ns):
    process_document(doc_elem)