In [1]:
import re
import glob
import math
import json
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

ns = {'xml': 'http://www.w3.org/XML/1998/namespace',
      'dflt': 'http://www.tei-c.org/ns/1.0',
      'frus':'http://history.state.gov/frus/ns/1.0',
      'xi':'http://www.w3.org/2001/XInclude'
      }

In [2]:
# ENTITY("YEAR")
year_df = pd.DataFrame({'year':np.arange(1861,1982)})
year_df.to_csv('tables/year.csv')

In [3]:
# ENTITY("DOCUMENT")
# ENTITY("PERSON_SENTBY")
# ENTITY("PERSON_SENTTO")
# ENTITY("PERSON_MENTIONED")
# ENTITY("INST_SENTBY")
# ENTITY("INST_SENTTO")
# ENTITY("INST_MENTIONED")

def extract_document(doc, volume):

    global doc_df

    global person_sentby_df
    global person_sentto_df
    global person_mentioned_df

    #global instution_sentby_df
    #global instution_sentto_df
    global instution_mentioned_df

    # id
    id_to_text = volume + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # subtype
    subtype = doc.attrib['subtype']

    # date and year and era
    date = None
    year = None
    era = None
    if subtype!='editorial-note':
        fmt = doc.attrib['{http://history.state.gov/frus/ns/1.0}doc-dateTime-max']
        date = datetime.strptime(fmt.split('T')[0], '%Y-%m-%d')
        year = datetime.strptime(fmt.split('T')[0], '%Y-%m-%d').year
        era = era_df[(era_df['startDate'] <= date) & (era_df['endDate'] > date)].president.values[0]

    # source
    source_tag = doc.find('.//dflt:note[@type="source"]',ns)
    if source_tag is not None:
        source = " ".join(ET.tostring(source_tag, encoding='unicode', method='text').split())
    else:
        source = None

    # title -includes removing note tag!
    head_tag = doc.find('./dflt:head', ns)
    child_note_tags = head_tag.findall('./dflt:note', ns)

    for note_tag in child_note_tags:
        head_tag.remove(note_tag)

    title = " ".join(ET.tostring(head_tag, encoding='unicode', method='text').split())

    # city
    place_tag = doc.find('.//dflt:placeName',ns)
    if place_tag is not None:
        txt = "".join(place_tag.itertext())
        txt = " ".join(txt.split())
        txt = " ".join(txt.split(',')[0].split())
        city = city_lookup_dict[txt]
        if city!=txt:
            print(txt,city)
    else:
        city = None

    # person_sentby
    person_sentby = []

    for pers_tag in doc.findall('.//dflt:persName[@type="from"]',ns):
        if pers_tag is not None: 
            if 'corresp' in pers_tag.attrib:
                if pers_tag.attrib['corresp'][0]=='#':
                    person_id = pers_tag.attrib['corresp'][1:]
                else:
                    person_id = pers_tag.attrib['corresp']
                person_id = volume + '_' + person_id
                person_name = person_id_lookup_dict[person_id]
                person_sentby.append(person_name)
                person_sentby_df = pd.concat((person_sentby_df, 
                                            pd.DataFrame({'person_name':[person_name],'sent':[id]})),
                                            ignore_index=True)
            else:
                txt = (" ".join(pers_tag.itertext()))
                txt = " ".join(txt.split())
                person_sentby.append(txt)

    #docs[0].findall('.//dflt:list',ns)[0].attrib #list -not included yet-

    signed_person_tag = doc.find('.//dflt:signed//dflt:persName',ns)
    if signed_person_tag is not None:
        if 'corresp' in signed_person_tag.attrib:
            person_id = signed_person_tag.attrib['corresp'][1:]
            if signed_person_tag.attrib['corresp'][0]=='#':
                person_id = signed_person_tag.attrib['corresp'][1:]
            else:
                person_id = signed_person_tag.attrib['corresp']
            person_id = volume + '_' + person_id
            person_name = person_id_lookup_dict[person_id]
            person_sentby.append(person_name)
            person_sentby_df = pd.concat((person_sentby_df, 
                                        pd.DataFrame({'person_name':[person_name],'sent':[id]})),
                                        ignore_index=True)
        else:
            txt = (" ".join(signed_person_tag.itertext()))
            txt = " ".join(txt.split())
            person_sentby.append(txt)

    # person_sentto
    person_sentto = []

    for pers_tag in doc.findall('.//dflt:persName[@type="to"]',ns):
        if pers_tag is not None:
            if 'corresp' in pers_tag.attrib:
                if pers_tag.attrib['corresp'][0]=='#':
                    person_id = pers_tag.attrib['corresp'][1:]
                else:
                    person_id = pers_tag.attrib['corresp']
                person_id = volume + '_' + person_id
                person_name = person_id_lookup_dict[person_id]
                person_sentto.append(person_name)
                person_sentto_df = pd.concat((person_sentto_df, 
                                            pd.DataFrame({'person_name':[person_name],'received':[id]})),
                                            ignore_index=True)
            else:
                txt = (" ".join(pers_tag.itertext()))
                txt = " ".join(txt.split())
                person_sentto.append(txt)

    #docs[0].findall('.//dflt:list[@type="to"]',ns)[0].attrib # list -not included yet-


    # inst_sentby
    inst_sentby = []

    for gloss_tag in doc.findall('.//dflt:gloss[@type="from"]',ns):

        txt = (" ".join(gloss_tag.itertext()))
        txt = " ".join(txt.split())
        inst_sentby.append(txt)

    # inst_sentto
    inst_sentto = []

    for gloss_tag in doc.findall('.//dflt:gloss[@type="to"]',ns):

        txt = (" ".join(gloss_tag.itertext()))
        txt = " ".join(txt.split())
        inst_sentto.append(txt)


    # person_mentioned -includes removing note tag!
    person_mentioned = set()

    notes_parent_tags = doc.findall('.//dflt:note/..',ns)

    for parent_tag in notes_parent_tags:

        for note_tag in parent_tag.findall('./dflt:note',ns):
            parent_tag.remove(note_tag)


    pers_tags = doc.findall('.//dflt:persName[@corresp]',ns)
    for temp_tag in pers_tags:
        if temp_tag.attrib['corresp'][0]=='#':
            person_id = temp_tag.attrib['corresp'][1:]
        else:
            person_id = temp_tag.attrib['corresp']
        person_id = volume + '_' + person_id
        person_name = person_id_lookup_dict[person_id]
        person_mentioned.add(person_name)
        person_mentioned_df = pd.concat((person_mentioned_df, 
                                    pd.DataFrame({'person_name':[person_name],'mentioned_in':[id]})),
                                    ignore_index=True)


    # inst_mentioned -includes removing note tag!
    instution_mentioned = set()

    inst_tags = doc.findall('.//dflt:gloss[@target]',ns)
    for temp_tag in inst_tags:
        inst_id = temp_tag.attrib['target'][1:]
        instution_mentioned.add(inst_id)
        instution_mentioned_df = pd.concat((instution_mentioned_df, 
                                    pd.DataFrame({'instution_id':[inst_id],'mentioned_in':[id]})),
                                    ignore_index=True)

    # free text
    free_text = ""

    tag_list = doc.findall('./*',ns)
    
    # find free text's start and end elements
    lidx,ridx = 0,0

    for idx,tag in enumerate(tag_list):
        if tag.tag not in not_text_tags:
            lidx = idx
            break
    
    for idx,tag in enumerate(tag_list[::-1]):
        if tag.tag in text_tags:
            ridx = len(tag_list)-1-idx
            break
    
    # remove all <note> in free text
    notes_parent_tags = doc.findall('.//dflt:note/..',ns)

    for parent_tag in notes_parent_tags:

        for note_tag in parent_tag.findall('./dflt:note',ns):
            parent_tag.remove(note_tag)

    # join free text pieces
    for f_tag in tag_list[lidx:ridx+1]:
        free_text += " ".join("".join(f_tag.itertext()).split()) + " "
    

    doc_df = pd.concat((doc_df, pd.DataFrame({'id_to_text':[id_to_text],'volume':[volume],'subtype':[subtype],
                                             'date':[date],'year':[year],'title':[title],
                                             'source':[source],'person_sentby':[person_sentby],'person_sentto':[person_sentto],
                                             'city':[city],'era':[era],'inst_sentby':[inst_sentby],
                                             'inst_sentto':[inst_sentto],'person_mentioned':[person_mentioned],
                                             'inst_mentioned':[instution_mentioned],'text':[free_text]
                                            })),ignore_index=True)
    
    return


# city lookup table for unification
with open('tables/city_lookup_dict.json', 'r') as f:
    city_lookup_dict = json.load(f)

# person id to unified name lookup table
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')

person_id_lookup_dict = {} # 'id':'corrected'
for _, row in new_unified_person_df.iterrows():

    for id in row['id_list']:
        if id not in person_id_lookup_dict:
            person_id_lookup_dict[id] = row['name_set']

# defining useful tag lists for free text's extraction
not_text_tags = ['{http://www.tei-c.org/ns/1.0}head',
                '{http://www.tei-c.org/ns/1.0}opener',
                '{http://www.tei-c.org/ns/1.0}dateline',
                '{http://www.tei-c.org/ns/1.0}note',
                '{http://www.tei-c.org/ns/1.0}table',]
text_tags = ['{http://www.tei-c.org/ns/1.0}p',
            '{http://www.tei-c.org/ns/1.0}list']

era_df = pd.read_csv('tables/era.csv')
era_df['startDate'] = era_df['startDate'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d'))
era_df['endDate'] = era_df['endDate'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d'))

doc_df = pd.DataFrame(columns=['id_to_text','volume','subtype','date','year','title','source','person_sentby',
                                  'person_sentto','city','era','inst_sentby','inst_sentto',
                                  'person_mentioned','inst_mentioned','text'])

person_sentby_df = pd.DataFrame(columns=['person_name','sent'])
person_sentto_df = pd.DataFrame(columns=['person_name','received'])
person_mentioned_df = pd.DataFrame(columns=['person_name','mentioned_in'])

#instution_sentby_df = pd.DataFrame(columns=['instution_id','sent'])
#instution_sentto_df = pd.DataFrame(columns=['instution_id','received'])
instution_mentioned_df = pd.DataFrame(columns=['instution_id','mentioned_in'])


# only use documents within this years
volume_root = 'frus1969-76'

#for file in glob.glob('volumes/'+volume_root+'*'):
for file in glob.glob('volumes/'+volume_root+'v30.xml'):
    volume = file[8:-4]

    tree = ET.parse(file)
    root = tree.getroot()

    docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)
    for doc in docs:
        extract_document(doc, volume)



doc_df.to_csv('tables/doc_69_76v30.csv')
#person_sentby_df.to_csv('tables/person_sentby_69_76.csv')
#person_sentto_df.to_csv('tables/person_sentto_69_76.csv')
#person_mentioned_df.to_csv('tables/person_mentioned_69_76.csv')

In [4]:
doc_df

Unnamed: 0,id_to_text,volume,subtype,date,year,title,source,person_sentby,person_sentto,city,era,inst_sentby,inst_sentto,person_mentioned,inst_mentioned,text
0,frus1969-76v30_d1,frus1969-76v30,historical-document,1973-03-30,1973,1. Memorandum From the President’s Assistant f...,"Source: National Archives, Nixon Presidential ...",[A. Henry Kissinger],[M. Nixon Richard],Washington,Richard M. Nixon,[],[],"{A. Henry Kissinger, M. Nixon Richard, Col. Ge...","{t_NATO1, t_FY1}",SUBJECT Letter to Prime Minister Papadopoulos ...
1,frus1969-76v30_d2,frus1969-76v30,historical-document,1973-04-21,1973,2. Telegram From the Embassy in Greece to the ...,"Source: National Archives, RG 59, Central File...",[Henry Tasca],[],Athens,Richard M. Nixon,[the Embassy in Greece],[the Department of State],"{Henry Tasca, Agnew Spiro T., Col. George Lt. ...","{t_PM1, t_FRG1, t_HMG1, t_GOG1}",2400. Subj: Greek Prime Minister Confronts Ser...
2,frus1969-76v30_d3,frus1969-76v30,historical-document,1973-06-12,1973,3. Memorandum From Acting Secretary of State R...,"Source: Ford Library, National Security Advise...","[Kenneth Rush, Kenneth Rush]",[M. Nixon Richard],Washington,Richard M. Nixon,[],[],"{Col. George Lt. Papadopoulos, M. Nixon Richar...","{t_NATO1, t_NSSM1}",SUBJECT Reappraisal of our Greek Policy Events...
3,frus1969-76v30_d4,frus1969-76v30,historical-document,1973-06-26,1973,4. Memorandum From the Chief of the Near East ...,"[Source: Central Intelligence Agency, Executiv...",[Waller],[James Schlesinger],Washington,Richard M. Nixon,[],[],{James Schlesinger},{},4. Memorandum From the Chief of the Near East ...
4,frus1969-76v30_d5,frus1969-76v30,historical-document,1973-07-19,1973,5. National Intelligence Estimate,"Source: Central Intelligence Agency, NIC Files...",[],[],Washington,Richard M. Nixon,[],[],"{Col. George Lt. Papadopoulos, Anghelis Gen. L...","{t_NATO1, t_NIE1}",[Omitted here is a table of contents.] SHORT-T...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,frus1969-76v30_d243,frus1969-76v30,historical-document,1976-07-29,1976,243. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],Washington,Gerald R. Ford,[],[],"{A. Henry Kissinger, Brent C. Scowcroft, Kurt ...","{t_UN1, t_NATO1, t_CIA1, t_ICJ1}",PARTICIPANTS President Ford Dr. Henry A. Kissi...
243,frus1969-76v30_d244,frus1969-76v30,historical-document,1976-07-29,1976,244. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],Washington,Gerald R. Ford,[],[],"{A. Henry Kissinger, Clerides Glaufkos, III Ma...","{t_RPP1, t_EURSE1}",SUBJECT Meeting with Former Prime Minister of ...
244,frus1969-76v30_d245,frus1969-76v30,historical-document,1976-08-14,1976,245. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],New York,Gerald R. Ford,[],[],"{A. Henry Kissinger, Constantine Karamanlis, I...","{t_UK1, t_DCA1, t_ICJ1}",PARTICIPANTS US: The Secretary Under Secretary...
245,frus1969-76v30_d246,frus1969-76v30,historical-document,1976-09-29,1976,246. Memorandum of Conversation,"Source: Library of Congress, Manuscript Divisi...",[],[],New York,Gerald R. Ford,[],[],"{A. John Tzounis, A. Andrei Gromyko, B. Macomb...","{t_UN1, t_EC1, t_ICJ1, t_UK1}",SUBJECT Secretary’s Meeting with Turkish Forei...


In [None]:
# ENTITY("INSTUTION")

In [None]:
# extracting texts

volume_root = 'frus1969-76'

not_text_tags = ['{http://www.tei-c.org/ns/1.0}head',
                '{http://www.tei-c.org/ns/1.0}opener',
                '{http://www.tei-c.org/ns/1.0}dateline',
                '{http://www.tei-c.org/ns/1.0}note',
                '{http://www.tei-c.org/ns/1.0}table',]

text_tags = ['{http://www.tei-c.org/ns/1.0}p',
            '{http://www.tei-c.org/ns/1.0}list']


#for file in glob.glob('volumes/'+volume_root+'*'):
for file in glob.glob('volumes/'+volume_root+'v30.xml'):
    volume = file[8:-4]

    tree = ET.parse(file)
    root = tree.getroot()

    docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)


    for idx,doc in enumerate(docs):
        
        free_text = ""

        tag_list = doc.findall('./*',ns)
        
        # find free text's start and end elements
        lidx,ridx = 0,0

        for idx,tag in enumerate(tag_list):
            if tag.tag not in not_text_tags:
                lidx = idx
                break
        
        for idx,tag in enumerate(tag_list[::-1]):
            if tag.tag in text_tags:
                ridx = len(tag_list)-1-idx
                break
        
        # remove all <note> in free text
        notes_parent_tags = doc.findall('.//dflt:note/..',ns)

        for parent_tag in notes_parent_tags:

            for note_tag in parent_tag.findall('./dflt:note',ns):
                parent_tag.remove(note_tag)


        for f_tag in tag_list[lidx:ridx+1]:
            free_text += " ".join("".join(f_tag.itertext()).split()) + " "
        
        print(volume,free_text)
        
    break
    
