In [1]:
import re
import glob
import math
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
import xml.etree.ElementTree as ET
import ray
# Initialise Ray for this notebook, limiting it to 13 CPUs.
ray.init(num_cpus=13)

# Prefix -> namespace URI map handed to every ElementTree find()/findall()
# call in this notebook ('dflt' is the TEI namespace used by the volume files).
ns = {'xml': 'http://www.w3.org/XML/1998/namespace',
      'dflt': 'http://www.tei-c.org/ns/1.0',
      'frus':'http://history.state.gov/frus/ns/1.0',
      'xi':'http://www.w3.org/2001/XInclude'
      }

In [2]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from lexicalrichness import LexicalRichness

# Load spaCy's small English pipeline and append the spacytextblob component;
# extract_document reads `spacy_doc._.blob` for subjectivity and polarity.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

# Directory prefix for the lookup tables read further down
# (city_lookup_dict.json and the unified person/institution parquet files).
tables_path = 'tables/tables_52_62/'

In [3]:
# ENTITY("DOCUMENT")
# ENTITY("PERSON_SENTBY")
# ENTITY("PERSON_SENTTO")
# ENTITY("PERSON_MENTIONED")
# ENTITY("INST_MENTIONED")

def _squash_whitespace(text):
    """Collapse every run of whitespace in ``text`` to a single space and trim the ends."""
    return " ".join(text.split())


def _qualified_id(raw_ref, volume):
    """Turn a ``corresp``/``target`` reference into a volume-prefixed id.

    A single leading ``#`` is dropped when present. ``startswith`` is used
    instead of indexing so an empty attribute value does not raise IndexError.
    """
    local_id = raw_ref[1:] if raw_ref.startswith('#') else raw_ref
    return volume + '_' + local_id


def _remove_keeping_tail(parent, child):
    """Detach ``child`` from ``parent`` without losing the text that follows it.

    ElementTree stores the text after an element's end tag in ``child.tail``,
    so a plain ``parent.remove(child)`` would drop that text as well. The tail
    is therefore moved onto the previous sibling (or onto ``parent.text`` when
    ``child`` is the first child) before the element is removed.
    """
    tail = child.tail
    if tail:
        siblings = list(parent)
        position = siblings.index(child)
        if position == 0:
            parent.text = (parent.text or '') + tail
        else:
            previous = siblings[position - 1]
            previous.tail = (previous.tail or '') + tail
    parent.remove(child)


def _remove_child_notes(parent):
    """Remove every direct ``<note>`` child of ``parent``, preserving surrounding text."""
    for note_tag in parent.findall('./dflt:note', ns):
        _remove_keeping_tail(parent, note_tag)


def _collect_persons(pers_tags, volume, id_to_text, link_key):
    """Resolve ``<persName>`` elements into names plus relation records.

    Elements carrying ``corresp`` are looked up in ``person_id_lookup_dict``;
    unresolved ids are skipped. Elements without ``corresp`` contribute their
    whitespace-normalised text to the name list only.

    Returns ``(names, links)`` where each link is
    ``{'person_name': name, link_key: id_to_text}``.
    """
    names = []
    links = []
    for pers_tag in pers_tags:
        if 'corresp' in pers_tag.attrib:
            person_id = _qualified_id(pers_tag.attrib['corresp'], volume)
            person_name = person_id_lookup_dict.get(person_id, None)
            if person_name:
                names.append(person_name)
                links.append({'person_name': person_name, link_key: id_to_text})
        else:
            names.append(_squash_whitespace(" ".join(pers_tag.itertext())))
    return names, links


def extract_document(doc, volume):
    """Extract metadata, relations and text measures from one document ``<div>``.

    Parameters
    ----------
    doc : xml.etree.ElementTree.Element
        The document element. It is modified in place: ``<note>`` elements are
        removed (their trailing text is kept).
    volume : str
        Volume identifier, used as prefix for every id produced here.

    Returns
    -------
    tuple
        ``(person_sentby_dict_list, person_sentto_dict_list,
        person_mentioned_dict_list, institution_mentioned_dict_list, doc_dict)``
        - four lists of relation records followed by the document record.

    Raises
    ------
    KeyError
        If the ``xml:id`` or ``subtype`` attribute is missing, if a
        non-editorial-note document lacks ``frus:doc-dateTime-max``, or if the
        place name is not a key of ``city_lookup_dict``.

    Relies on the notebook-level names ``ns``, ``era_df``, ``city_lookup_dict``,
    ``person_id_lookup_dict``, ``institution_id_lookup_dict``, ``not_text_tags``,
    ``text_tags``, ``nlp`` and ``LexicalRichness``.
    """
    # id
    id_to_text = volume + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # subtype
    subtype = doc.attrib['subtype']

    # date, year and era (editorial notes carry none of them)
    date = None
    year = None
    era = None
    if subtype != 'editorial-note':
        fmt = doc.attrib['{http://history.state.gov/frus/ns/1.0}doc-dateTime-max']
        date = datetime.strptime(fmt.split('T')[0], '%Y-%m-%d')
        year = date.year
        era_matches = era_df[(era_df['startDate'] <= date) & (era_df['endDate'] > date)].president.values
        # A date outside every era interval yields no era instead of an IndexError.
        era = era_matches[0] if len(era_matches) else None

    # source (must be read before any note is removed)
    source_tag = doc.find('.//dflt:note[@type="source"]', ns)
    if source_tag is not None:
        source = _squash_whitespace(ET.tostring(source_tag, encoding='unicode', method='text'))
    else:
        source = None

    # title - the heading text without its direct <note> children
    head_tag = doc.find('./dflt:head', ns)
    if head_tag is not None:
        _remove_child_notes(head_tag)
        title = _squash_whitespace(ET.tostring(head_tag, encoding='unicode', method='text'))
    else:
        # A document without <head> gets no title rather than crashing.
        title = None

    # city - text of the first <placeName> up to the first comma, unified via the lookup
    place_tag = doc.find('.//dflt:placeName', ns)
    if place_tag is not None:
        txt = _squash_whitespace("".join(place_tag.itertext()))
        txt = _squash_whitespace(txt.split(',')[0])
        city = city_lookup_dict[txt]
    else:
        city = None

    # person_sentby - <persName type="from"> elements followed by the first signer
    person_sentby, person_sentby_dict_list = _collect_persons(
        doc.findall('.//dflt:persName[@type="from"]', ns), volume, id_to_text, 'sent')

    signed_person_tag = doc.find('.//dflt:signed//dflt:persName', ns)
    if signed_person_tag is not None:
        signed_names, signed_links = _collect_persons(
            [signed_person_tag], volume, id_to_text, 'sent')
        person_sentby += signed_names
        person_sentby_dict_list += signed_links

    # person_sentto
    person_sentto, person_sentto_dict_list = _collect_persons(
        doc.findall('.//dflt:persName[@type="to"]', ns), volume, id_to_text, 'received')

    # inst_sentby / inst_sentto
    inst_sentby = [_squash_whitespace(" ".join(gloss_tag.itertext()))
                   for gloss_tag in doc.findall('.//dflt:gloss[@type="from"]', ns)]
    inst_sentto = [_squash_whitespace(" ".join(gloss_tag.itertext()))
                   for gloss_tag in doc.findall('.//dflt:gloss[@type="to"]', ns)]

    # Remove every remaining <note> so that neither the mention lists nor the
    # free text include footnote content. One pass is enough: nothing below
    # re-introduces notes.
    for parent_tag in doc.findall('.//dflt:note/..', ns):
        _remove_child_notes(parent_tag)

    # person_mentioned
    person_mentioned = set()
    person_mentioned_dict_list = []

    for temp_tag in doc.findall('.//dflt:persName[@corresp]', ns):
        person_id = _qualified_id(temp_tag.attrib['corresp'], volume)
        person_name = person_id_lookup_dict.get(person_id, None)
        if person_name:
            person_mentioned.add(person_name)
            person_mentioned_dict_list.append({'person_name': person_name, 'mentioned_in': id_to_text})

    # inst_mentioned
    instution_mentioned = set()
    institution_mentioned_dict_list = []

    for temp_tag in doc.findall('.//dflt:gloss[@target]', ns):
        term_id = _qualified_id(temp_tag.attrib['target'], volume)
        term_description_set = institution_id_lookup_dict.get(term_id, None)
        if term_description_set:
            instution_mentioned.add(term_description_set)
            institution_mentioned_dict_list.append({'description_set': term_description_set, 'mentioned_in': id_to_text})

    # free text
    tag_list = doc.findall('./*', ns)

    # Start at the first child that is not a known non-text tag. When there is
    # no such child the start index stays past the end, so the slice below is
    # empty instead of falling back to the heading.
    lidx = len(tag_list)
    for idx, tag in enumerate(tag_list):
        if tag.tag not in not_text_tags:
            lidx = idx
            break

    # End at the last child that is a text tag.
    ridx = 0
    for idx, tag in enumerate(tag_list[::-1]):
        if tag.tag in text_tags:
            ridx = len(tag_list) - 1 - idx
            break

    # join free text pieces
    free_text = ""
    for f_tag in tag_list[lidx:ridx + 1]:
        free_text += _squash_whitespace("".join(f_tag.itertext())) + " "

    # Represent a document without any text by "-" to deal with nan values
    # later. Whitespace-only text counts as empty: every piece appends " ".
    if not free_text.strip():
        free_text = "-"

    # compute string measures (lexical richness, polarity, token count)
    spacy_doc = nlp(free_text)
    lex = LexicalRichness(free_text)
    txt_len = len(spacy_doc)
    subj = round(spacy_doc._.blob.subjectivity, 2)
    pol = round(spacy_doc._.blob.polarity, 2)
    ttr = round(lex.ttr, 2)
    cttr = round(lex.cttr, 2)

    doc_dict = {'id_to_text': id_to_text, 'volume': volume, 'subtype': subtype,
                'date': date, 'year': year, 'title': title,
                'source': source, 'person_sentby': person_sentby, 'person_sentto': person_sentto,
                'city': city, 'era': era, 'inst_sentby': inst_sentby,
                'inst_sentto': inst_sentto, 'person_mentioned': person_mentioned,
                'inst_mentioned': instution_mentioned, 'text': free_text,
                'txt_len': txt_len, 'subj': subj, 'pol': pol, 'ttr': ttr, 'cttr': cttr,
                }

    return person_sentby_dict_list, person_sentto_dict_list, person_mentioned_dict_list, institution_mentioned_dict_list, doc_dict


def _build_first_seen_lookup(unified_df, value_column):
    """Map every id in ``unified_df['id_list']`` to that row's ``value_column`` value.

    When an id appears in more than one row, the first row (in frame order)
    wins. Wrapping the loop in a function keeps its loop variables out of the
    notebook namespace - in particular it no longer rebinds the builtin ``id``.
    """
    lookup = {}  # 'id': 'corrected'
    for id_list, value in zip(unified_df['id_list'], unified_df[value_column]):
        for entity_id in id_list:
            lookup.setdefault(entity_id, value)
    return lookup


# city lookup table for unification
with open(tables_path+'city_lookup_dict.json', 'r') as f:
    city_lookup_dict = json.load(f)

# person id to unified name lookup table
new_unified_person_df = pd.read_parquet(tables_path+'new_unified_person_df_final.parquet')
person_id_lookup_dict = _build_first_seen_lookup(new_unified_person_df, 'name_set')

# term id to unified name lookup table
new_unified_institution_df = pd.read_parquet(tables_path+'new_unified_institution_df.parquet')
institution_id_lookup_dict = _build_first_seen_lookup(new_unified_institution_df, 'description_set')

# Tag lists used by extract_document to delimit a document's free text:
# the text starts at the first direct child whose tag is NOT in not_text_tags
# and ends at the last direct child whose tag IS in text_tags.
# Tags are written in ElementTree's '{namespace-uri}localname' form.
not_text_tags = ['{http://www.tei-c.org/ns/1.0}head',
                '{http://www.tei-c.org/ns/1.0}opener',
                '{http://www.tei-c.org/ns/1.0}dateline',
                '{http://www.tei-c.org/ns/1.0}note',
                '{http://www.tei-c.org/ns/1.0}table',]
text_tags = ['{http://www.tei-c.org/ns/1.0}p',
            '{http://www.tei-c.org/ns/1.0}list']

# Era table; extract_document picks the `president` of the row whose
# [startDate, endDate) interval contains the document date.
# Both date columns are parsed from 'YYYY-MM-DD' strings.
era_df = pd.read_csv('tables/era.csv')
era_df['startDate'] = era_df['startDate'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d'))
era_df['endDate'] = era_df['endDate'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d'))

#doc_df = pd.DataFrame(columns=['id_to_text','volume','subtype','date','year','title','source','person_sentby',
#                                  'person_sentto','city','era','inst_sentby','inst_sentto',
#                                  'person_mentioned','inst_mentioned','text',
#                                  'txt_len','subj','pol','ttr','cttr'])

#person_sentby_df = pd.DataFrame(columns=['person_name','sent'])
#person_sentto_df = pd.DataFrame(columns=['person_name','received'])
#person_mentioned_df = pd.DataFrame(columns=['person_name','mentioned_in'])

#instution_mentioned_df = pd.DataFrame(columns=['description_set','mentioned_in'])

# Notebook-wide accumulators: the extraction loop appends the records
# returned by extract_document for every processed document.
global_person_sentby_list = []
global_person_sentto_list = []
global_person_mentioned_list = []
global_institution_mentioned_list = []
global_doc_list = []

# Only process volumes whose start year lies within these years (inclusive).
# NOTE(review): tables_path points at 'tables/tables_52_62/' while this window
# is 1980-1988 - confirm the lookup tables and the year range are meant to match.
start_year, end_year = 1980, 1988

@ray.remote
def extract_volume(file):
    """Extract all documents of one volume file and return their records.

    The previous version did ``global_..._list += ...`` inside the function.
    Augmented assignment makes those names local, so the first call raised
    UnboundLocalError; and as a Ray task it runs in a worker process, so
    changes to the driver's lists would not have been visible anyway. The
    records are therefore returned and the caller merges them, e.g.
    ``ray.get(extract_volume.remote(path))``.

    Parameters
    ----------
    file : str
        Path of the form ``'volumes/frus<YYYY>....xml'``; characters 12-15 are
        read as the volume's start year and ``file[8:-4]`` as the volume id.

    Returns
    -------
    tuple of five lists
        ``(person_sentby, person_sentto, person_mentioned,
        institution_mentioned, docs)``. All lists are empty when the volume's
        start year is outside ``[start_year, end_year]``.
    """
    person_sentby_records = []
    person_sentto_records = []
    person_mentioned_records = []
    institution_mentioned_records = []
    doc_records = []

    file_start_year = int(file[12:16])

    if start_year <= file_start_year <= end_year:
        volume = file[8:-4]

        tree = ET.parse(file)
        root = tree.getroot()

        docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)
        for doc in docs:
            person_sentby_dict_list, person_sentto_dict_list, person_mentioned_dict_list, institution_mentioned_dict_list, doc_dict = extract_document(doc, volume)
            person_sentby_records += person_sentby_dict_list
            person_sentto_records += person_sentto_dict_list
            person_mentioned_records += person_mentioned_dict_list
            institution_mentioned_records += institution_mentioned_dict_list
            doc_records.append(doc_dict)

    return (person_sentby_records, person_sentto_records, person_mentioned_records,
            institution_mentioned_records, doc_records)


# Serial extraction pass: parse each volume file in range and collect the
# per-document records into the notebook-wide accumulator lists.
for file in tqdm(glob.glob('volumes/frus*')):
    file_start_year = int(file[12:16])

    # volumes starting outside [start_year, end_year] are skipped
    if not (start_year <= file_start_year <= end_year):
        continue

    volume = file[8:-4]

    tree = ET.parse(file)
    root = tree.getroot()
    docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)

    for doc in docs:
        (person_sentby_dict_list,
         person_sentto_dict_list,
         person_mentioned_dict_list,
         institution_mentioned_dict_list,
         doc_dict) = extract_document(doc, volume)

        global_person_sentby_list.extend(person_sentby_dict_list)
        global_person_sentto_list.extend(person_sentto_dict_list)
        global_person_mentioned_list.extend(person_mentioned_dict_list)
        global_institution_mentioned_list.extend(institution_mentioned_dict_list)
        global_doc_list.append(doc_dict)


ray.shutdown()

# Export step, currently disabled by wrapping it in a bare string literal.
# NOTE(review): when global_institution_mentioned_list is empty (as shown in
# the output of the cell below), pd.DataFrame([]) has no columns, so the
# [['description_set','mentioned_in']] selection raises the KeyError shown in
# this cell's output. Passing columns=[...] to pd.DataFrame when re-enabling
# this code would keep the selection valid for empty lists.
'''doc_df = pd.DataFrame(global_doc_list)
person_sentby_df = pd.DataFrame(global_person_sentby_list)
person_sentto_df = pd.DataFrame(global_person_sentto_list)
person_mentioned_df = pd.DataFrame(global_person_mentioned_list)
instution_mentioned_df = pd.DataFrame(global_institution_mentioned_list)

doc_df.to_csv(tables_path+'doc.csv')
person_sentby_df.to_csv(tables_path+'person_sentby.csv')
person_sentto_df.to_csv(tables_path+'person_sentto.csv')

person_mentioned_df = person_mentioned_df[['person_name','mentioned_in']].drop_duplicates().reset_index(drop=True)
person_mentioned_df.to_csv(tables_path+'person_mentioned.csv')

instution_mentioned_df = instution_mentioned_df[['description_set','mentioned_in']].drop_duplicates().reset_index(drop=True)
instution_mentioned_df.to_csv(tables_path+'instution_mentioned.csv')'''

100%|██████████| 543/543 [07:32<00:00,  1.20it/s]


KeyError: "None of [Index(['description_set', 'mentioned_in'], dtype='object')] are in the [columns]"

In [7]:
global_institution_mentioned_list

[]

In [13]:
# ENTITY("COUNTRY_MENTIONED")

# Load the NER annotations together with the country and document tables.
ne_df = pd.read_csv('tables/columbia_ner_annotations.csv')
country_df = pd.read_csv('tables/tables_52_88/country.csv')
doc_df = pd.read_csv('tables/tables_52_88/doc.csv')

# Keep only the annotations whose label is one of the known country labels.
is_country_label = ne_df['itemLabel'].apply(
    lambda label: label in country_df['countryLabel'].values
)
filtered_ne_df = ne_df[is_country_label]

# helper
def reformat_file_name(temp_str):
    """Convert an annotation file name into the ``<volume>_<doc>`` id format.

    The last 8 characters are discarded, then an underscore is inserted in
    front of the last ``'d'`` of what remains.
    """
    stem = temp_str[:-8]
    split_at = stem.rfind('d')
    volume_part, doc_part = stem[:split_at], stem[split_at:]
    return "_".join((volume_part, doc_part))


# One (document, country) pair per row, with the file name rewritten to the
# document id format and the columns renamed to match the other tables.
country_mentioned_df = (
    filtered_ne_df[['file','itemLabel']]
    .drop_duplicates()
    .assign(file=lambda frame: frame['file'].apply(reformat_file_name))
    .rename(columns={'file':'id_to_text','itemLabel':'countryLabel'})
)

# only for part of data (69-76). not needed when whole data.
is_known_document = country_mentioned_df['id_to_text'].apply(
    lambda doc_id: doc_id in doc_df['id_to_text'].values
)
country_mentioned_df = country_mentioned_df[is_known_document]

country_mentioned_df = country_mentioned_df.reset_index(drop=True)
country_mentioned_df.to_csv('tables/tables_52_88/country_mentioned.csv')

In [9]:
# Scratch check: a list of dicts becomes a DataFrame with one column per key
# and one row per dict (see the output below).
pd.DataFrame([{'a':1,'b':2},{'a':3,'b':4}])

Unnamed: 0,a,b
0,1,2
1,3,4
