In [1]:
import glob
import re
import xml.etree.ElementTree as ET
from fractions import Fraction

import pandas as pd
import ray
import spacy
from tqdm import tqdm

# XML namespace prefixes used in the ElementTree queries on FRUS volumes
ns = dict(
    xml='http://www.w3.org/XML/1998/namespace',
    dflt='http://www.tei-c.org/ns/1.0',
    frus='http://history.state.gov/frus/ns/1.0',
    xi='http://www.w3.org/2001/XInclude',
)

# directory where the extracted tables are written
tables_path = 'tables/tables_52_88_demo/'

# inclusive range of volume start years to process
start_year = 1952
end_year = 1958

# spaCy pipeline used below for noun chunks, POS tags and lemmas
nlp = spacy.load('en_core_web_sm')

### PART 1: EXTRACT REDACTIONS

In [2]:
# helper function 1 step 1
# helper function for parsing redactions in a document
@ray.remote
def extract_redaction(doc, volume):
    """Collect the redaction notes found in a single document element.

    A redaction note is an italic ``hi`` element (``rend="italic"``) whose
    whitespace-normalised text contains the phrase 'not declassified'.

    Parameters
    ----------
    doc : xml.etree.ElementTree.Element
        Document element; it must carry an ``xml:id`` attribute.
    volume : str
        Volume identifier, used as the prefix of the document id.

    Returns
    -------
    list of dict
        One dict per redaction note with the keys 'id_to_text'
        (``<volume>_<xml:id>``), 'raw_text' (normalised note text) and
        'detected_chunks' (texts of the noun chunks spaCy finds in the note).

    Raises
    ------
    KeyError
        If ``doc`` has no ``xml:id`` attribute.
    """
    doc_redaction_list = []

    # id
    id_to_text = volume + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # redaction text and amount
    for el in doc.findall('.//dflt:hi[@rend="italic"]', ns):
        # join all nested text pieces and collapse newlines / repeated whitespace
        temp_txt = " ".join("".join(el.itertext()).split())

        # plain substring test; the phrase contains no regex metacharacters
        if 'not declassified' not in temp_txt:
            continue

        # keep the parsed text under its own name so the `doc` parameter
        # (the XML element) is not overwritten while iterating
        parsed = nlp(temp_txt)
        chunks = [chunk.text for chunk in parsed.noun_chunks]

        doc_redaction_list.append({'id_to_text':id_to_text,'raw_text':temp_txt,'detected_chunks':chunks})

    return doc_redaction_list

In [3]:
# initialize parallel operation
ray.init(num_cpus=13)

# variables to merge information from all volumes
global_redaction_list = []

try:
    # main loop over all volumes; sorted so that row order (and therefore the
    # redaction ids derived from it later) does not depend on the file system
    for file in sorted(glob.glob('volumes/frus*')):
        # the four characters right after the 'volumes/frus' prefix are read
        # as the volume's start year
        file_start_year = int(file[12:16])

        # skip volumes outside the configured (inclusive) year range
        if not (start_year <= file_start_year <= end_year):
            continue

        # volume id: path without the 'volumes/' prefix and without its last
        # four characters (presumably the '.xml' extension -- TODO confirm)
        volume = file[8:-4]

        root = ET.parse(file).getroot()

        # find all documents in volume
        docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)

        # one remote task per document; ray.get blocks until all are done
        futures = [extract_redaction.remote(doc, volume) for doc in docs]

        # extend in place: sum(list_of_lists, []) would copy the accumulated
        # list once per document
        for doc_redactions in ray.get(futures):
            global_redaction_list.extend(doc_redactions)
finally:
    # close parallel processes, also when a volume fails to parse
    ray.shutdown()

# convert results to pd dataframe
redaction_df = pd.DataFrame(global_redaction_list)



### PART 2: PROCESS EXTRACTED REDACTIONS

In [5]:
# tally how often each noun lemma occurs over all raw redaction texts
type_dict = {}

for raw_text in tqdm(redaction_df['raw_text']):
    # this symbol is problematic, remove it
    cleaned_text = raw_text.replace('½', '')

    for tok in nlp(cleaned_text):
        # only tokens tagged as common nouns are counted
        if tok.pos_ != 'NOUN':
            continue
        type_dict[tok.lemma_] = type_dict.get(tok.lemma_, 0) + 1

100%|██████████| 4517/4517 [00:18<00:00, 247.53it/s]


In [9]:
# For every redaction, pick the most relevant noun chunk and split it into a
# redaction type (noun lemmas) and an amount (number-like tokens, still text).
type_col = []
amount_col = []

for idx,temp_text in enumerate(tqdm(redaction_df['raw_text'])):

    # find parenthesised spans in the raw redaction text, if any
    # (raw string: '\(' is not a valid escape in a normal string literal)
    result = re.findall(r'\((.*?)\)',temp_text)
    # exactly one parenthesis: analyse only its content
    if len(result)==1:
        temp_text = result[0]
    # several parentheses: the full text is kept and the row is printed so
    # that it can be resolved by hand
    elif len(result)>1:
        print(f'Untidy reduction format. Multi-paranthesis use in row {idx}: {temp_text}')

    # this symbol is problematic, remove it
    temp_text = temp_text.replace('½', '')
    doc = nlp(temp_text)

    # materialise the noun chunks once instead of rebuilding the list per check
    noun_chunks = list(doc.noun_chunks)

    # select most relevant noun chunk
    chunk = None
    if len(noun_chunks)==1:
        chunk = noun_chunks[0]
    elif len(noun_chunks)>1:
        # choose the chunk containing the noun with the highest corpus count
        max_count = -1

        for temp_chunk in noun_chunks:
            for token in temp_chunk:
                if token.pos_ == 'NOUN':
                    # type_dict was built from the full raw texts, while this
                    # text may be only the parenthesised part, so the lemma is
                    # not guaranteed to be a key; unknown lemmas count as 0
                    temp_count = type_dict.get(token.lemma_, 0)

                    if temp_count > max_count:
                        max_count = temp_count
                        chunk = temp_chunk

    # separate type and amount for selected noun chunk, if possible
    if chunk is None:
        type_col.append(None)
        amount_col.append(None)
    else:
        type_ = ''
        amount = ''
        for token in chunk:
            # number-like tokens and noun lemmas are each concatenated
            # without a separator
            if token.like_num:
                amount += token.text
            elif token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
                type_ += token.lemma_

        type_col.append(type_)
        amount_col.append(amount)

redaction_df['type_col'] = type_col
redaction_df['amount_col'] = amount_col

# helper function 1 step 2
# converts a str to float if possible
def convert_str2float(x):
    """Convert a numeric string to ``float``; return ``None`` if not possible.

    Accepted forms are the ones understood by ``fractions.Fraction``:
    integers ('13'), decimals ('1.5') and simple fractions ('1/2').
    The text is parsed, never evaluated, so expressions such as '2**3' or
    names such as 'True' yield ``None`` instead of being executed.

    Parameters
    ----------
    x : str or None
        Candidate amount text. Non-string values yield ``None``.

    Returns
    -------
    float or None
        Parsed value, or ``None`` for empty, non-numeric or non-string input
        and for a zero denominator.
    """
    if not isinstance(x, str):
        return None
    try:
        return float(Fraction(x))
    except (ValueError, ZeroDivisionError, OverflowError):
        # ValueError: not a number; ZeroDivisionError: 'n/0';
        # OverflowError: value too large for a float
        return None
    
# parse the extracted amount strings into floats (None where parsing fails)
redaction_df['amount_col'] = redaction_df['amount_col'].apply(convert_str2float)

# expose the row index as an explicit 'redaction_id' column, then persist the table
redaction_df = (
    redaction_df
    .reset_index(drop=False)
    .rename(columns={'index':'redaction_id'})
)
redaction_df.to_parquet(tables_path+'redaction.parquet')

 34%|███▍      | 1551/4517 [00:06<00:11, 256.08it/s]

Ragged reduction format. Multi-paranthesis use in 1520,1 paragraph (13 lines of source text) and footnote (4 lines of source text) not declassified


 36%|███▌      | 1634/4517 [00:06<00:11, 257.97it/s]

Ragged reduction format. Multi-paranthesis use in 1591,item (b) (47 words) not declassified; President’s comments on item (b) from memorandum by Goodpaster attached to Wilson’s memorandum not declassified


 41%|████      | 1842/4517 [00:07<00:10, 254.26it/s]

Ragged reduction format. Multi-paranthesis use in 1808,Subparagraph (b) (2½ lines of source text) not declassified
Ragged reduction format. Multi-paranthesis use in 1841,2 paragraphs (14½ lines of source text) and footnote (6 lines of text) not declassified


 78%|███████▊  | 3525/4517 [00:13<00:03, 260.11it/s]

Ragged reduction format. Multi-paranthesis use in 3476,3 paragraphs (29 lines of source text) and footnote (3 lines of text) not declassified


 80%|████████  | 3630/4517 [00:13<00:03, 251.34it/s]

Ragged reduction format. Multi-paranthesis use in 3591,Subparagraph (3) (1½ lines of source text) not declassified


 96%|█████████▌| 4335/4517 [00:16<00:00, 248.05it/s]

Ragged reduction format. Multi-paranthesis use in 4308,Numbered paragraph (6) (61/2 lines of source text) not declassified
Ragged reduction format. Multi-paranthesis use in 4357,Paragraph (C) (14½ lines of source text) not declassified


 98%|█████████▊| 4414/4517 [00:16<00:00, 243.82it/s]

Ragged reduction format. Multi-paranthesis use in 4377,Subparagraphs (d) and (e) (81/2 lines of source text) not declassified
Ragged reduction format. Multi-paranthesis use in 4410,paragraphs 18–a (4 lines of source text) and 18–a–l (31/2 lines of source text) not declassified
Ragged reduction format. Multi-paranthesis use in 4413,paragraphs 19–a (11/2 lines of source text) and 19–a–l (31/2 lines of source text) not declassified


100%|██████████| 4517/4517 [00:17<00:00, 260.15it/s]
