In [1]:
import re
import glob
import spacy
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as ET

# Namespace prefixes used by the ElementTree path queries in this notebook.
ns = dict(
    xml='http://www.w3.org/XML/1998/namespace',
    dflt='http://www.tei-c.org/ns/1.0',
    frus='http://history.state.gov/frus/ns/1.0',
    xi='http://www.w3.org/2001/XInclude',
)

# spaCy pipeline used below for noun chunks, part-of-speech tags and lemmas.
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


Note on the redaction format, quoted from frus1964-68v10.xml (line 197):
<p>Bracketed insertions are also used to indicate omitted text that deals with an
                    unrelated subject (in roman type) or that remains classified after
                    declassification review (in italic type). The amount of material not
                    declassified has been noted by indicating the number of lines or pages of source
                    text that were omitted. Entire documents withheld for declassification purposes
                    have been accounted for and are listed by headings, source notes, and number of
                    pages not declassified in their chronological place. All brackets that appear in
                    the source text are so identified by footnotes.</p>

### PART 1: EXTRACT REDACTIONS

In [2]:
def extract_redaction(doc, volume):
    """Append the redaction notes found in one document to the global ``redaction_df``.

    A redaction note is an italic ``hi`` element (``rend="italic"``) whose
    whitespace-normalised text contains the phrase 'not declassified'.
    One row is added per note with the columns ``id_to_text``, ``raw_text``
    and ``detected_chunks`` (texts of the noun chunks spaCy finds in the note).

    Parameters
    ----------
    doc : element with ``attrib`` and ``findall`` (an ElementTree element in this notebook)
        The document to scan; it must carry an ``xml:id`` attribute.
    volume : str
        Volume identifier, used as the prefix of ``id_to_text``.

    Returns
    -------
    None
        The result is written by rebinding the module-level ``redaction_df``.

    Raises
    ------
    KeyError
        If ``doc`` has no ``xml:id`` attribute.
    """
    global redaction_df

    # id: "<volume>_<xml:id of the document>"
    id_to_text = volume + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # Collect all rows first so the frame is rebuilt at most once per document
    # instead of once per matching element.
    rows = []
    for el in doc.findall('.//dflt:hi[@rend="italic"]', ns):
        # Join nested text fragments, then collapse newlines / runs of whitespace.
        temp_txt = " ".join("".join(el.itertext()).split())
        if 'not declassified' not in temp_txt:  # not a redaction note
            continue

        # Parsed under its own name so the `doc` parameter is not shadowed.
        parsed = nlp(temp_txt)
        rows.append({
            'id_to_text': id_to_text,
            'raw_text': temp_txt,
            'detected_chunks': [chunk.text for chunk in parsed.noun_chunks],
        })

    if rows:
        redaction_df = pd.concat((redaction_df, pd.DataFrame(rows)), ignore_index=True)


# Start from an empty table so re-running this cell does not duplicate rows.
redaction_df = pd.DataFrame(columns=['id_to_text','raw_text','detected_chunks'])


# Only volumes whose file name starts with this prefix are processed.
volume_root = 'frus1969-76'
volume_dir = 'volumes/'

# glob returns files in arbitrary order; sorting keeps the row order (and the
# row-number-based ids derived from it later) reproducible between runs.
for file in sorted(glob.glob(volume_dir + volume_root + '*')):
    # Volume id = file name without the directory prefix and the '.xml' suffix.
    volume = file[len(volume_dir):-len('.xml')]

    tree = ET.parse(file)
    root = tree.getroot()

    docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)

    for doc in docs:
        extract_redaction(doc,volume)

### PART 2: PROCESS EXTRACTED REDACTIONS

In [3]:
# Tally how often each noun lemma occurs across all extracted redaction notes.
type_dict = {}

for raw_note in tqdm(redaction_df['raw_text']):

    # '½' is stripped before parsing because this symbol is problematic.
    parsed_note = nlp(raw_note.replace('½', ''))

    for tok in parsed_note:
        if tok.pos_ != 'NOUN':
            continue
        type_dict[tok.lemma_] = type_dict.get(tok.lemma_, 0) + 1

100%|██████████| 8310/8310 [00:32<00:00, 254.54it/s]


In [4]:
# For every redaction note, pick one noun chunk and split it into a redaction
# type (noun / proper-noun lemmas) and an amount (number-like tokens).
type_col = []
amount_col = []

for idx,temp_text in enumerate(tqdm(redaction_df['raw_text'])):

    # Handle parentheses: if the note contains exactly one parenthesised span,
    # only the content of that span is analysed.
    result = re.findall(r'\((.*?)\)',temp_text)
    if len(result)==1:
        temp_text = result[0]
    elif len(result)>1: # for debugging purposes
        print(f'ALERT about paranthesis use: {idx},{temp_text}')

    temp_text = temp_text.replace('½', '') # this symbol is problematic
    doc = nlp(temp_text)

    # Select the most relevant noun chunk: the only one if there is just one,
    # otherwise the first chunk containing the noun with the highest corpus
    # frequency in type_dict.
    noun_chunks = list(doc.noun_chunks)
    chunk = None
    if len(noun_chunks)==1:
        chunk = noun_chunks[0]
    elif len(noun_chunks)>1:
        max_count = -1

        for temp_chunk in noun_chunks:
            for token in temp_chunk:
                if token.pos_ == 'NOUN':
                    # The text parsed here may differ from the text that was
                    # counted (parenthesis extraction above), so a lemma can be
                    # missing from type_dict; treat it as frequency 0.
                    temp_count = type_dict.get(token.lemma_, 0)

                    if temp_count > max_count:
                        max_count = temp_count
                        chunk = temp_chunk

    # Separate type and amount, if possible.
    if chunk is None:
        type_col.append(None)
        amount_col.append(None)
    else:
        type_ = ''
        amount = ''
        for token in chunk:
            if token.like_num:
                # NOTE: several number-like tokens are concatenated without a
                # separator (e.g. '2' and '1/2' become '21/2').
                amount += token.text
            elif token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
                type_ += token.lemma_

        type_col.append(type_)
        amount_col.append(amount)

redaction_df['type_col'] = type_col
redaction_df['amount_col'] = amount_col

def aux(x):
    """Convert an amount string such as '3', '2.5' or '1/2' to a float.

    The text originates from the XML sources, so it is parsed as a number
    instead of being passed to ``eval``.

    Parameters
    ----------
    x : str or None
        Amount text; anything that is not a string is treated as missing.

    Returns
    -------
    float or None
        The numeric value, or None when ``x`` is not a string, is empty, or
        is not a plain integer / decimal / fraction (e.g. 'two', '1/0').
    """
    from fractions import Fraction  # local import keeps this cell self-contained

    if not isinstance(x, str):
        return None
    try:
        return float(Fraction(x))
    except (ValueError, ZeroDivisionError, OverflowError):
        return None
    
# Convert the collected amount strings to floats (None where not parseable).
redaction_df['amount_col'] = redaction_df['amount_col'].apply(aux)


# Expose the row number as 'redaction_id'. Guarded so that running this cell
# again does not add a second 'redaction_id' column (duplicate column names).
if 'redaction_id' not in redaction_df.columns:
    redaction_df = redaction_df.reset_index(drop=False).rename(columns={'index':'redaction_id'})
redaction_df.to_parquet('tables/redaction_69_76.parquet')

100%|██████████| 8310/8310 [00:33<00:00, 249.97it/s]
