In [1]:
import re
import glob
import spacy
import math
import json
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

# Namespace prefix -> URI map handed to ElementTree's `findall` so the XPath
# expressions in later cells can use short prefixes (e.g. `dflt:div`)
# instead of spelling out full namespace URIs.
ns = {'xml': 'http://www.w3.org/XML/1998/namespace',
      'dflt': 'http://www.tei-c.org/ns/1.0',
      'frus':'http://history.state.gov/frus/ns/1.0',
      'xi':'http://www.w3.org/2001/XInclude'
      }

# spaCy pipeline (`en_core_web_sm` model), loaded once and reused wherever
# text needs to be parsed.
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


Source: frus1964-68v10.xml, line 197 — the editorial note describing how redactions are formatted:
<p>Bracketed insertions are also used to indicate omitted text that deals with an
                    unrelated subject (in roman type) or that remains classified after
                    declassification review (in italic type). The amount of material not
                    declassified has been noted by indicating the number of lines or pages of source
                    text that were omitted. Entire documents withheld for declassification purposes
                    have been accounted for and are listed by headings, source notes, and number of
                    pages not declassified in their chronological place. All brackets that appear in
                    the source text are so identified by footnotes.</p>

In [2]:
def extract_redaction(doc, volume):
    """Append every redaction marker found in one document to ``redaction_df``.

    A redaction marker is an italic ``hi`` element (``rend="italic"``) whose
    whitespace-normalised text contains the phrase ``not declassified``.
    One row is recorded per marker with the columns:

    * ``id_to_text`` -- ``<volume>_<xml:id of the document>``
    * ``text``       -- the normalised marker text
    * ``amount``     -- list of noun-chunk strings spaCy finds in that text

    Parameters
    ----------
    doc : xml.etree.ElementTree.Element
        The document element; must carry an ``xml:id`` attribute.
    volume : str
        Volume identifier used as the prefix of ``id_to_text``.

    Returns
    -------
    None
        The module-level ``redaction_df`` is rebound to a frame that includes
        the new rows. It is left untouched when the document has no markers.

    Raises
    ------
    KeyError
        If ``doc`` has no ``xml:id`` attribute.
    """
    global redaction_df

    id_to_text = volume + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # Collect rows locally and concatenate once per document: growing the
    # global frame with one `pd.concat` per marker copies it every time.
    rows = []
    for el in doc.findall('.//dflt:hi[@rend="italic"]', ns):
        # Join nested text nodes, then collapse newlines / runs of whitespace.
        redaction_text = " ".join("".join(el.itertext()).split())
        if 'not declassified' not in redaction_text:
            continue

        # Use a separate name for the parsed text so the `doc` argument
        # (the XML element) is not shadowed by the spaCy result.
        parsed = nlp(redaction_text)
        rows.append({
            'id_to_text': id_to_text,
            'text': redaction_text,
            'amount': [chunk.text for chunk in parsed.noun_chunks],
        })

    if rows:
        redaction_df = pd.concat((redaction_df, pd.DataFrame(rows)),
                                 ignore_index=True)


# Start from an empty frame; `extract_redaction` appends to this global.
redaction_df = pd.DataFrame(columns=['id_to_text','text','amount'])


# Only process volumes whose file name starts with this prefix.
volume_root = 'frus1969-76'

# `glob.glob` returns matches in arbitrary order, so sort them to make the
# row order of the saved table reproducible across runs and machines.
for file in sorted(glob.glob('volumes/'+volume_root+'*')):
    # Volume id = file name without the 'volumes/' prefix and '.xml' suffix.
    volume = file[len('volumes/'):-len('.xml')]

    tree = ET.parse(file)
    root = tree.getroot()

    # Every <div type="document"> anywhere under text/body is one document.
    docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)

    for doc in docs:
        extract_redaction(doc,volume)


redaction_df.to_parquet('tables/redaction_69_76.parquet')

In [3]:
# Show the collected redactions (one row per redaction marker found).
redaction_df

Unnamed: 0,id_to_text,text,amount
0,frus1969-76v14_d86,1 line of source text not declassified,"[1 line, source text]"
1,frus1969-76v14_d92,name not declassified,[name]
2,frus1969-76v14_d99,name not declassified,[name]
3,frus1969-76v14_d118,name not declassified,[name]
4,frus1969-76v14_d122,name not declassified,[name]
...,...,...,...
8305,frus1969-76v25_d412,2½ lines not declassified,[2½ lines]
8306,frus1969-76v25_d412,less than 1 line not declassified,[less than 1 line]
8307,frus1969-76v25_d412,less than 1 line not declassified,[less than 1 line]
8308,frus1969-76v25_d412,3 lines not declassified,[3 lines]
