In [27]:
import re
import glob
import spacy
import math
import json
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

ns = {'xml': 'http://www.w3.org/XML/1998/namespace',
      'dflt': 'http://www.tei-c.org/ns/1.0',
      'frus':'http://history.state.gov/frus/ns/1.0',
      'xi':'http://www.w3.org/2001/XInclude'
      }

nlp = spacy.load('en_core_web_sm')

frus1964-68v10.xml line 197 on redaction format
<p>Bracketed insertions are also used to indicate omitted text that deals with an
                    unrelated subject (in roman type) or that remains classified after
                    declassification review (in italic type). The amount of material not
                    declassified has been noted by indicating the number of lines or pages of source
                    text that were omitted. Entire documents withheld for declassification purposes
                    have been accounted for and are listed by headings, source notes, and number of
                    pages not declassified in their chronological place. All brackets that appear in
                    the source text are so identified by footnotes.</p>

In [2]:
def extract_redaction(doc, volume):

    global redaction_df

    # id
    id_to_text = volume + '_' + doc.attrib['{http://www.w3.org/XML/1998/namespace}id']

    # redaction text and amount
    for el in doc.findall('.//dflt:hi[@rend="italic"]', ns):
        temp_txt = "".join(el.itertext())
        temp_txt = " ".join(temp_txt.split()) # remove \n
        if re.search('not declassified',temp_txt): # if redaction identified

            chunks = []
            doc = nlp(temp_txt)
            for chunk in doc.noun_chunks:
                chunks.append("".join(chunk.text))
                

            redaction_df = pd.concat((redaction_df, pd.DataFrame({'id_to_text':[id_to_text],'text':[temp_txt],'amount':[chunks]
                                                    })),ignore_index=True)


redaction_df = pd.DataFrame(columns=['id_to_text','text','amount'])


# only use documents within this years
volume_root = 'frus1969-76'

for file in glob.glob('volumes/'+volume_root+'*'):
#for file in glob.glob('volumes/'+volume_root+'v30.xml'):
    volume = file[8:-4]

    tree = ET.parse(file)
    root = tree.getroot()

    docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)

    for doc in docs:
        extract_redaction(doc,volume)


redaction_df.to_parquet('tables/redaction_69_76.parquet')

### below is experimental.

In [28]:
redaction_df = pd.read_parquet('tables/redaction_69_76.parquet')

In [50]:
redaction_df.sample(20)

Unnamed: 0,id_to_text,text,amount
1011,frus1969-76ve15p2Ed2_d155,less than 1 line not declassified,[less than 1 line]
3545,frus1969-76ve16_d42,2 lines not declassified,[2 lines]
2524,frus1969-76ve10_d592,text not declassified,[text]
4484,frus1969-76v21_d43,name not declassified,[name]
4025,frus1969-76ve16_d130,dollar amount not declassified,[dollar amount]
2559,frus1969-76ve06_d137,text not declassified,[text]
2010,frus1969-76ve09p1_d93,text not declassified,[text]
7426,frus1969-76v26_d254,3 lines not declassified,[3 lines]
5233,frus1969-76v35_d51,1 paragraph (7½ lines) not declassified,"[1 paragraph, (7½ lines]"
8230,frus1969-76v25_d208,less than 1 line not declassified,[less than 1 line]


In [57]:
# if () occurs, use value inside for detection
# if more than one noun chunks occur, use the most frequent one
# use token.like_num

In [119]:
import spacy
from tqdm import tqdm

nlp = spacy.load('en_core_web_sm')

In [120]:
# count redaction type's frequencies
type_dict = {}

for idx,temp_text in enumerate(tqdm(redaction_df['text'])):

    temp_text = temp_text.replace('½', '') # this symbol is problematic
    doc = nlp(temp_text)

    for token in doc:
        if token.pos_ == 'NOUN':
            cnt = type_dict.get(token.lemma_,0)+1
            type_dict[token.lemma_] = cnt

100%|██████████| 8310/8310 [00:32<00:00, 257.88it/s]


In [152]:
type_col = []
amount_col = []

for idx,temp_text in enumerate(tqdm(redaction_df['text'])):

    # handle paranthesis
    result = re.findall('\((.*?)\)',temp_text)
    if len(result)==1:
        temp_text = result[0]
    elif len(result)>1: # for debugging purposes
        print(f'ALERT about paranthesis use: {idx},{temp_text}')

    temp_text = temp_text.replace('½', '') # this symbol is problematic
    doc = nlp(temp_text)

    # select most relevant noun chunk
    chunk = None
    if len(list(doc.noun_chunks))==1:
        chunk = list(doc.noun_chunks)[0]
    elif len(list(doc.noun_chunks))>1:
        max_count = -1 

        for temp_chunk in doc.noun_chunks:
            for token in temp_chunk:
                if token.pos_ == 'NOUN':
                    temp_count = type_dict[token.lemma_]

                    if temp_count > max_count:
                        max_count = temp_count
                        chunk = temp_chunk

    # separate type and amount, if possible
    if chunk is None:
        type_col.append(None)
        amount_col.append(None)
    else:       
        type_ = ''
        amount = ''
        for token in chunk:
            if token.like_num:
                amount += token.text
            elif token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
                type_ += token.lemma_   
        
        type_col.append(type_)
        amount_col.append(amount)

redaction_df['type_col'] = type_col
redaction_df['amount_col'] = amount_col

100%|██████████| 8310/8310 [00:33<00:00, 246.23it/s]


In [193]:
redaction_df['type_col'].value_counts()

line                        5420
text                        1085
dollaramount                 775
name                         591
number                        99
restriction                   92
codeword                      59
page                          39
datum                          8
place                          6
documentnumber                 6
chart                          6
sourcetext                     4
cablenumber                    4
messagenumber                  4
table                          4
classification                 3
time                           3
dollar                         3
Name                           3
paragraph                      3
codename                       2
linecomment                    2
filename                       2
second                         2
date                           2
information                    2
thaline                        1
title                          1
Creteline                      1
V.line    

In [236]:
def aux(x):
    try:
        return int(eval(x))
    except:
        return x
    
temp_col = redaction_df['amount_col'].apply(lambda x: aux(x))

In [240]:
pd.Series(map(lambda x: isinstance(x,int),temp_col)).value_counts()

True     5464
False    2846
dtype: int64

In [85]:
import copy
copy_type_dict = copy.deepcopy(type_dict)
copy_copy_type_dict = copy.deepcopy(type_dict)
min_type_count = 5

for key in copy_type_dict:
    
    if copy_copy_type_dict[key] < min_type_count:
        del copy_copy_type_dict[key]

In [190]:
pd.set_option('display.max_rows', 60)