In [49]:
import os
import re
import json
from dataScience.src.text_handling.process import preprocess
from tqdm import tqdm

In [50]:
# Directory that is listed for the corpus JSON files; the path is relative,
# so it resolves against the notebook's current working directory.
corpus_path = "./../../../../../corpus/dod_corp/"

In [3]:
# Preview the first eleven entries of the corpus directory.
for i, path in enumerate(os.listdir(corpus_path)[:11]):
    print(path)

DoDI 1000.04.json
DoDM 1348.33 Volume 1 CH 2.json
DoDI 1444.02 Volume 3 CH 1.json
DoDI 1400.25 Volume 1403.json
DoDI 1005.14 CH 1.json
DoDI 1342.15 CH 1.json
DoDI 1402.03 VOLUME 2 CH 2.json
DoDI 1300.25 CH 1.json
DoDI 3002.03 CH 1.json
DoDI 1341.09.json
DoDI 5040.02 CH 2.json


In [53]:
# Snapshot of the corpus directory listing, reused by the cells below.
paths = list(os.listdir(corpus_path))

In [55]:
# Distinct first whitespace-separated tokens of the filenames, sorted.
doc_types = sorted({file_name.split()[0] for file_name in paths})
doc_types

['DoD', 'DoDD', 'DoDI', 'DoDM']

In [62]:
# Distinct first four-digit runs found in the filenames, sorted.
# The raw string keeps "\d" out of Python's own escape processing, and
# re.DOTALL is dropped because the pattern contains no "." wildcard.
doc_nums = sorted({re.findall(r"\d{4}", path)[0] for path in paths})

In [76]:
# Distinct digit runs that follow the first "." in each filename, sorted.
# The raw string keeps "\." and "\d" out of Python's own escape processing, and
# re.DOTALL is dropped because the pattern contains no "." wildcard.
doc_subs = sorted({re.findall(r"\.\d+", path)[0][1:] for path in paths})

In [114]:
def find_meta(path):
    """Extract issuance metadata from a corpus filename.

    Example: ``"DoDI 1402.03 VOLUME 2 CH 2.json"`` yields type ``"DoDI"``,
    number ``"1402"``, series ``"1"``, subseries ``"14"``, issuance ``"03"``,
    volume ``"2"`` and chapter ``"2"``.

    Args:
        path: Filename (not a full path) of a corpus document.

    Returns:
        dict with the keys ``path``, ``type``, ``series``, ``subseries``,
        ``issuance``, ``number``, ``volume`` and ``chapter``. All values are
        strings; ``volume`` and ``chapter`` are ``None`` when the filename
        carries no such marker.

    Raises:
        IndexError: if the filename is empty/whitespace, has no four-digit
            run, or has no ``.<digits>`` part.
    """
    doc_type = path.split()[0]
    # Raw strings keep the regex escapes out of Python's string-escape
    # processing (a non-raw "\d" is an invalid escape sequence).
    doc_number = re.findall(r"\d{4}", path)[0]
    doc_series = doc_number[0]
    doc_subseries = doc_number[:2]
    doc_issuance = re.findall(r"\.\d+", path)[0][1:]

    path_l = path.lower()
    # The negative lookbehind stops the markers from matching inside a longer
    # word (e.g. the "ch 3" hidden in "branch 3").
    vol_match = re.search(r"(?<![a-z])volume (\d+)", path_l)
    doc_vol = vol_match.group(1) if vol_match else None
    chp_match = re.search(r"(?<![a-z])ch (\d+)", path_l)
    doc_chp = chp_match.group(1) if chp_match else None

    doc_meta = {
        "path": path,
        "type": doc_type,
        "series": doc_series,
        "subseries": doc_subseries,
        "issuance": doc_issuance,
        "number": doc_number,
        "volume": doc_vol,
        "chapter": doc_chp
    }
    return doc_meta

In [116]:
# Spot-check find_meta on one entry of the directory listing.
# The entry at a given index depends on the order os.listdir returned.
idx = 150
find_meta(paths[idx])

{'path': 'DoDD 5124.10.json',
 'type': 'DoDD',
 'series': '5',
 'subseries': '51',
 'issuance': '10',
 'number': '5124',
 'volume': None,
 'chapter': None}

In [122]:
def directory_parse(corpus_path):
    """Group the filenames in ``corpus_path`` into a nested metadata tree.

    Every entry returned by ``os.listdir`` is passed to ``find_meta`` and the
    filename is filed under
    ``tree[type][series][subseries][number][issuance]``, which is a list of
    filenames.

    Args:
        corpus_path: Directory whose entries are parsed.

    Returns:
        The nested dictionary described above.
    """
    tree = {}
    for file_name in os.listdir(corpus_path):
        meta = find_meta(file_name)
        branch_keys = (
            meta["type"],
            meta["series"],
            meta["subseries"],
            meta["number"],
        )
        # Walk down the four dictionary levels, creating them on first use.
        node = tree
        for branch_key in branch_keys:
            node = node.setdefault(branch_key, {})
        # The innermost level collects the filenames sharing one issuance.
        node.setdefault(meta["issuance"], []).append(meta["path"])
    return tree

In [123]:
# Build the nested filename tree and save it as JSON in the current working
# directory.
dir_parse = directory_parse(corpus_path)
with open("dir_parse.json", "w") as fp:
    json.dump(dir_parse, fp)

In [5]:
# Builds the joined path for every corpus entry; only the last `path` and
# `full_path` remain bound once the loop finishes.
# NOTE(review): nothing is opened or loaded here, yet later cells read `data`
# - presumably a load step was removed from this cell; confirm.
for path in tqdm(os.listdir(corpus_path)):
    full_path = os.path.join(corpus_path, path)

100%|██████████| 1180/1180 [00:00<00:00, 440170.64it/s]


In [6]:
# Show the raw text of paragraph 11 of the document currently held in `data`
# (`data` must have been loaded by another cell).
data["paragraphs"][11]['par_raw_text_t']

'e . Authorizes the establishment of the Defense VI ( DVI ) Directorate and the Defense Imagery Management Operations Center ( DIMOC ) .'

In [7]:
# Keep the raw text of paragraph 11 in `s` and display it.
s = data["paragraphs"][11]["par_raw_text_t"]
s

'e . Authorizes the establishment of the Defense VI ( DVI ) Directorate and the Defense Imagery Management Operations Center ( DIMOC ) .'

In [33]:
# Header formats

### PATTERN SET 1 ###
# "DoDI 1444.02 Volume 3 CH 1.json"
# [ 1. PURPOSE ] [ b. ] [ (3) ]
# Raw strings keep the regex escapes (\d, \s, \., \() out of Python's own
# string-escape processing; the pattern text itself is unchanged.
pattern_1 = r"\d+\s?\.\s?[A-Z()& ]{3,}(?<! )"  # numbered upper-case heading, e.g. "1 .PURPOSE"
pattern_2 = r"[a-z]\s?\.\s(?=[A-Z])"  # lettered item such as "a . " before a capital
pattern_3 = r"\(\s\d\s\)(?= [A-Z])"  # "( 3 )" followed by a space and a capital
pattern_4 = r"\(\s[a-z]\s\)(?= [A-Z])"  # "( a )" followed by a space and a capital
#pattern_4 = "\[a-z]\s.(?= [A-Z])"

# Order matters: the position of a pattern in the list is used as its
# hierarchy level by the parsing cells.
# NOTE(review): pattern_4 is defined above but is not part of this set -
# confirm that is intended.
pattern_set_1 = [
    pattern_1,
    pattern_2,
    pattern_3
]

In [129]:
### PATTERN SET 2 ###
# Raw strings keep the regex escapes out of Python's own string-escape
# processing; the pattern text itself is unchanged.
pattern_1 = r"[A-Z]{3,}\s\d\s\:\s[A-Z ]{3,}(?<! )"  # e.g. "SECTION 1 : GENERAL"
pattern_2 = r"\d\.\d?\s\.\s[A-Z,()& ]{3,}(?<! )"  # e.g. "4.2 . MTF"
pattern_3 = r"\ [a-z]\ \.(?=[A-Z])"  # " a ." directly followed by a capital
pattern_4 = r"\(\s\d\s\)(?=.)(?= [A-Z])"  # "( 3 )" followed by a space and a capital
pattern_5 = r"\(\s[a-z]\s\)(?=.)(?= [A-Z])"  # "( a )" followed by a space and a capital

# Order matters: the position of a pattern in the list is used as its
# hierarchy level by the parsing cells.
pattern_set_2 = [
    pattern_1,
    pattern_2,
    pattern_3,
    pattern_4,
    pattern_5
]

In [None]:
### PATTERN SET 3 ###
# Raw strings keep the regex escapes out of Python's own string-escape
# processing; the pattern text itself is unchanged.
pattern_1 = r"[A-Z]{3,}\s\d\s\:\s[A-Z ]{3,}(?<! )"  # e.g. "SECTION 1 : GENERAL"
pattern_2 = r"\d\.\d?\s\.\s[A-Z,()& ]{3,}(?<! )"  # e.g. "4.2 . MTF"
pattern_3 = r"\ [a-z]\ \.(?=[A-Z])"  # " a ." directly followed by a capital
pattern_4 = r"\(\s\d\s\)(?=.)(?= [A-Z])"  # "( 3 )" followed by a space and a capital
pattern_5 = r"\(\s[a-z]\s\)(?=.)(?= [A-Z])"  # "( a )" followed by a space and a capital

# NOTE(review): pattern_5 is defined above but is not part of this set -
# confirm that is intended.
pattern_set_3 = [
    pattern_1,
    pattern_2,
    pattern_3,
    pattern_4,
]

In [46]:
class Hieharchy(object):
    """Nested-dictionary store addressed by a list of keys.

    The tree lives in ``self.dictionary``; each level is a plain dict and the
    value for a key path is kept under that node's ``"text"`` entry.
    """

    def __init__(self):
        self.dictionary = dict()

    def update(self, keys, value):
        """Store ``value`` as the ``"text"`` of the node reached via ``keys``.

        Missing intermediate nodes are created as empty dicts. With an empty
        ``keys`` the value is written at the top level of ``self.dictionary``.
        """
        node = self.dictionary
        for name in keys:
            # Descend one level, creating the child on first visit.
            node = node.setdefault(name, {})
        node["text"] = value

In [47]:
# Test documents
# Each assignment overwrites the previous one, so only the last uncommented
# line decides which document `path` names; reorder or comment lines out to
# pick another.

path = "DoDI 1444.02 Volume 3 CH 1.json"
path = "DoDI 1000.04.json"
path = "DoDM 1348.33 Volume 1 CH 2.json"
#path = "DoDI 1402.03 VOLUME 2 CH 2.json"

In [133]:
# Show the title of the document currently held in `data`.
data["title"]

'Joint Medical Executive Skills Development Program'

In [135]:
# Parse every corpus document into a nested header tree (see Hieharchy) and
# write it to the output folder, together with a filename -> title index.
# Defined before the loop so the final metadata dump also works when the
# corpus directory is empty.
new_folder = "./../../../../../corpus/sentparse/"
# Make sure the output folder exists before anything is written into it.
os.makedirs(new_folder, exist_ok=True)

doc_name_dir = {}
for path in tqdm(os.listdir(corpus_path)):
    # Load the document once; its contents do not depend on the pattern set.
    with open(os.path.join(corpus_path, path), "r") as fp:
        data = json.load(fp)

    doc_name_dir[path] = data["title"]

    # Output name: the input name with its last extension replaced.
    parsed_path = ".".join(path.split(".")[:-1])+ "_parsed.json"
    parsed_path = os.path.join(new_folder, parsed_path)

    best_data = None
    for pattern_list in [pattern_set_1, pattern_set_2]:
        # One alternation inside a capture group, so re.split keeps the
        # matched headers in its result.
        pattern_str = "(" + "|".join(pattern_list) + ")"

        new_data = Hieharchy()
        headers = ["COVER PAGE"]
        texts = []
        count = 0
        # NOTE(review): paragraphs without any header match are skipped, so
        # their text never reaches `texts` - confirm this is intended.
        for paragraph in data["paragraphs"]:
            s = paragraph["par_raw_text_t"]
            matches = re.findall(pattern_str, s, re.DOTALL)
            # Per-pattern matches; the list index doubles as hierarchy level.
            match_levels = [re.findall(pattern, s, re.DOTALL) for pattern in pattern_list]
            if len(matches) > 0:
                splits = [i for i in re.split(pattern_str, s) if len(i) > 2]
                for split in splits:
                    if (split in matches) and (split not in headers):
                        count+=1
                        # Close the previous section before opening a new one.
                        new_data.update(headers, ". ".join(texts))
                        i_level = [i for i, level in enumerate(match_levels) if split in level][0]
                        # Drop headers at the same or a deeper level.
                        headers = headers[:i_level]
                        headers.append(split)
                        texts = []
                    else:
                        texts.append(split.lstrip().rstrip())

        # Flush the section that is still open after the last paragraph;
        # without this the final header and its text never reach the tree.
        if count or texts:
            new_data.update(headers, ". ".join(texts))

        # Keep the set-2 parse for documents with top-level "SECTION ..."
        # headers and the set-1 parse for all others.
        has_sections = any(key.startswith("SECTION") for key in new_data.dictionary.keys())
        if (pattern_list == pattern_set_2) and has_sections:
            best_data = new_data
        elif (pattern_list == pattern_set_1) and not has_sections:
            best_data = new_data

    if best_data:
        with open(parsed_path, "w") as fp:
            json.dump(best_data.dictionary, fp)

with open(os.path.join(new_folder, "metadata.json"), "w") as fp:
    json.dump(doc_name_dir, fp)

100%|██████████| 1180/1180 [00:25<00:00, 46.40it/s]


In [42]:
def update2(input_dictionary, new_value, loc):
    """Set ``new_value`` under the key path ``loc`` in ``input_dictionary``.

    The path is turned into a nested single-key dictionary, e.g.
    ``loc=["a", "b"]`` gives ``{"a": {"b": new_value}}``, which is then merged
    with ``dict.update``. The merge is shallow: anything already stored under
    ``loc[0]`` is replaced, not merged.

    Args:
        input_dictionary: Dictionary to modify in place.
        new_value: Value stored at the end of the key path. With an empty
            ``loc`` it is passed to ``dict.update`` directly.
        loc: Sequence of keys, outermost first.

    Returns:
        The same ``input_dictionary`` object, after the update.
    """
    # `reduce` is not a builtin on Python 3, so import it where it is used.
    from functools import reduce

    # Wrap from the innermost key outwards.
    new_dict = reduce(lambda x, y: {y: x}, reversed(loc), new_value)
    input_dictionary.update(new_dict)
    return input_dictionary

In [131]:
# Debug run of the header parser on a single document: prints the header
# stack every time a new header is recognised.
path = "DODI 6000.15.json"

# Load the document once; its contents do not depend on the pattern set.
with open(os.path.join(corpus_path, path), "r") as fp:
    data = json.load(fp)

for pattern_list in [pattern_set_1, pattern_set_2]:
    # One alternation inside a capture group, so re.split keeps the matched
    # headers in its result.
    pattern_str = "(" + "|".join(pattern_list) + ")"

    new_data = Hieharchy()
    headers = ["COVER PAGE"]
    texts = []
    count = 0
    for paragraph in data["paragraphs"]:
        s = paragraph["par_raw_text_t"]
        matches = re.findall(pattern_str, s, re.DOTALL)
        # Per-pattern matches; the list index doubles as hierarchy level.
        match_levels = [re.findall(pattern, s, re.DOTALL) for pattern in pattern_list]
        if len(matches) > 0:
            splits = [i for i in re.split(pattern_str, s) if len(i) > 2]
            for split in splits:
                if split in matches:
                    count+=1
                    # Close the previous section before opening a new one.
                    new_data.update(headers, ". ".join(texts))
                    i_level = [i for i, level in enumerate(match_levels) if split in level][0]
                    # Drop headers at the same or a deeper level.
                    headers = headers[:i_level]
                    headers.append(split)
                    print(headers)
                    texts = []
                else:
                    texts.append(split.lstrip().rstrip())

    # Flush the section that is still open after the last paragraph; without
    # this the final header and its text never reach the tree.
    if count or texts:
        new_data.update(headers, ". ".join(texts))

    # Keep the set-2 parse for documents with top-level "SECTION ..." headers
    # and the set-1 parse for all others.
    has_sections = any(key.startswith("SECTION") for key in new_data.dictionary.keys())
    if (pattern_list == pattern_set_2) and has_sections:
        best_data = new_data
    elif (pattern_list == pattern_set_1) and not has_sections:
        best_data = new_data

['1 .PURPOSE T']
['2 . APPLICABILITY T']
['3 . GLOSSARY T']
['4 . POLICY T']
['2 . MTF']
['5 . RESPONSIBILITIES']
['6 . EFFECTIVE DATE T']
['1 .ENCLOSURE']
['2 .ENCLOSURE']
['1 . AMEDDC&S']
['2 . DASD ( HOP )']
['3 . DHP']
['4 . JMESDG']
['5 . JMESDP']
['6 . MHS']
['7 . MTF']
['8 . USUHS']
['9 . VMHI']
['COVER PAGE', '( a )']
['COVER PAGE', '( a )', '( b )']
['COVER PAGE', '( a )', '( b )', '( c )']
['COVER PAGE', '( a )', '( b )', '( c )', '( d )']
['COVER PAGE', '4.2 . MTF']
['COVER PAGE', '4.2 . MTF', '( e )']
['COVER PAGE', '4.2 . MTF', '( e )', '( f )']
['COVER PAGE', '4.2 . MTF', '( e )', '( f )', '( g )']
['COVER PAGE', '4.2 . MTF', '( e )', '( f )', '( h )']
['COVER PAGE', '1.1 . AMEDDC&S']
['COVER PAGE', '1.2 . DASD ( HOP )']
['COVER PAGE', '1.3 . DHP']
['COVER PAGE', '1.4 . JMESDG']
['COVER PAGE', '1.5 . JMESDP']
['COVER PAGE', '1.6 . MHS']
['COVER PAGE', '1.7 . MTF']
['COVER PAGE', '1.8 . USUHS']
['COVER PAGE', '1.9 . VMHI']


In [44]:
class LocalCorpus(object):
    """Iterable over the tokenized paragraphs of the ``.json`` files in a directory.

    Args:
        directory: Folder whose ``.json`` entries make up the corpus.
        return_id: When true, iteration yields ``(tokens, paragraph_id)``
            tuples instead of the tokens alone.
        min_token_len: A paragraph is yielded only when it has strictly more
            tokens than this.
        verbose: When true, the file list is wrapped in ``tqdm``.
    """

    def __init__(self, directory, return_id = False, min_token_len = 3, verbose = False):
        self.directory = directory
        self.return_id = return_id
        self.min_token_len = min_token_len
        self.verbose = verbose

        # Keep only entries whose name ends in ".json", as joined paths.
        json_paths = []
        for entry in os.listdir(directory):
            if entry[-5:] == ".json":
                json_paths.append(os.path.join(directory, entry))
        self.file_list = json_paths

    def __iter__(self):
        """Yield the tokens (optionally with the id) of each long-enough paragraph."""
        file_names = tqdm(self.file_list) if self.verbose else self.file_list

        for file_name in file_names:
            doc = self._get_doc(file_name)
            raw_texts = [par['par_raw_text_t'] for par in doc['paragraphs']]
            par_ids = [par['id'] for par in doc['paragraphs']]
            for raw_text, par_id in zip(raw_texts, par_ids):
                tokens = preprocess(raw_text, min_len=1)
                # Skip paragraphs that do not exceed the token threshold.
                if not len(tokens) > self.min_token_len:
                    continue
                if self.return_id:
                    yield tokens, par_id
                else:
                    yield tokens

    def _get_doc(self, file_name):
        """Parse the first line of ``file_name`` as JSON and return the result."""
        # Only the first line is read, so the whole document must sit on it.
        with open(file_name, "r") as handle:
            first_line = handle.readline()
        return json.loads(first_line)

In [34]:
# Print the raw text of every paragraph of the document currently held in
# `data`, one paragraph per line of output.
for paragraph in data["paragraphs"]:
    print(paragraph["par_raw_text_t"])

#data["paragraphs"][11]

Department of Defense INSTRUCTION 
NUMBER 1402.03 , Volume 2 April 30 , 2014 Incorporating Change 2 , April 20 , 2017 
DCMO SUBJECT : Senior Executive Service ( SES ) , Senior Level ( SL ) and Scientific and Professional ( ST ) Personnel Categories in the Do D Fourth Estate : Executive Resources Management References : See Enclosure 1 1 .PURPOSE 
a . Instruction .This instruction reissues Do D Directive ( Do DD ) 1402.3 ( Reference ( a ) ) as an instruction in accordance with the authority in Do DI 5105.82 and Deputy Secretary of Defense Memorandum ( References ( b ) and ( c ) ) .It is composed of multiple volumes , each containing its own purpose .The purpose of the overall instruction is to establish policy , assign responsibilities , delegate authorities , and provide Do D Fourth Estate entities with supplemental guidance to the policy , laws , and regulations relevant to the administration of the SES , SL , and ST categories , in accordance with Do DD 1403.1 ( Reference ( d ) ) , a