# Quick Search Analysis for NDAA

## GC Document Processing to JSON required before hand

In [1]:
import json
import pandas as pd
from ipypublish import nb_setup
with open('NDAA_FY2020.json') as f:
    data = json.load(f)

## Defining Keywords for analysis
Current keywords: transform, reform

Choosing stem words to catch the largest amount of variance


In [2]:
keywords = ['transform', 'reform']
key_dict = {}

In [3]:
for par in data['paragraphs']:
    for key in keywords:
        if key in par['par_raw_text_t']:
            if key in key_dict:
                key_dict[key].append(par)
            else:
                key_dict[key] = [par]

In [4]:
all_df = []
out_dict = []
for keyword in keywords:
    for item in key_dict[keyword]:
        entity = []
        for ent in item['entities']:
            for ent_s in item['entities'][ent]:
                entity.append(ent_s)
        out_dict.append({
            'keyword': keyword,
            'filename':item['filename'],
            'page_num':item['page_num_i'],
            'par_raw_text_t':item['par_raw_text_t'],
            'entities':entity,
        })
pd = nb_setup.setup_pandas(escape_latex = False)
df = pd.DataFrame(out_dict)
df

Unnamed: 0,keyword,filename,page_num,par_raw_text_t,entities
0,transform,NDAA_FY2020.pdf,110,H . R . 6395—111 ( d )AGENCY PARTICIPATION.—Th...,"[the Food and Drug Administration, the Nationa..."
1,transform,NDAA_FY2020.pdf,353,H . R . 6395—354 and applicability of such tec...,"[the Chief Information Office, the Defense Adv..."
2,transform,NDAA_FY2020.pdf,551,H . R . 6395—552 ‘ ‘ ( vi ) transformation of ...,"[State, the Department of Defense, SEC, C the ..."
3,transform,NDAA_FY2020.pdf,657,H . R . 6395—658 ( B ) notify the appropriate ...,"[National Security Space Launch, the Space For..."
4,transform,NDAA_FY2020.pdf,1065,"H . R . 6395—1066 SEC . 4201 .RESEARCH , DEVEL...","[T&E, RAND ARROYO CENTER, MLRS, CHINOOK, SEC, ..."
5,reform,NDAA_FY2020.pdf,2,H . R . 6395—3 Sec . 212 .Disclosure requireme...,"[the Joint Artificial Intelligence Center, the..."
6,reform,NDAA_FY2020.pdf,10,H . R . 6395—11 Sec . 757 .Study on force mix ...,"[Department of Defense, Department of Veterans..."
7,reform,NDAA_FY2020.pdf,11,H . R . 6395—12 Sec . 838 .Comptroller General...,"[Congress, the Small Business Technology Trans..."
8,reform,NDAA_FY2020.pdf,23,H . R . 6395—24 Sec . 2503 .Execution of proje...,"[Department of Defense, Department of Defense ..."
9,reform,NDAA_FY2020.pdf,66,H . R . 6395—67 Sec . 212 .Disclosure requirem...,"[the Joint Artificial Intelligence Center, the..."


## Writing simple table to file
Output File is in attached CSV at

In [5]:
df.to_csv('analysis/NDAA_FY2020_QuickSearch.csv')

## Processing list of NDAA's

In [33]:
doc_list = ["NDAA_FY" + str(x) for x in range(2010,2021)]


keywords = ["Transformation", "Reform", "Defense management reform", "Reform policy", "reform guidance", 
            "reform guidelines", "Improve efficiency of business operations","Business process reengineering",
            "Busines transformation agency","Business transformation","Defense Business systems modernization",
            "Enterprise business operations reform","Enterprise business operations re-engineering",
            "Data collection", "data management", "data dissemination", "data visualization","Human resources reform",
            "Workforce Improvements","Acquisitions reform","Logistics reform","Medical reform", "health reform",
            "Information technology reform","Digital modernization","Evidence based decision making",
            "Data support for decision making","Deputy Chief Management Officer","Chief Management Officer",
            "Defense Business Council","Cost savings","Cost avoidance","Investments","Process improvements"]

key_dict = {}
data = {}
out_dict = []
for doc in doc_list:
    with open("out/" +doc + '.json') as f:
        data = json.load(f)
        
    for par in data['paragraphs']:
        for key in keywords:
            if key.lower() in par['par_raw_text_t'].lower():
                if key in key_dict:
                    key_dict[key].append(par)
                else:
                    key_dict[key] = [par]
    all_df = []

    for keyword in keywords:
        if keyword in list(key_dict.keys()):
            for item in key_dict[keyword]:
                entity = []
                for ent in item['entities']:
                    for ent_s in item['entities'][ent]:
                        entity.append(ent_s)
                out_dict.append({
                    'NDAA':item['filename'].replace("NDAA_","").replace(".pdf",""),
                    'keyword': keyword,
                    'filename':item['filename'],
                    'page_num':item['page_num_i'],
                    'par_raw_text_t':item['par_raw_text_t'],
                    'entities':entity,
                })
pd = nb_setup.setup_pandas(escape_latex = False)
df = pd.DataFrame(out_dict)
df.to_csv("analysis/FY10_FY20.csv")