# Data Formatting

In [205]:
import json
import os
import pandas as pd
import nltk
import re
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from progressbar import ProgressBar

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/daniellesim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## CRS Files

Example instance

In [206]:
# Load a single CRS report to use as a worked example.  The context manager
# closes the file handle even if json.load raises on malformed input.
with open('gov-report/crs/95-118.json') as f:
    example = json.load(f)

In [207]:
def flattenParagraph(paragraphList):
    """Merge a section's paragraphs and split the result into sentences.

    The paragraphs are joined with single spaces, passed to ``sent_tokenize``
    and every resulting sentence is lower-cased.

    Returns a ``(sentences, count)`` tuple where ``count == len(sentences)``.
    """
    joined_text = ' '.join(paragraphList)
    lowered = [sentence.lower() for sentence in sent_tokenize(joined_text)]
    return lowered, len(lowered)

In [208]:
def sectionInfo(file, max_depth=4):
    """Collect section titles, sentence counts and sentences from a CRS report.

    ``file['reports']`` is read as the root section; every section is a dict
    with the keys ``section_title``, ``paragraphs`` and ``subsections``.
    Sections are visited depth-first in document order (a section is emitted
    before its own subsections), so the three returned lists stay aligned.

    Parameters
    ----------
    file : dict
        Parsed CRS JSON document.
    max_depth : int or None, optional
        How many levels of subsections below the root section are visited.
        The default of 4 matches the nesting depth this notebook has always
        handled; pass ``None`` to descend without limit.

    Returns
    -------
    tuple
        ``(section_names, section_lengths, content)``: lower-cased titles,
        the number of sentences in each section, and the flat list of all
        sentences in visiting order.
    """
    section_names = []
    section_lengths = []
    content = []

    def visit(section, depth):
        # Tokenise each section exactly once and reuse both results.
        sentences, length = flattenParagraph(section['paragraphs'])
        section_names.append(section['section_title'].lower())
        section_lengths.append(length)
        content.extend(sentences)

        # Descend inside the visit of *every* section.  Previously the
        # descent ran after the top-level loop had finished, so only the
        # last top-level subsection had its children collected, and a report
        # without subsections raised an UnboundLocalError.
        if max_depth is None or depth < max_depth:
            for child in section['subsections']:
                visit(child, depth + 1)

    visit(file['reports'], 0)

    return section_names, section_lengths, content


In [209]:
# Module-level TweetTokenizer instance.  NOTE(review): none of the cells shown
# here reference it (finalFormatting tokenises with word_tokenize) -- confirm
# whether anything else needs it before removing.
tknzr = TweetTokenizer()
def finalFormatting(file, section_names, section_lengths, content):
    """Build the formatted record for one report.

    Parameters
    ----------
    file : dict
        Parsed report; only ``file['id']`` is read.
    section_names, section_lengths : list
        Stored unchanged under the keys of the same name.
    content : iterable of str
        Sentences of the report, in order.

    Returns
    -------
    dict
        ``{'id', 'inputs', 'section_names', 'section_lengths'}`` where
        ``inputs`` holds one dict per sentence with the keys ``text`` (the
        cleaned sentence), ``tokens`` (``word_tokenize`` of the cleaned
        sentence), ``sentence_id`` (1-based position) and ``word_count``
        (number of tokens).
    """
    formatted = {}
    formatted['id'] = file['id']
    formatted['inputs'] = []

    for sentence_id, sentence in enumerate(content, start=1):
        # Pad basic punctuation with whitespace.  Inside the class, ';-<' is
        # parsed as the range ';'..'<' (exactly those two characters) and the
        # '-' that follows is a literal hyphen, so the set is
        # ! ? ( – ) — ; < - > /
        sentence_clean = re.sub('([!?(–)—;-<->/])', r' \1 ', sentence)
        # Pad the ending period with whitespace.  The dot must be escaped:
        # an unescaped '.' matches *any* final character, which dropped the
        # last letter of sentences without a period and appended a spurious
        # period after padded punctuation such as '?' or ')'.
        sentence_clean = re.sub(r'\.$', ' .', sentence_clean)
        # Clean up instances of '--' that the padding split apart.
        sentence_clean = re.sub(' - - ', ' -- ', sentence_clean)
        # Keep number-word instances such as '3-fold' together.
        sentence_clean = re.sub(r'([0-9]) - ', r'\1-', sentence_clean)
        # Pad commas in sentences but not inside numbers (e.g. '1,000').
        sentence_clean = re.sub(', ', ' , ', sentence_clean)
        # Replace the typographic apostrophe with the ASCII one.
        sentence_clean = re.sub('’', "'", sentence_clean)
        # Collapse repeated spaces and drop whitespace at the end.
        sentence_clean = re.sub(' +', ' ', sentence_clean).rstrip()

        # Tokenise once and reuse the result for the word count.
        tokens = word_tokenize(sentence_clean)

        sent = {}
        sent['text'] = sentence_clean
        sent['tokens'] = tokens
        sent['sentence_id'] = sentence_id
        sent['word_count'] = len(tokens)
        formatted['inputs'].append(sent)

    # Attach the section information computed by sectionInfo.
    formatted['section_names'] = section_names
    formatted['section_lengths'] = section_lengths

    return formatted

In [210]:
# sectionInfo returns (section_names, section_lengths, content); unpack it
# once instead of re-parsing the whole report for each argument.
finalFormatting(example, *sectionInfo(example))

{'id': '95-118',
 'inputs': [{'text': 'the pension benefit guaranty corporation ( pbgc ) is a federal agency established by the employee retirement income security act of 1974 ( erisa ; p.l .',
   'tokens': ['the',
    'pension',
    'benefit',
    'guaranty',
    'corporation',
    '(',
    'pbgc',
    ')',
    'is',
    'a',
    'federal',
    'agency',
    'established',
    'by',
    'the',
    'employee',
    'retirement',
    'income',
    'security',
    'act',
    'of',
    '1974',
    '(',
    'erisa',
    ';',
    'p.l',
    '.'],
   'sentence_id': 1,
   'word_count': 27},
  {'text': '93-406 ) .',
   'tokens': ['93-406', ')', '.'],
   'sentence_id': 2,
   'word_count': 3},
  {'text': 'it was created to protect the pe nsions of participants and beneficiaries covered by private sector defined benefit ( db ) plans .',
   'tokens': ['it',
    'was',
    'created',
    'to',
    'protect',
    'the',
    'pe',
    'nsions',
    'of',
    'participants',
    'and',
    'beneficiari

In [211]:
# Sanity check for the number-word rule used in finalFormatting: a digit
# followed by a padded hyphen is joined back onto the following word.
test = '3 - fold'
re.compile(r'([0-9]) - ').sub(r'\1-', test)

'3-fold'

### For all CRS files

In [212]:
# Names of all CRS report files (*.json) in the source directory.
path_to_json = 'gov-report/crs'
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(path_to_json)))

In [213]:
# Format every CRS report and write the result to formatted-data/crs/ under
# the same file name.
os.makedirs('formatted-data/crs', exist_ok=True)  # a fresh checkout has no output directory yet

pbar = ProgressBar()
for d in pbar(json_files):

    path = path_to_json + '/' + d
    # The context manager closes the input even if json.load raises.
    with open(path) as f:
        file = json.load(f)

    # Parse the report once; sectionInfo returns the three remaining arguments.
    out = finalFormatting(file, *sectionInfo(file))

    out_path = 'formatted-data/crs/' + d
    with open(out_path, 'w') as f:
        json.dump(out, f)

100% |########################################################################|


## GAO Files

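The GAO files use a slightly different dictionary schema (a top-level `report` list of sections instead of a single `reports` root section), so `sectionInfo` is redefined below. From this point on, the new definition replaces the CRS version.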

In [214]:
def sectionInfo(file, max_depth=4):
    """Collect section titles, sentence counts and sentences from a GAO report.

    ``file['report']`` is read as a list of top-level sections; every section
    is a dict with the keys ``section_title``, ``paragraphs`` and
    ``subsections``.  Sections are visited depth-first in document order (a
    section is emitted before its own subsections), so the three returned
    lists stay aligned.

    Parameters
    ----------
    file : dict
        Parsed GAO JSON document.
    max_depth : int or None, optional
        Number of nesting levels visited, counting the top-level sections as
        level 1.  The default of 4 matches the nesting depth this notebook
        has always handled; pass ``None`` to descend without limit.

    Returns
    -------
    tuple
        ``(section_names, section_lengths, content)``: lower-cased titles,
        the number of sentences in each section, and the flat list of all
        sentences in visiting order.
    """
    section_names = []
    section_lengths = []
    content = []

    def visit(section, depth):
        # Tokenise each section exactly once and reuse both results.
        sentences, length = flattenParagraph(section['paragraphs'])
        section_names.append(section['section_title'].lower())
        section_lengths.append(length)
        # extend() appends in place instead of copying the whole list for
        # every section.
        content.extend(sentences)

        if max_depth is None or depth < max_depth:
            for child in section['subsections']:
                visit(child, depth + 1)

    for top_level in file['report']:
        visit(top_level, 1)

    return section_names, section_lengths, content


Example instance

In [215]:
# Load a single GAO report to use as a worked example.  The context manager
# closes the file handle even if json.load raises on malformed input.
with open('gov-report/gao/AIMD-00-91.json') as f:
    example = json.load(f)

In [216]:
# sectionInfo returns (section_names, section_lengths, content); unpack it
# once instead of re-parsing the whole report for each argument.
finalFormatting(example, *sectionInfo(example))

{'id': 'AIMD-00-91',
 'inputs': [{'text': 'i am pleased to be here today to discuss the status of the 2000 census .',
   'tokens': ['i',
    'am',
    'pleased',
    'to',
    'be',
    'here',
    'today',
    'to',
    'discuss',
    'the',
    'status',
    'of',
    'the',
    '2000',
    'census',
    '.'],
   'sentence_id': 1,
   'word_count': 16},
  {'text': "with just over 6 weeks remaining until census day , april 1 , 2000 , the decade - long process of researching , planning , testing , and evaluating procedures for the nation's largest peace - time mobilization has come to a close , and the complex and costly data collection and tabulation phase is now under way .",
   'tokens': ['with',
    'just',
    'over',
    '6',
    'weeks',
    'remaining',
    'until',
    'census',
    'day',
    ',',
    'april',
    '1',
    ',',
    '2000',
    ',',
    'the',
    'decade',
    '-',
    'long',
    'process',
    'of',
    'researching',
    ',',
    'planning',
    ',',
    't

### For all GAO files

In [217]:
# Names of all GAO report files (*.json) in the source directory.
path_to_json = 'gov-report/gao'
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(path_to_json)))

In [218]:
# Format every GAO report and write the result to formatted-data/gao/ under
# the same file name.
os.makedirs('formatted-data/gao', exist_ok=True)  # a fresh checkout has no output directory yet

pbar = ProgressBar()
for d in pbar(json_files):

    path = path_to_json + '/' + d
    # The context manager closes the input even if json.load raises.
    with open(path) as f:
        file = json.load(f)

    # Parse the report once; sectionInfo returns the three remaining arguments.
    out = finalFormatting(file, *sectionInfo(file))

    out_path = 'formatted-data/gao/' + d
    with open(out_path, 'w') as f:
        json.dump(out, f)

100% |########################################################################|
