In [1]:
import os
import json
import pandas as pd

In [128]:
def extract_name(data):
    return data['givenName'] + ' ' + data['familyName']

def extract_bio(data):
    return data['profileText']

def get_congresses(data):
    jobs = data['jobPositions']
    for job in jobs:
        try:
            yield job['congressAffiliation']['congress']['congressNumber']
        except:
            print('bad field')

def load_bio(file):
    with open(file,'r') as in_file:
        data = json.load(in_file)
        data = {k:data[k] for k in [x for x in ['usCongressBioId','givenName','middleName','familyName','birthDate','birthCirca','deathDate','deathCirca','image','profileText'] if x in data.keys()]}
        data = streamline(data)
        if 'image' in data:
            if 'contentUrl' in data['image']:
                data['image'] = data['image']['contentUrl']
            else:
                data['image'] = None
        return pd.DataFrame({k:[data[k]] for k in data})

def df_to_md(df,out_name):
    with open(out_name,'w') as out_file:
        for i in range(0,len(df['name'])):
            out_file.write(f"# {df['name'][i]}\n")
            out_file.write(" \n")
            bio_lines = pd.Series(df['bio'][i].split(';')).apply(lambda x: x.strip())
            for line in bio_lines:
                if 'College' in line or 'University' in line:
                    out_file.write(f"- **{line}**\n")
                else:
                    out_file.write(f"- {line}\n")
            out_file.write(" \n")

def streamline(data):
    for k in data:
        if type(data[k]) == list:
            if len(data[k]) > 0:
                data[k] = data[k][0]
            else:
                data[k] = None
    return data

def clean_research_record(r_record):
    if 'parentRecordLocation' in r_record['recordLocation']:
        r_record['recordLocationName'] = r_record['recordLocation']['name'] + ', ' + r_record['recordLocation']['parentRecordLocation']['name']
    else:
        r_record['recordLocationName'] = r_record['recordLocation']['name']
    r_record['recordLocationAddress'] = r_record['recordLocation']['location']['addressLocality'] + ', ' + r_record['recordLocation']['location']['addressRegion']
    r_record = {k:r_record[k] for k in [x for x in ['name','recordType','description','recordLocationName','recordLocationAddress'] if x in r_record.keys()]}
    r_record = streamline(r_record)
    r_record = {k:[r_record[k]] for k in r_record}
    return pd.DataFrame(r_record)

def load_research_records(file):
    with open(file,'r') as in_file:
        data = json.load(in_file)
    if len(data['researchRecord']) > 0:
        return pd.concat([clean_research_record(record) for record in data['researchRecord']])
    else:
        return None

In [3]:
files = ['../data/raw/' + file for file in os.listdir('../data/raw')]

In [88]:
directory = pd.concat([load_bio(file) for file in files])
directory = directory.reset_index(drop=True).fillna('')
directory = directory.drop('image',axis=1)

In [89]:
directory.to_csv('../data/clean/directory.csv',index=False)

In [127]:
with open(files[3],'r') as in_file:
    data = json.load(in_file)
data['researchRecord']


[{'name': '1878. 1 item.',
  'recordType': ['Papers'],
  'recordLocation': {'name': 'Pierpont Morgan Library',
   'location': {'addressLocality': 'New York', 'addressRegion': 'NY'}}},
 {'name': 'October 17, 1888; November 15, 1893. 2 items.',
  'recordType': ['Papers'],
  'recordLocation': {'name': 'New-York Historical Society',
   'location': {'addressLocality': 'New York', 'addressRegion': 'NY'}}},
 {'name': '1850-1911. 185 items. Correspondence, speeches, clippings, and pamphlets. Includes correspondence with his wife while he served in the Texas legislature and in the Confederate army, and material on politics and government after the Civil War.',
  'recordType': ['Papers'],
  'recordLocation': {'name': 'Dallas Historical Society',
   'location': {'addressLocality': 'Dallas', 'addressRegion': 'TX'}}},
 {'name': 'Correspondence in George Fred Williams papers, 1877-1898.',
  'recordType': ['Papers'],
  'recordLocation': {'name': 'Massachusetts Historical Society',
   'url': 'http://t

In [130]:
[load_research_records(file) for file in files]

KeyError: 'addressLocality'