In [1]:
import os
import json
import pandas as pd
import random

In [2]:
def extract_name(data):
    """Return the display name "<givenName> <familyName>" for one record.

    `data` must provide the 'givenName' and 'familyName' keys; a missing
    key raises KeyError.
    """
    given, family = data['givenName'], data['familyName']
    return given + ' ' + family

def extract_bio(data):
    """Return the value stored under the 'profileText' key of `data`.

    Raises KeyError when the key is absent.
    """
    profile_text = data['profileText']
    return profile_text

def get_congresses(data):
    """Yield the congress number of every job position that has one.

    Parameters
    ----------
    data : dict
        Mapping whose 'jobPositions' entry is an iterable of job records.

    Yields
    ------
    The value found at
    ``job['congressAffiliation']['congress']['congressNumber']`` for each
    job where that path can be followed.

    Notes
    -----
    A job where the path cannot be followed is reported by printing
    'bad field' and is then skipped.
    """
    for job in data['jobPositions']:
        try:
            number = job['congressAffiliation']['congress']['congressNumber']
        except (KeyError, TypeError):
            # Only lookup failures are tolerated. A bare `except:` around the
            # yield would also swallow GeneratorExit, making close() on a
            # partly consumed generator raise RuntimeError.
            print('bad field')
            continue
        # Yield outside the try block so exceptions delivered at the yield
        # point are never mistaken for a bad record.
        yield number

def load_bio(file):
    """Load the biographical fields of one JSON file into a one-row DataFrame.

    Parameters
    ----------
    file : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per biographical field that is present
        in the file. List values are collapsed by ``streamline``. When an
        'image' field is present it is replaced by its 'contentUrl' value,
        or by None when no such value is available.
    """
    wanted = ['usCongressBioId','givenName','middleName','familyName','birthDate','birthCirca','deathDate','deathCirca','image','profileText']
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble non-ASCII text on some systems.
    with open(file, 'r', encoding='utf-8') as in_file:
        raw = json.load(in_file)
    record = streamline({key: raw[key] for key in wanted if key in raw})
    if 'image' in record:
        image = record['image']
        # streamline() turns an empty list into None, and a membership test
        # on None raises TypeError, so only look inside real mappings.
        record['image'] = image.get('contentUrl') if isinstance(image, dict) else None
    return pd.DataFrame({key: [value] for key, value in record.items()})

def df_to_md(df,out_name):
    """Write one markdown section per row of `df` to the file `out_name`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'name' column and a 'bio' column of strings.
    out_name : str
        Path of the markdown file to create (overwritten if it exists).

    Each row becomes a '# <name>' heading followed by one bullet per
    ';'-separated fragment of the bio. Fragments containing 'College' or
    'University' are written in bold.
    """
    with open(out_name, 'w', encoding='utf-8') as out_file:
        # Iterate by position: df['name'][i] is a label lookup and fails (or
        # picks the wrong row) whenever the index is not exactly 0..n-1,
        # e.g. after filtering or sampling.
        for name, bio in zip(df['name'], df['bio']):
            out_file.write(f"# {name}\n")
            out_file.write(" \n")
            for fragment in bio.split(';'):
                line = fragment.strip()
                if 'College' in line or 'University' in line:
                    out_file.write(f"- **{line}**\n")
                else:
                    out_file.write(f"- {line}\n")
            out_file.write(" \n")

def streamline(data):
    """Collapse every list value of `data` to a single item, in place.

    A non-empty list is replaced by its first element and an empty list by
    None; values of any other type are left alone. The same dict object is
    returned.
    """
    for key, value in list(data.items()):
        if type(value) != list:
            continue
        data[key] = value[0] if len(value) > 0 else None
    return data

def clean_research_record(r_record):
    """Flatten one research-record dict into a one-row DataFrame.

    Parameters
    ----------
    r_record : dict
        Must contain 'recordLocation', a dict with a 'name', a 'location'
        dict and optionally a 'parentRecordLocation' dict with its own
        'name'. May contain 'name', 'recordType' and 'description'.

    Returns
    -------
    pandas.DataFrame
        One row with whichever of 'name', 'recordType', 'description' are
        present, plus 'recordLocationName' (the location name, followed by
        ', <parent name>' when a parent exists) and 'recordLocationAddress'
        (the location values joined by ``flatten_dict``). List values are
        collapsed by ``streamline``.

    The input dict is not modified.
    """
    location = r_record['recordLocation']
    location_name = location['name']
    if 'parentRecordLocation' in location:
        location_name = location_name + ', ' + location['parentRecordLocation']['name']

    # Build a fresh dict instead of writing the derived fields back into the
    # caller's record, so the parsed JSON stays untouched.
    cleaned = {key: r_record[key] for key in ('name', 'recordType', 'description') if key in r_record}
    cleaned['recordLocationName'] = location_name
    cleaned['recordLocationAddress'] = flatten_dict(location['location'])
    cleaned = streamline(cleaned)
    return pd.DataFrame({key: [value] for key, value in cleaned.items()})

def load_research_records(file):
    """Load every research record of one JSON file into a DataFrame.

    Parameters
    ----------
    file : str
        Path of a JSON file with 'researchRecord' and 'usCongressBioId' keys.

    Returns
    -------
    pandas.DataFrame or None
        One row per record (built by ``clean_research_record``) with a
        'usCongressBioId' column added, or None when the file has no
        research records.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble non-ASCII text on some systems.
    with open(file, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)
    records = data['researchRecord']
    # Truthiness covers both an empty list and a JSON null.
    if not records:
        return None
    frame = pd.concat([clean_research_record(record) for record in records])
    frame['usCongressBioId'] = data['usCongressBioId']
    return frame

def flatten_dict(dict):
    """Join the values of `dict`, in iteration order, with ', '.

    The values must be strings; str.join raises TypeError otherwise.
    """
    separator = ', '
    values = dict.values()
    return separator.join(values)

def clean_job(job):
    """Flatten one job-position dict into a one-row DataFrame.

    Parameters
    ----------
    job : dict
        A job record. Every nested piece ('job', 'congressAffiliation' and
        the entries inside it) is optional.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns 'name', 'type', 'startDate',
        'startCirca', 'congress', 'congressType', 'party', 'caucus' and
        'representing'. A field whose source is absent is None. 'party' and
        'caucus' are the affiliation names joined with ', '.
    """
    # `or {}` covers both a missing key and an explicit null. The sibling
    # helpers treat 'congressAffiliation' as optional, so this one must not
    # raise KeyError on such a record either.
    position = job.get('job') or {}
    affiliation = job.get('congressAffiliation') or {}
    congress = affiliation.get('congress') or {}
    represents = affiliation.get('represents') or {}

    def _joined_names(key):
        # None when the affiliation list is absent; '' when it is empty.
        entries = affiliation.get(key)
        if entries is None:
            return None
        return ', '.join(entry['party']['name'] for entry in entries)

    return pd.DataFrame({
        'name': position.get('name'),
        'type': position.get('jobType'),
        'startDate': job.get('startDate'),
        'startCirca': job.get('startCirca'),
        'congress': congress.get('congressNumber'),
        'congressType': congress.get('congressType'),
        'party': _joined_names('partyAffiliation'),
        'caucus': _joined_names('caucusAffiliation'),
        'representing': represents.get('regionCode')
    },
    index=[0])

def load_jobs(file):
    """Load every job position of one JSON file into a DataFrame.

    Parameters
    ----------
    file : str
        Path of a JSON file with 'jobPositions' and 'usCongressBioId' keys.

    Returns
    -------
    pandas.DataFrame or None
        One row per job (built by ``clean_job``) with a 'usCongressBioId'
        column added, or None when the file lists no jobs.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble non-ASCII text on some systems.
    with open(file, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)
    # `or []` lets a JSON null behave like an empty list.
    job_sets = [clean_job(job) for job in (data['jobPositions'] or [])]
    if not job_sets:
        return None
    temp = pd.concat(job_sets)
    temp['usCongressBioId'] = data['usCongressBioId']
    return temp

def clean_congress(job):
    """Return the congress entry of a job as a one-row DataFrame.

    The row (index 0) is built from job['congressAffiliation']['congress'].
    None is returned when the job has no 'congressAffiliation' or the
    affiliation has no 'congress'.
    """
    if 'congressAffiliation' not in job:
        return None
    affiliation = job['congressAffiliation']
    if 'congress' not in affiliation:
        return None
    return pd.DataFrame(affiliation['congress'], index=[0])

def load_congresses(file):
    """Load the congress entries referenced by one JSON file's job positions.

    Parameters
    ----------
    file : str
        Path of a JSON file with 'jobPositions' and 'usCongressBioId' keys.

    Returns
    -------
    pandas.DataFrame or None
        One row per job that has a congress (built by ``clean_congress``)
        with a 'usCongressBioId' column added, or None when no job in the
        file has one.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble non-ASCII text on some systems.
    with open(file, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)
    # clean_congress() returns None for a job without a congress. Drop those
    # before testing for emptiness: pd.concat raises ValueError when every
    # object it is given is None.
    congresses = [
        frame
        for frame in (clean_congress(job) for job in (data['jobPositions'] or []))
        if frame is not None
    ]
    if not congresses:
        return None
    temp = pd.concat(congresses)
    temp['usCongressBioId'] = data['usCongressBioId']
    return temp

In [3]:
# os.listdir returns entries in arbitrary order, so sort to keep the row order
# of every exported table reproducible between runs. Dotfiles and
# sub-directories (e.g. .ipynb_checkpoints) are not data files and would make
# the loaders fail, so they are skipped.
files = sorted(
    '../data/raw/' + file
    for file in os.listdir('../data/raw')
    if not file.startswith('.') and os.path.isfile('../data/raw/' + file)
)

In [4]:
# One row per member. load_bio() only emits the fields present in each file,
# so a field missing from every file would not exist as a column at all.
# reindex pins the schema (and drops 'image') without raising KeyError.
directory = (
    pd.concat([load_bio(file) for file in files], ignore_index=True)
    .reindex(columns=['usCongressBioId','givenName','middleName','familyName','birthDate','birthCirca','deathDate','deathCirca','profileText'])
    .fillna('')
)
# fullName sits right after the id column in the exported table.
directory.insert(1, 'fullName', directory['givenName'] + ' ' + directory['familyName'])

In [59]:
# to_csv does not create missing folders; make sure the output directory
# exists so the notebook also runs on a fresh checkout.
os.makedirs('../data/clean', exist_ok=True)
directory.to_csv('../data/clean/directory.csv',index=False)

In [10]:
# One row per research record, labelled with the member's full name.
# Files without research records contribute None, which pd.concat skips.
research_materials = (
    pd.concat([load_research_records(file) for file in files])
    .reset_index(drop=True)
    .fillna('')
    .merge(directory[['usCongressBioId','fullName']], on='usCongressBioId')
    .rename(columns={
        'fullName':'representativeName',
        'name':'recordName',
        'description':'recordDescription'
    })
    .loc[:, ['usCongressBioId','representativeName','recordName','recordType','recordDescription','recordLocationName','recordLocationAddress']]
)

In [61]:
# to_csv does not create missing folders; make sure the output directory
# exists so the notebook also runs on a fresh checkout.
os.makedirs('../data/clean', exist_ok=True)
research_materials.to_csv('../data/clean/research-materials.csv',index=False)

In [None]:
# One row per job position, labelled with the member's full name and a
# congressID of the form '<congressType>_<congress>'.
congress_jobs = (
    pd.concat([load_jobs(file) for file in files])
    .reset_index(drop=True)
    .fillna('')
    .merge(directory[['usCongressBioId','fullName']], on='usCongressBioId')
    .assign(congressID=lambda jobs: jobs['congressType'] + '_' + jobs['congress'].map(str))
    .rename(columns={
        'name':'jobName',
        'type':'jobType',
        'fullName':'representativeName',
        'startDate':'jobStartDate',
        'startCirca':'jobStartCirca',
        'representing':'stateRepresented'
    })
    .loc[:, ['usCongressBioId','representativeName','congressID','jobName','jobType','stateRepresented','party','caucus','jobStartDate','jobStartCirca']]
)

In [134]:
# to_csv does not create missing folders; make sure the output directory
# exists so the notebook also runs on a fresh checkout.
os.makedirs('../data/clean', exist_ok=True)
congress_jobs.to_csv('../data/clean/congressional-positions.csv',index=False)

In [155]:
# One row per distinct congress, keyed by '<congressType>_<congressNumber>',
# ordered by start date; entries without a start date are dropped.
congresses = (
    pd.concat([load_congresses(file) for file in files])
    .reset_index(drop=True)
    .fillna('')
    .loc[:, ['name','congressNumber','congressType','startDate','endDate']]
    .drop_duplicates()
    .assign(congressID=lambda frame: frame['congressType'] + '_' + frame['congressNumber'].map(str))
    .rename(columns={
        'name':'congressName',
        'startDate':'congressStartDate',
        'endDate':'congressEndDate'
    })
    .sort_values('congressStartDate')
    .loc[lambda frame: frame['congressStartDate'] != '']
    .reset_index(drop=True)
    .loc[:, ['congressID','congressNumber','congressName','congressType','congressStartDate','congressEndDate']]
)

In [157]:
# to_csv does not create missing folders; make sure the output directory
# exists so the notebook also runs on a fresh checkout.
os.makedirs('../data/clean', exist_ok=True)
congresses.to_csv('../data/clean/congressional-sessions.csv',index=False)