In [2]:
# Importing libraries
import requests
import json
import csv
import string
from utils import api_token

In [3]:
# Function to create a session with the API token
def make_session():
    s = requests.session()
    s.headers.update({"Authorization": f"Token {api_token}"})
    return s

In [4]:
# Function to fetch data from the API endpoint with pagination handling for a limited number of pages
def fetch_data(url,start_page,max_pages):
    session = make_session()
    all_data = []
    page_count = 0
    
    while url and page_count < max_pages:
        response = session.get(url)
        
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
        
        data = response.json()
        all_data.extend(data['results']) # append data from the current page to the list

        if 'next' in data and data['next'] is not None: # next is the key for the next page
            url = data['next']
            page_count += 1
        else:
            break
    
    # testing: save data to a json file
    with open('testing_data_fetch.json', 'w') as f:
        json.dump(all_data, f)
    
    print(f"Total number of pages fetched: {page_count} starting from page {start_page}")
    return url, all_data

In [14]:
# Function to process and flatten the data
def process_data(data):
    processed_data = []  # processed data is a list of elements
    for item in data:  # data is the list of "results" from the API response; each item in the list is a dictionary of financial disclosures

        # Extract general fields
        base_entry = {
            'person_url': item['person'],
            'person_id': item['person'].rstrip('/').split('/')[-1],  # last element of the person url gives ID
            'filepath': item['filepath'], # url of the pdf of the disclosure
            'disclosure_year': item['year'],
            'notes': item['addendum_content_raw'],
            'date_created': item['date_created'],
            'date_modified': item['date_modified'],
        }

        # Process agreements
        for agreement in item['agreements']:
            entry = base_entry.copy()
            entry.update({
                'type': 'agreement',
                'agreement_id': agreement.get('id', None),
                'agree_date_raw': agreement.get('date_raw', None),
                'parties_and_terms': agreement.get('parties_and_terms', None),
                'agree_redacted': agreement.get('redacted', None)
            })
            processed_data.append(entry)
        
        # Process gifts
        for gift in item['gifts']:
            entry = base_entry.copy()
            entry.update({
                'type': 'gift',
                'gift_id': gift.get('id', None),
                'gift_source': gift.get('source', None),
                'gift_description': gift.get('description', None),
                'gift_value': gift.get('value', None),
                'gift_redacted': gift.get('redacted', None)
            })
            processed_data.append(entry)

        # Process investments
        for investment in item['investments']:
            entry = base_entry.copy()
            entry.update({
                'type': 'investment',
                'investment_id': investment.get('id', None),
                'inv_description': investment.get('description', None),
                'inv_redacted': investment.get('redacted', None),
                'income_during_reporting_period_code': investment.get('income_during_reporting_period_code', None),
                'income_during_reporting_period_type': investment.get('income_during_reporting_period_type', None),
                'gross_value_code': investment.get('gross_value_code', None),
                'gross_value_method': investment.get('gross_value_method', None),
                'transaction_during_reporting_period': investment.get('transaction_during_reporting_period', None),
                'transaction_date_raw': investment.get('transaction_date_raw', None),
                'transaction_date': investment.get('transaction_date', None),
                'transaction_value_code': investment.get('transaction_value_code', None),
                'transaction_gain_code': investment.get('transaction_gain_code', None),
                'transaction_partner': investment.get('transaction_partner', None),
                'inv_has_inferred_values': investment.get('has_inferred_values', None)
            })
            processed_data.append(entry)

        # Process non-investment incomes
        for non_investment_income in item['non_investment_incomes']:
            entry = base_entry.copy()
            entry.update({
                'type': 'non_investment_income',
                'non_investment_income_id': non_investment_income.get('id', None),
                'non_inv_date_raw': non_investment_income.get('date_raw', None),
                'non_inv_source_type': non_investment_income.get('source_type', None),
                'non_inv_income_amount': non_investment_income.get('income_amount', None),
                'non_inv_redacted': non_investment_income.get('redacted', None)
            })
            processed_data.append(entry)
        
        # Process positions if they exist
        for position in item['positions']:
            entry = base_entry.copy()
            entry.update({
                'type': 'position',
                'position_id': position.get('id', None),
                'position': position.get('position', None),
                'organization_name': position.get('organization_name', None),
                'position_date_raw': position.get('date_raw', None), # this column doesn't exist - but no harm done
                'position_redacted': position.get('redacted', None)
            })
            processed_data.append(entry)

        # Process reimbursements
        for reimbursement in item['reimbursements']:
            entry = base_entry.copy()
            entry.update({
                'type': 'reimbursement',
                'reimbursement_id': reimbursement.get('id', None),
                'reimb_source': reimbursement.get('source', None),
                'reimb_date_raw': reimbursement.get('date_raw', None),
                'reimb_location': reimbursement.get('location', None),
                'purpose': reimbursement.get('purpose', None),
                'items_paid_or_provided': reimbursement.get('items_paid_or_provided', None),
                'reimb_redacted': reimbursement.get('redacted', None)
            })
            processed_data.append(entry)

        # Process spouse incomes
        for spouse_income in item['spouse_incomes']:
            entry = base_entry.copy()
            entry.update({
                'type': 'spouse_income',
                'spouse_income_id': spouse_income.get('id', None),
                'sp_inc_source_type': spouse_income.get('source_type', None),
                'sp_inc_date_raw': spouse_income.get('date_raw', None),
                'sp_inc_redacted': spouse_income.get('redacted', None)
            })
            processed_data.append(entry)
    print(f"Total number of entries processed: {len(processed_data)}")
    return processed_data

In [15]:
# Function to save the data to a CSV file
def save_to_csv(data, filename):
    with open(filename, 'w', newline='') as csvfile:
        fieldnames = [ # 8 categories of fields: general, agreements, investments, non-investment incomes, positions, reimbursements, spouse incomes 
            'person_id', 'person_url', 'filepath', 'disclosure_year', 'notes', 'date_created', 'date_modified', # general fields
            'type', 'agreement_id', 'agree_date_raw', 'parties_and_terms','agree_redacted',  # agreement fields
            'gift_id', 'gift_source', 'gift_description', 'gift_value', 'gift_redacted', # gift fields

            'investment_id','inv_description', 'inv_redacted', 'income_during_reporting_period_code', 'income_during_reporting_period_type', # investment fields
            'gross_value_code', 'gross_value_method', 'transaction_during_reporting_period', 'transaction_date_raw',
            'transaction_date', 'transaction_value_code', 'transaction_gain_code', 'transaction_partner','inv_has_inferred_values',

            'non_investment_income_id', 'non_inv_date_raw', 'non_inv_source_type', 'non_inv_income_amount', 'non_inv_redacted', # non-investment income fields
            'position_id', 'position', 'organization_name', 'date_raw_position', 'position_redacted', # position fields # date_raw_position doesn't exist in the data
            'reimbursement_id', 'reimb_source', 'date_raw_reimb', 'reimb_location', 'reimb_purpose', 'items_paid_or_provided','reimb_redacted', # reimbursement fields # date_raw_reimb doesn't exist in the data
            'spouse_income_id', 'sp_inc_source_type','sp_inc_date_raw', 'sp_inc_redacted' # spouse income fields
        ]
        
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for row in data:
            # Use dictionary comprehension to handle missing keys by setting them to None
            writer.writerow({field: row.get(field, None) for field in fieldnames})
        print(f"Data saved to {filename}")


In [17]:
# Fetch and process data for the first 5 pages
base_url = "https://www.courtlistener.com/api/rest/v4/financial-disclosures/" # updated the API to v4 from v3

# For financial disclosures, we know there are 1617 pages of data in CourtListener API
# Since one financial disclosure has a lot of data, we will fetch only 30 pages of data per csv file
start_page = 1
max_pages = 30 # maximum number of pages to fetch
while start_page <= 1620: #1620 is the maximum number of pages
    url = base_url
    base_url, data = fetch_data(url,start_page, max_pages)
    processed_data = process_data(data)
    save_to_csv(processed_data, f"fin_disc_{start_page}-{start_page + max_pages}.csv")
    print(f"Fetched, processed and saved data for pages {start_page} to {start_page + max_pages}!")
    start_page += max_pages

Total number of pages fetched: 30 starting from page 1
Total number of entries processed: 92123
Data saved to fin_disc_1-31.csv
Fetched, processed and saved data for pages 1 to 31!
Total number of pages fetched: 30 starting from page 31
Total number of entries processed: 86016
Data saved to fin_disc_31-61.csv
Fetched, processed and saved data for pages 31 to 61!
Total number of pages fetched: 30 starting from page 61
Total number of entries processed: 46963
Data saved to fin_disc_61-91.csv
Fetched, processed and saved data for pages 61 to 91!
Total number of pages fetched: 30 starting from page 91
Total number of entries processed: 44565
Data saved to fin_disc_91-121.csv
Fetched, processed and saved data for pages 91 to 121!
Total number of pages fetched: 30 starting from page 121
Total number of entries processed: 3176
Data saved to fin_disc_121-151.csv
Fetched, processed and saved data for pages 121 to 151!
Total number of pages fetched: 30 starting from page 151
Total number of entr