In [25]:
import requests
import json
import csv
from utils import api_token

In [26]:
# Function to create a session with the API token
def make_session():
    """Create an HTTP session that sends the API token with every request.

    Returns:
        A new ``requests.Session`` whose default headers include
        ``Authorization: Token <api_token>``.
    """
    # requests.session() is a deprecated alias kept only for backwards
    # compatibility; instantiate the Session class directly instead.
    session = requests.Session()
    session.headers.update({"Authorization": f"Token {api_token}"})
    return session

In [27]:
# Function to fetch data from the API endpoint with pagination handling for a limited number of pages
def fetch_data(url, max_pages):
    """Download up to ``max_pages`` pages of results from a paginated endpoint.

    Starting at ``url``, each response's ``results`` list is appended to one
    combined list and the response's ``next`` link is followed. Fetching stops
    when there is no ``next`` link, when ``max_pages`` pages have been
    fetched, or when a request returns a non-200 status. In the last case the
    error is printed and the rows collected so far are kept.

    As a testing aid, the combined list is also written to
    ``testing_data_fetch.json`` in the current working directory.

    Args:
        url: URL of the first page to request.
        max_pages: Maximum number of pages to fetch.

    Returns:
        list: The concatenated ``results`` entries of every fetched page.
    """
    all_data = []
    page_count = 0

    # Use the session as a context manager so its pooled connections are
    # released when fetching ends, including when a request raises.
    with make_session() as session:
        while url and page_count < max_pages:
            response = session.get(url)

            if response.status_code != 200:
                print(f"Error: {response.status_code}, {response.text}")
                break

            data = response.json()
            all_data.extend(data['results'])  # append this page's rows

            # 'next' is the key holding the URL of the following page
            next_url = data.get('next')
            if next_url is None:
                break
            url = next_url
            page_count += 1

    # testing: save data to a json file
    with open('testing_data_fetch.json', 'w') as f:
        json.dump(all_data, f)

    return all_data

In [31]:
# Function to process and flatten the data
def process_data(data):
    """Flatten position records into one flat dictionary per position.

    Args:
        data: Iterable of position dictionaries (the ``results`` entries
            returned by the API). Each must contain a nested ``person``
            dictionary and may contain a nested ``court`` dictionary.

    Returns:
        list[dict]: One dictionary per input record holding the person
        fields, the position fields and five ``court_*`` fields. The court
        fields are all ``None`` when the record has no court.
    """
    # (output column, key on the position record)
    position_columns = (
        ('position_url', 'resource_uri'),
        ('position_type', 'position_type'),
        ('job_title', 'job_title'),
        ('sector', 'sector'),
        ('organization', 'organization_name'),
        ('date_nominated', 'date_nominated'),
        ('date_start', 'date_start'),
        ('date_termination', 'date_termination'),
    )
    # (output column, key inside the nested court dictionary)
    court_columns = (
        ('court_resource_url', 'resource_uri'),
        ('court_id', 'id'),
        ('court_short_name', 'short_name'),
        ('court_full_name', 'full_name'),
        ('court_url', 'url'),
    )

    rows = []
    for position in data:
        person_record = position['person']

        row = {'person_id': person_record['id']}
        for name_key in ('name_first', 'name_middle', 'name_last'):
            row[name_key] = person_record[name_key]

        # Only the first listed affiliation is kept; an empty list gives None.
        affiliations = person_record['political_affiliations']
        if affiliations:
            row['political_affiliations'] = affiliations[0]['political_party']
        else:
            row['political_affiliations'] = None

        # Multiple race values are collapsed into one comma-separated string.
        row['race'] = ','.join(person_record['race'])

        for column, source_key in position_columns:
            row[column] = position[source_key]

        # A missing 'court' key and an explicit None are treated the same way.
        court = position.get('court')
        if court is None:
            for column, _unused in court_columns:
                row[column] = None
        else:
            for column, source_key in court_columns:
                row[column] = court[source_key]

        rows.append(row)

    return rows


In [33]:
# Function to save the data to a CSV file
def save_to_csv(data, filename):
    """Write flattened position rows to a CSV file with a header line.

    Args:
        data: Iterable of dictionaries whose keys are a subset of the column
            names listed below. ``csv.DictWriter`` raises ``ValueError`` for
            a row containing any other key.
        filename: Path of the CSV file to create or overwrite.
    """
    # check fields above - should be the same
    fieldnames = ['person_id', 'name_first', 'name_middle', 'name_last', 'political_affiliations', 'race', 'position_url',
                  'position_type', 'job_title', 'sector', 'organization', 'date_nominated', 'date_start', 'date_termination',
                  'court_resource_url', 'court_id', 'court_short_name', 'court_full_name', 'court_url']

    # Pin the encoding: without it the platform default is used, which can
    # fail on non-ASCII text and makes the output differ between machines.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

In [35]:
# Fetch and process position data, requesting at most MAX_PAGES result pages
base_url = "https://www.courtlistener.com/api/rest/v4/positions/" # updated the API to v4 from v3
MAX_PAGES = 150  # upper bound on the number of pages requested in one run
all_data = fetch_data(base_url, max_pages=MAX_PAGES)

# Process the fetched data
processed_data = process_data(all_data)
print("Processing done!")

Processing done!


Took 1 minute, 44 seconds to process 150 pages. 
* Total rows -> 3000 (positions)
* Unique people -> 569

In [36]:
# Save the processed data to a CSV file
output_filename = 'judges_positions_v5.csv'
save_to_csv(processed_data, output_filename)

# Print confirmation (uses the same name that was passed to save_to_csv)
print(f"Data saved to {output_filename}")

Data saved to judges_positions_v5.csv
