In [48]:
# Importing libraries
import os
import pandas as pd
import timeit
import pyarrow.parquet as pq
from utils import api_token
import requests
import glob
import time
import json
import string

In [49]:
# Build the input path: the 'data' folder sits two directory levels above
# the current working directory.
current_path = os.getcwd()
parent_dir = os.path.split(current_path)[0]
grandparent_dir = os.path.split(parent_dir)[0]
data_dir = os.path.join(grandparent_dir, 'data')
file_path = os.path.join(
    data_dir,
    'persons_positions/people_positions_disc_summary.parquet',
)
print(file_path)

/Users/eshan23/eshanprashar_git_profile/judges-conflicts/data/persons_positions/people_positions_disc_summary.parquet


In [50]:
# Load the people / positions / disclosures summary and preview its first rows
people_fin_dis_summary = pd.read_parquet(path=file_path)
people_fin_dis_summary.head(n=5)

Unnamed: 0,person_id,judge_flag,position,status,disclosures_count,name_first,name_middle,name_last,political_affiliations,race
0,1,0,1,non-judge,,George,,Washington,f,w
1,2,0,1,non-judge,,John,,Adams,f,w
2,3,0,1,non-judge,,Thomas,,Jefferson,j,w
3,4,0,1,non-judge,,James,,Madison,j,w
4,5,0,1,non-judge,,James,,Monroe,j,w


In [51]:
# Keep rows with at least one disclosure whose status is anything other than 'non-judge'
judges_with_disclosures_summary = people_fin_dis_summary.loc[
    people_fin_dis_summary['disclosures_count'].ge(1)
    & people_fin_dis_summary['status'].ne('non-judge')
]
judges_with_disclosures_summary.head(n=5)

Unnamed: 0,person_id,judge_flag,position,status,disclosures_count,name_first,name_middle,name_last,political_affiliations,race
43,44,1,6,judge with multiple positions,5.0,Leslie,Joyce,Abrams,d,b
44,45,1,9,judge with multiple positions,9.0,Ronnie,,Abrams,d,w
47,48,1,3,judge with multiple positions,14.0,William,Marsh,Acker,r,w
48,49,7,10,judge with multiple positions,6.0,Harold,Arnold,Ackerman,d,w
50,51,1,6,judge with multiple positions,7.0,Raymond,L.,Acosta,r,h


In [57]:
# Persist the filtered judges table as an intermediate CSV (row index omitted)
judges_with_disclosures_summary.to_csv(
    path_or_buf=os.path.join(data_dir, 'dockets/intermediate_dfs/judges_with_disclosures_summary.csv'),
    index=False,
)

In [52]:
# Draw a reproducible random sample of 25 judges to drive the API requests
seed = 0
sample_size = 25
sampled_judges_with_disclosures = (
    judges_with_disclosures_summary
    .sample(n=sample_size, random_state=seed)
    .reset_index(drop=True)
)
sampled_judges_with_disclosures.head(n=5)

Unnamed: 0,person_id,judge_flag,position,status,disclosures_count,name_first,name_middle,name_last,political_affiliations,race
0,2783,1,3,judge with multiple positions,11.0,Lee,Hyman,Rosenthal,r,w
1,3163,1,5,judge with multiple positions,16.0,Richard,C.,Tallman,d,w
2,9090,1,1,judge with 1 position,4.0,Robert,M.,Illman,,
3,2034,1,4,judge with multiple positions,16.0,Algenon,L.,Marbley,d,b
4,686,1,9,judge with multiple positions,18.0,Steven,M.,Colloton,r,w


### GPT Prompt:
I have a dataframe called "sampled_judges_with_disclosures" with a column "person_id" - I want to go through the person_id one by one and fetch data for each using the api request. For each, I want to save the data to a csv and log errors. Once requests for all authors are done, I want to collate results from all csvs into a single dataframe. Please construct functions to achieve this workflow.

### Response:
To achieve your goal of fetching data for each judge with financial disclosures from CourtListener's API and then saving the results to CSV files, you can construct a workflow consisting of the following steps:

1. **Function to fetch data from the API:** takes an `author_id` (equivalent to the `person_id` in your dataframe) and fetches the data using the API.
2. **Function to save data to CSV:** saves the data to a CSV file named after the `author_id`.
3. **Main function to iterate over all judges:** iterates over all `person_id` values in your dataframe, fetches the data, saves it, and handles errors.
4. **Function to combine all CSVs into a single DataFrame:** reads all CSVs and combines them into one dataframe.

In [53]:
def fetch_data(author_id, api_token):
    """
    Fetch every search result for a given author_id from the CourtListener API.

    The first request goes to the v4 search endpoint with the query
    ``author_id:<author_id>``. Every following request uses the ``next`` URL
    taken from the previous response, exactly as returned, until that field
    is empty.

    Parameters
    ----------
    author_id : int or str
        Identifier interpolated into the ``author_id:`` search query.
    api_token : str
        Token sent in the ``Authorization`` header.

    Returns
    -------
    list or None
        The concatenated ``results`` entries of all pages, or ``None`` when a
        request fails or a response body is not valid JSON (the failure is
        recorded through ``log_error``). A mismatch between the number of
        fetched results and the ``count`` reported by the first page is also
        logged, but the results are still returned.
    """
    base_url = "https://www.courtlistener.com/api/rest/v4/search/"
    headers = {"Authorization": f"Token {api_token}"}

    # Query parameters belong to the first request only. Requests appends
    # `params` to whatever query string the URL already has, so re-sending
    # them with a `next` link would duplicate `q` / `page_size` on a URL that
    # carries its own pagination state.
    params = {
        "q": f"author_id:{author_id}",
        "page_size": 100,
    }

    all_results = []
    next_page = base_url
    total_count = None

    while next_page:
        try:
            # The timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(next_page, headers=headers, params=params, timeout=60)
            response.raise_for_status()  # Raise an error for non-2xx responses
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on Requests versions whose
            # decode error is not a RequestException subclass.
            log_error(author_id, str(e))
            return None

        if total_count is None:
            total_count = data.get('count', 0)  # Total reported by the first page

        all_results.extend(data.get('results') or [])
        next_page = data.get('next')
        params = None  # Follow-up pages use the `next` URL untouched

    # Check if the total count matches the expected count
    if len(all_results) != total_count:
        log_error(author_id, f"Mismatch in total count. Expected: {total_count}, Fetched: {len(all_results)}")

    return all_results

In [24]:
# Function to log errors to a JSON file
def log_error(author_id, error_message, log_file='api_errors.json'):
    """
    Append one error record to a JSON log file and echo it to stdout.

    The log file holds a single JSON list of ``{"author_id", "error"}``
    objects. A missing or empty file is treated as an empty list.

    Parameters
    ----------
    author_id : int or str
        Identifier the error relates to.
    error_message : str
        Human-readable description of the failure.
    log_file : str, optional
        Path of the JSON log file (default ``'api_errors.json'``).

    Raises
    ------
    json.JSONDecodeError
        If the existing log file has content that is not valid JSON.
    """
    errors = []
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            existing = f.read()
        # An empty file (e.g. left behind by an interrupted run) is not valid
        # JSON; treat it as "no errors yet" instead of failing.
        if existing.strip():
            errors = json.loads(existing)

    errors.append({"author_id": author_id, "error": error_message})

    # Write the full list to a sibling temp file and swap it in, so an
    # interruption mid-write cannot leave a truncated log behind.
    tmp_path = f"{log_file}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(errors, f, indent=4)
    os.replace(tmp_path, log_file)

    print(f"Error logged for author_id {author_id}: {error_message}")

In [25]:
# Function to save data to CSV
def save_to_csv(data, author_id, output_dir='api_results'):
    """
    Save the fetched records for one author to ``<output_dir>/<author_id>.csv``.

    Parameters
    ----------
    data : list of dict or None
        Records to flatten with ``pd.json_normalize``. ``None`` or an empty
        list means there is nothing to save.
    author_id : int or str
        Used as the CSV file name (without extension).
    output_dir : str, optional
        Directory for the CSV files; created if it does not exist.

    Notes
    -----
    When there is nothing to save, including records that flatten to a frame
    without any columns, no file is written and the case is recorded through
    ``log_error``. A column-less CSV cannot be parsed back by ``pd.read_csv``
    ("No columns to parse from file").
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    df = pd.json_normalize(data) if data else None
    if df is None or df.columns.empty:
        log_error(author_id, "No data to save.")
        print(f"No data to save for author_id {author_id}")
        return

    csv_filename = os.path.join(output_dir, f"{author_id}.csv")
    df.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")

In [26]:
# Function to iterate over the judges in a dataframe and fetch data
def fetch_and_save_all_judges(df, api_token, output_dir='api_results', delay=1, skip_existing=False):
    """
    Fetch and save data for every ``person_id`` in the dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``person_id`` column; each value is used as author_id.
    api_token : str
        Token forwarded to ``fetch_data``.
    output_dir : str, optional
        Directory forwarded to ``save_to_csv``.
    delay : int or float, optional
        Seconds to pause after each request to stay under the API rate
        limit (default 1, the previously hard-coded value). Use 0 to disable.
    skip_existing : bool, optional
        When True, ids whose ``<output_dir>/<author_id>.csv`` already exists
        are not fetched again, so an interrupted run can be resumed.
        Defaults to False, which fetches every id.
    """
    person_ids = df['person_id']
    total = len(person_ids)

    for position, author_id in enumerate(person_ids, start=1):
        if skip_existing and os.path.exists(os.path.join(output_dir, f"{author_id}.csv")):
            print(f"Skipping author_id: {author_id} ({position}/{total}) - already saved")
            continue

        print(f"Fetching data for author_id: {author_id} ({position}/{total})")
        data = fetch_data(author_id, api_token)
        save_to_csv(data, author_id, output_dir=output_dir)

        if delay:
            time.sleep(delay)  # Sleep to avoid hitting the API rate limit

In [27]:
# Function to combine all CSVs into a single dataframe
def combine_csvs_to_dataframe(output_dir='api_results'):
    """
    Combine all CSV files in the output directory into a single DataFrame.

    Parameters
    ----------
    output_dir : str, optional
        Directory searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Rows of every readable CSV stacked with a fresh integer index. Files
        are read in sorted path order so the row order is reproducible. An
        empty DataFrame is returned when no file could be read.

    Notes
    -----
    Files that cannot be read or parsed (for example empty files) are
    reported on stdout and skipped.
    """
    all_files = sorted(glob.glob(os.path.join(output_dir, "*.csv")))
    print(f"Found {len(all_files)} files in the directory")

    frames = []
    for filename in all_files:
        try:
            frames.append(pd.read_csv(filename, index_col=None, header=0))
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError, OSError) as e:
            print(f"Error reading file {filename}: {e}")

    # pd.concat raises ValueError on an empty list, so handle "nothing readable" explicitly
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)

In [55]:
# Example of a URL that will work: the v4 search endpoint with the query
# q=author_id:48, the same endpoint and query form that fetch_data builds.
url = "https://www.courtlistener.com/api/rest/v4/search/?q=author_id:48"

In [29]:
# Run the fetch-and-save loop over every judge in judges_with_disclosures_summary
fetch_and_save_all_judges(df=judges_with_disclosures_summary, api_token=api_token)

Fetching data for author_id: 44 (1/3345)
Error logged for author_id 44: No data to save.
No data to save for author_id 44
Fetching data for author_id: 45 (2/3345)
Error logged for author_id 45: No data to save.
No data to save for author_id 45
Fetching data for author_id: 48 (3/3345)
Data saved to api_results/48.csv
Fetching data for author_id: 49 (4/3345)
Data saved to api_results/49.csv
Fetching data for author_id: 51 (5/3345)
Data saved to api_results/51.csv
Fetching data for author_id: 57 (6/3345)
Data saved to api_results/57.csv
Fetching data for author_id: 58 (7/3345)
Data saved to api_results/58.csv
Fetching data for author_id: 60 (8/3345)
Data saved to api_results/60.csv
Fetching data for author_id: 63 (9/3345)
Data saved to api_results/63.csv
Fetching data for author_id: 64 (10/3345)
Data saved to api_results/64.csv
Fetching data for author_id: 66 (11/3345)
Data saved to api_results/66.csv
Fetching data for author_id: 69 (12/3345)
Data saved to api_results/69.csv
Fetching data

KeyboardInterrupt: 

In [30]:
# Merge the per-author CSV files from the default 'api_results' directory
combined_df = combine_csvs_to_dataframe(output_dir='api_results')

# Preview the first rows of the merged frame
print(combined_df.head(n=5))

Found 105 files in the directory
Error reading file api_results/1211.csv: No columns to parse from file
Error reading file api_results/234.csv: No columns to parse from file
Error reading file api_results/8781.csv: No columns to parse from file
Error reading file api_results/8888.csv: No columns to parse from file
Error reading file api_results/9629.csv: No columns to parse from file
Error reading file api_results/2145.csv: No columns to parse from file
Error reading file api_results/9207.csv: No columns to parse from file
Error reading file api_results/2955.csv: No columns to parse from file
Error reading file api_results/9433.csv: No columns to parse from file
Error reading file api_results/10144.csv: No columns to parse from file
Error reading file api_results/8884.csv: No columns to parse from file
Error reading file api_results/8893.csv: No columns to parse from file
Error reading file api_results/9391.csv: No columns to parse from file
Error reading file api_results/9082.csv: No 

In [31]:
# Write the merged results to disk without the row index
combined_df.to_csv(path_or_buf='combined_df.csv', index=False)

In [46]:
import ast

# Function to safely parse the opinions column
def parse_opinions(opinions):
    """
    Extract ``author_id`` from the first entry of a stringified opinions list.

    Parameters
    ----------
    opinions : object
        Expected to be the string representation of a list of dicts, e.g.
        ``"[{'author_id': 48, ...}]"``. Any other value (including NaN) is
        tolerated.

    Returns
    -------
    object or None
        The ``author_id`` value of the first list element, or ``None`` when
        the input cannot be parsed, is not a non-empty list, its first
        element is not a dict, or the key is absent.
    """
    try:
        # literal_eval only evaluates Python literals, never arbitrary code
        parsed_opinions = ast.literal_eval(opinions)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return None

    if not isinstance(parsed_opinions, list) or not parsed_opinions:
        return None

    first_opinion = parsed_opinions[0]
    # Guard against lists of non-dict literals, which have no .get()
    if not isinstance(first_opinion, dict):
        return None
    return first_opinion.get('author_id', None)

# Derive author_id from the first parsed opinion of each row and store it as
# a nullable integer column (values that are not numeric become <NA>)
combined_df['author_id'] = pd.to_numeric(
    combined_df['opinions'].apply(parse_opinions),
    errors='coerce',
).astype('Int64')

# Number of distinct authors present in the combined results
unique_author_ids = combined_df['author_id'].nunique()
print(f"Total unique author_ids: {unique_author_ids}")

# Number of distinct dockets per author
docket_id_counts = (
    combined_df
    .groupby('author_id')['docket_id']
    .nunique()
    .reset_index()
)
print(docket_id_counts.head())


Total unique author_ids: 89
   author_id  docket_id
0         48        232
1         49        160
2         51        199
3         57         28
4         58         18


In [47]:
# Write the per-author docket counts to disk without the row index
docket_id_counts.to_csv(path_or_buf='docket_id_counts.csv', index=False)