### Notebook to fetch docket data

In [2]:
# Importing libraries
import requests
import json
import csv
import string
from utils import api_token

In [3]:
# Function to create a session with the API token
def make_session():
    s = requests.session()
    s.headers.update({"Authorization": f"Token {api_token}"})
    return s

In [4]:
# Function to fetch data from the API endpoint with pagination handling for a limited number of pages
def fetch_data(url,start_page,max_pages):
    session = make_session()
    all_data = []
    page_count = 0
    
    while url and page_count < max_pages:
        response = session.get(url)
        
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
        
        data = response.json()
        all_data.extend(data['results']) # append data from the current page to the list

        if 'next' in data and data['next'] is not None: # next is the key for the next page
            url = data['next']
            page_count += 1
        else:
            break
    
    # testing: save data to a json file
    with open('testing_data_fetch.json', 'w') as f:
        json.dump(all_data, f)
    
    print(f"Total number of pages fetched: {page_count} starting from page {start_page}")
    return url, all_data

In [9]:
# Function to process and flatten the data
def process_data(data):
    processed_data = []  # processed data is a list of elements, each element is a dictionary
    for item in data: # data is the list of "results" from the API response; each item in the list is an "docket" 
        entry = {
            "resource_uri": item["resource_uri"],
            "id": item["id"],
            "court": item["court"], 
            "court_id": item["court_id"],
            "original_court_info": item["original_court_info"],
            "idb_data": item["idb_data"],
            "assigned_to": item["assigned_to"],
            "referred_to": item["referred_to"],
            "absolute_url": item["absolute_url"],
            "date_created": item["date_created"],
            "date_modified": item["date_modified"],
            "source": item["source"],
            "appeal_from_str": item["appeal_from_str"],
            "assigned_to_str": item["assigned_to_str"],
            "referred_to_str": item["referred_to_str"],
            "panel_str": item["panel_str"],
            "date_last_index": item["date_last_index"],
            "date_cert_granted": item["date_cert_granted"],
            "date_cert_denied": item["date_cert_denied"],   
            "date_argued": item["date_argued"],
            "date_reargued": item["date_reargued"],
            "date_reargument_denied": item["date_reargument_denied"],
            "date_filed": item["date_filed"],
            "date_terminated": item["date_terminated"], 
            "date_last_filing": item["date_last_filing"],
            "case_name_short": item["case_name_short"],
            "case_name": item["case_name"],
            "case_name_full": item["case_name_full"],
            "slug": item["slug"],
            "docket_number": item["docket_number"],
            "docket_number_core": item["docket_number_core"],
            "federal_dn_office_code": item["federal_dn_office_code"],
            "federal_dn_case_type": item["federal_dn_case_type"],
            "federal_dn_judge_initials_assigned": item["federal_dn_judge_initials_assigned"],
            "federal_dn_judge_initials_referred": item["federal_dn_judge_initials_referred"],
            "federal_defendant_number": item["federal_defendant_number"],
            "pacer_case_id": item["pacer_case_id"],
            "cause": item["cause"],
            "nature_of_suit": item["nature_of_suit"],
            "jury_demand": item["jury_demand"],
            "jurisdiction_type": item["jurisdiction_type"],
            "appellate_fee_status": item["appellate_fee_status"],
            "appellate_case_type_information": item["appellate_case_type_information"],
            "mdl_status": item["mdl_status"],
            "filepath_ia": item["filepath_ia"],
            "filepath_ia_json": item["filepath_ia_json"],
            "ia_upload_failure_count": item["ia_upload_failure_count"],
            "ia_needs_upload": item["ia_needs_upload"],
            "ia_date_first_change": item["ia_date_first_change"],
            "date_blocked": item["date_blocked"],
            "blocked": item["blocked"],
            "appeal_from": item["appeal_from"],
            "parent_docket": item["parent_docket"],
            "tags": item["tags"],
            "panel": item["panel"]
        }
        processed_data.append(entry)
    print(f"Total number of entries processed: {len(processed_data)}")
    return processed_data

In [11]:
# Function to save the data to a CSV file
def save_to_csv(data, filename):
    with open(filename, 'w', newline='') as csvfile: # check fields above - should be the same
        fieldnames = data[0].keys() # get the keys of the first element in the list
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for item in data:
            writer.writerow(item)
    print(f"Data saved to {filename}")

In [12]:
# Define the URL
api_name = "dockets"
base_url = f"https://www.courtlistener.com/api/rest/v4/{api_name}/" # updated the API to v4 from v3

# Fetch data for 50 pages
start_page = 1
max_pages = 50
while start_page <= max_pages: 
    url = base_url
    base_url, data = fetch_data(url,start_page, max_pages)
    processed_data = process_data(data)
    save_to_csv(processed_data, f"{api_name}_{start_page}-{start_page + max_pages}.csv")
    print(f"Fetched, processed and saved data for pages {start_page} to {start_page + max_pages}!")
    start_page += max_pages

Total number of pages fetched: 50 starting from page 1
Total number of entries processed: 1000
Data saved to educations_1-51.csv
Fetched, processed and saved data for pages 1 to 51!
