### Notebook to fetch docket data

In [1]:
# Importing libraries
import requests
import json
import csv
import string
from utils import api_token
import time

In [2]:
# Function to create a session with the API token
def make_session():
    s = requests.session()
    s.headers.update({"Authorization": f"Token {api_token}"})
    return s

In [3]:
# Function to fetch data from the API endpoint with pagination handling for a limited number of pages
def fetch_data(url, start_page, max_pages, request_count):
    session = make_session()
    all_data = []
    page_count = 0
    
    while url and page_count < max_pages:
        if request_count >= 5000:
            print(f"API limit reached. Pausing for 1 hour...")
            time.sleep(3600)  # Sleep for 1 hour
            request_count = 0  # Reset the request count after sleeping

        response = session.get(url)
        request_count += 1  # Increment the request count for each request made
        
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break

        data = response.json()
        all_data.extend(data['results'])

        if 'next' in data and data['next'] is not None:
            url = data['next']
            page_count += 1
        else:
            break
    
    with open(f'testing_data_fetch_page_{start_page}.json', 'w') as f:
        json.dump(all_data, f)

    print(f"Total number of pages fetched: {page_count} starting from page {start_page}")
    return url, all_data, request_count


In [4]:
# Function to process and flatten the data
def process_data(data):
    processed_data = []  
    for item in data:
        entry = {key: item.get(key, None) for key in item}  # Flatten the dictionary
        processed_data.append(entry)
    print(f"Total number of entries processed: {len(processed_data)}")
    return processed_data

In [5]:
# Function to save the data to a CSV file
def save_to_csv(data, filename):
    with open(filename, 'w', newline='') as csvfile: 
        fieldnames = data[0].keys() # get the keys of the first element in the list
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for item in data:
            writer.writerow(item)
    print(f"Data saved to {filename}")

In [6]:
# Define the URL and parameters
api_name = "dockets"
base_url = f"https://www.courtlistener.com/api/rest/v4/{api_name}/"
start_page = 105101
max_pages = 4000 # old value was 100 and then 1000

In [10]:
# Changing max pages to 5000 to get more data per csv
max_pages = 5000

# print current page range to keep track of progress
print(f"Fetching data for pages {start_page} to {start_page + max_pages}...")

# Initialize the request count to keep track of the number of requests made
request_count = 0  # Update this value based on the number of requests made so far

# Initialize the loop to fetch data for multiple pages
while start_page <= 3400000:  # Upper limit for pages is around 3.3M
    url = base_url
    base_url, data, request_count = fetch_data(url, start_page, max_pages, request_count)
    processed_data = process_data(data)
    save_to_csv(processed_data, f"{api_name}_{start_page}-{start_page + max_pages}.csv")
    print(f"Fetched, processed, and saved data for pages {start_page} to {start_page + max_pages}!")
    start_page += max_pages

Fetching data for pages 120101 to 125101...
Error: 502, <!DOCTYPE html>
<html lang="en"><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
  <meta charset="utf-8">
  <meta http-equiv="Content-Language" content="en">
  <meta name="language" content="en_us">
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <link href="/errors_5xx/error-assets/font-awesome.css" rel="stylesheet">

  <title>Yikes, something went wrong â CourtListener.com</title>

  <link rel="stylesheet" href="/errors_5xx/error-assets/bootstrap.css" type="text/css">
  <link rel="stylesheet" href="/errors_5xx/error-assets/override.css" type="text/css" media="screen, projection">
</head>

<body class="">
<div class="container round-bottom">
  <header class="row">
    <!-- Static navbar -->
    <div class="navbar navbar-default" role="navigation">
      <div class="container-fluid">
        <div class="navbar-header">
          <a class="navbar-brand" href="https://www.courtlist

IndexError: list index out of range

In [9]:
print(f'Current request count is {request_count} and start page is {start_page}')

Current request count is 4651 and start page is 120101
