In [1]:
import requests
import json
import csv
from utils import api_token

In [2]:
# Function to create a session with the API token
def make_session():
    s = requests.session()
    s.headers.update({"Authorization": f"Token {api_token}"})
    return s

In [3]:
# Function to fetch data from the API endpoint with pagination handling for a limited number of pages
def fetch_data(url,start_page,max_pages):
    session = make_session()
    all_data = []
    page_count = 0
    
    while url and page_count < max_pages:
        response = session.get(url)
        
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
        
        data = response.json()
        all_data.extend(data['results']) # append data from the current page to the list

        if 'next' in data and data['next'] is not None: # next is the key for the next page
            url = data['next']
            page_count += 1
        else:
            break
    
    # testing: save data to a json file
    with open('testing_data_fetch.json', 'w') as f:
        json.dump(all_data, f)
    
    print(f"Total number of pages fetched: {page_count} starting from page {start_page}")
    return url, all_data

In [4]:
# Function to process and flatten the data
def process_data(data):
    processed_data = []  # processed data is a list of elements, each element is a dictionary
    for item in data: # data is the list of "results" from the API response; each item in the list is an "educational experience" 
        school_data = item['school'] # school is a dictionary inside the educational experience dictionary
        # Create a dictionary for the processed data
        entry = {
            'education_id': item['id'],
            'school_id': school_data['id'],
            'school_name': school_data['name'],
            'person_url': item['person'],
            'degree_level': item['degree_level'],
            'degree_detail': item['degree_detail'],
            'degree_year': item['degree_year'],
        }
        processed_data.append(entry)
    print(f"Total number of entries processed: {len(processed_data)}")
    return processed_data

In [5]:
# Function to save the data to a CSV file
def save_to_csv(data, filename):
    with open(filename, 'w', newline='') as csvfile: # check fields above - should be the same
        fieldnames = ['education_id', 'school_id', 'school_name', 'person_url', 'degree_level', 'degree_detail', 'degree_year']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for row in data:
            writer.writerow(row)
        print(f"Data saved to {filename}")

In [6]:
# Define the URL
base_url = "https://www.courtlistener.com/api/rest/v4/educations/" # updated the API to v4 from v3

# For positions, we know there are 639 pages of data
# We want to fetch data in csv files, 100 pages of positions per csv file
start_page = 1
max_pages = 100
while start_page <= 640: # 639 is the total number of pages
    url = base_url
    base_url, data = fetch_data(url,start_page, max_pages)
    processed_data = process_data(data)
    save_to_csv(processed_data, f"educations_{start_page}-{start_page + max_pages}.csv")
    print(f"Fetched, processed and saved data for pages {start_page} to {start_page + max_pages}!")
    start_page += max_pages

Total number of pages fetched: 100 starting from page 1
Total number of entries processed: 2000
Data saved to educations_1-101.csv
Fetched, processed and saved data for pages 1 to 101!
Total number of pages fetched: 100 starting from page 101
Total number of entries processed: 2000
Data saved to educations_101-201.csv
Fetched, processed and saved data for pages 101 to 201!
Error: 502, <!DOCTYPE html>
<html lang="en"><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
  <meta charset="utf-8">
  <meta http-equiv="Content-Language" content="en">
  <meta name="language" content="en_us">
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <link href="/errors_5xx/error-assets/font-awesome.css" rel="stylesheet">

  <title>Yikes, something went wrong â CourtListener.com</title>

  <link rel="stylesheet" href="/errors_5xx/error-assets/bootstrap.css" type="text/css">
  <link rel="stylesheet" href="/errors_5xx/error-assets/override.css" type="text/css"