In [1]:
# etl.ipynb
import json
import os
import urllib.parse
import urllib.request
import warnings
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests

In [5]:
# Part 1
# This chunk downloads all the DMV registration data from 2019-2024, keeps each year's
# data frame in a dictionary, and then combines them into a single CSV file

def fetch_data(resource_id, offset, chunk_size, timeout=60):
    """Fetch one page of records for a resource from the data.ca.gov datastore API.

    Args:
        resource_id: ID of the datastore resource to query.
        offset: Number of records to skip before the first returned record.
        chunk_size: Maximum number of records to request (sent as ``limit``).
        timeout: Seconds to wait on the connection before giving up, so a
            stalled request cannot block the caller indefinitely.

    Returns:
        The ``records`` entry found under ``result`` in the JSON response.

    Raises:
        urllib.error.URLError: If the request fails (a timeout may also surface
            as ``TimeoutError``).
        KeyError: If the response JSON has no ``result``/``records`` entry.
    """
    # Let urlencode escape the values rather than splicing them into the URL by hand.
    query = urllib.parse.urlencode(
        {"resource_id": resource_id, "limit": chunk_size, "offset": offset}
    )
    url = f"https://data.ca.gov/api/3/action/datastore_search?{query}"
    with urllib.request.urlopen(url, timeout=timeout) as fileobj:
        response_dict = json.loads(fileobj.read())
    return response_dict['result']['records']

# data.ca.gov resource IDs paired with the registration year each dataset covers.
# Entries are listed newest-first; the dict keeps that insertion order.
resource_year_map = dict([
    ("d599c3d3-87af-4e8c-8694-9c01f49e3d93", 2024),
    ("9aa5b4c5-252c-4d68-b1be-ffe19a2f1d26", 2023),
    ("1856386b-a196-4e7c-be81-44174e29ad50", 2022),
    ("888bbb6c-09b4-469c-82e6-1b2a47439736", 2021),
    ("4254a06d-9937-4083-9441-65597dd267e8", 2020),
    ("d304108a-06c1-462f-a144-981dd0109900", 2019),
])

def retrieve_data_for_resource(resource_id, chunk_size=50000, total_records=1000000, max_workers=10):
    """Download a dataset in parallel chunks and return it as one DataFrame.

    Args:
        resource_id: Key of ``resource_year_map`` identifying the dataset.
        chunk_size: Number of records requested per call to ``fetch_data``.
        total_records: Upper bound on the offsets requested. Offsets are
            ``range(0, total_records, chunk_size)``, so records beyond this
            bound are never fetched.
        max_workers: Number of threads used to fetch chunks concurrently.

    Returns:
        A DataFrame built from all fetched records, with a ``'Zip Code'``
        column (if present) renamed to ``'ZIP Code'`` and a ``'Date'`` column
        set to the year mapped to ``resource_id``.

    Raises:
        KeyError: If ``resource_id`` is not in ``resource_year_map``. This is
            raised before any request is made.
    """
    # Resolve the year first so an unknown resource_id fails before any download starts.
    year = resource_year_map[resource_id]

    offsets = range(0, total_records, chunk_size)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in the order of `offsets`, so row order is stable.
        chunks = list(executor.map(lambda offset: fetch_data(resource_id, offset, chunk_size), offsets))

    # A completely full final chunk suggests the dataset continues past total_records.
    if chunks and len(chunks[-1]) >= chunk_size:
        warnings.warn(
            f"Resource {resource_id} may be truncated: the last chunk requested was full. "
            "Increase total_records to fetch the remaining rows."
        )

    records = [record for chunk in chunks for record in chunk]
    df = pd.DataFrame(records)

    # Standardize the ZIP Code column name
    if 'Zip Code' in df.columns:
        df = df.rename(columns={'Zip Code': 'ZIP Code'})

    # Overwrite (or create) the 'Date' column with the year for this resource
    df['Date'] = year

    return df

# List of resource IDs for the datasets you want to fetch
resource_ids = list(resource_year_map.keys())

# Dictionary to store DataFrames for each dataset, keyed by resource ID
DMV_dfs = {}

# Loop through each resource ID and retrieve data
for resource_id in resource_ids:
    print(f"Fetching data for resource ID: {resource_id}")
    DMV_dfs[resource_id] = retrieve_data_for_resource(resource_id)

# Concatenate all DataFrames into a single DataFrame
dmv_data = pd.concat(DMV_dfs.values(), ignore_index=True)

# to_csv does not create missing directories, so make sure data/ exists first
os.makedirs('data', exist_ok=True)

# Save the data to a CSV file (or you could use pickle for smaller file sizes)
dmv_data.to_csv('data/DMV_data.csv', index=False)
print("Data has been saved to DMV_data.csv")

Fetching data for resource ID: d599c3d3-87af-4e8c-8694-9c01f49e3d93
Fetching data for resource ID: 9aa5b4c5-252c-4d68-b1be-ffe19a2f1d26
Fetching data for resource ID: 1856386b-a196-4e7c-be81-44174e29ad50
Fetching data for resource ID: 888bbb6c-09b4-469c-82e6-1b2a47439736
Fetching data for resource ID: 4254a06d-9937-4083-9441-65597dd267e8
Fetching data for resource ID: d304108a-06c1-462f-a144-981dd0109900
Data has been saved to DMV_data.csv


In [3]:
# Part 2: Fetch AFDC Alternative Fuel Stations Data

# Define the AFDC API
# The key is read from the environment so it is never stored in the notebook.
# NOTE(review): a key that was ever committed in this cell should be treated as
# exposed and regenerated.
api_key = os.environ.get('NREL_API_KEY')
if not api_key:
    raise RuntimeError('Set the NREL_API_KEY environment variable to your NREL developer API key')
url = 'https://developer.nrel.gov/api/alt-fuel-stations/v1.json'

# Request data from the AFDC API; requests builds the query string from `params`,
# and the timeout keeps a stalled connection from hanging the kernel.
response = requests.get(url, params={'api_key': api_key}, timeout=60)

# Check if request was successful
if response.status_code == 200:
    data = response.json()
    fuel_stations = data['fuel_stations']

    # Convert to pandas DataFrame, flattening nested fields into columns
    afdc_data = pd.json_normalize(fuel_stations)

    # to_csv does not create missing directories, so make sure data/ exists first
    os.makedirs('data', exist_ok=True)

    # Save the AFDC data to a CSV file
    afdc_data.to_csv('data/fuel_stations.csv', index=False)
    print('Fuel stations data has been saved to fuel_stations.csv')
else:
    print(f'There was an error fetching AFDC data: {response.status_code}')

Fuel stations data has been saved to fuel_stations.csv
