In [None]:
import requests
import csv
import pandas as pd
import json
import os
import geopandas as gpd
import geodatasets
import matplotlib.pyplot as plt
import numpy as np
import re
import datetime

## Dataset Acquisition

If you are not using the API to gather the data, skip this section entirely and go to the "Local file dataset usage" section below. Only one of the two approaches is required.

In [None]:
#   Source dataset: https://data.ca.gov/dataset/vehicle-fuel-type-count-by-zip-code
#   The resource ID for each year can be found within the CKAN API documentation for that year.
#   For this code to execute correctly, replace each 'TODO' value below with the resource ID corresponding to that year.
#   Resource IDs follow the convention of the regex expression "\w{8}\-\w{4}\-\w{4}\-\w{4}\-\w{12}"
#   ALTERNATIVELY, you can download the datasets and ignore this API portion;
#   in that case run the commented cell referring to local file acquisition instead.

resource_ids = {2024:'TODO',2023:'TODO',2022:'TODO',
                2021:'TODO',2020:'TODO',2019:'TODO'}

In [None]:
# Small probe request (10 rows) against the datastore_search endpoint.
# Replace <TODO> in the URL with a real resource ID before running.
url = 'https://data.ca.gov/api/3/action/datastore_search?limit=10&resource_id=<TODO>'
with requests.get(url) as response:
    # Fail loudly on an HTTP error instead of silently parsing an error payload into `data`.
    response.raise_for_status()
    data = response.json()

In [None]:
def get_dataset(resource_id, limit=10000, fetch=None):
    """Download every page of a datastore_search query for one resource.

    Pages are requested one after another, following the ``_links.next`` URL
    returned by the API, until a page comes back with fewer than ``limit``
    records (or no next link is provided).

    Parameters
    ----------
    resource_id : str
        Resource ID appended to the datastore_search query string.
    limit : int, default 10000
        Page size requested from the API; must be at least 1.
    fetch : callable, optional
        ``fetch(url) -> dict`` returning the decoded JSON payload for a URL.
        Defaults to an HTTP GET via ``requests`` that raises on HTTP errors.
        Mainly useful for offline testing.

    Returns
    -------
    list of dict
        The raw JSON payload of every page, in request order. Each payload is
        expected to hold its rows under ``['result']['records']``.

    Raises
    ------
    ValueError
        If ``limit`` is smaller than 1 (paging could never terminate).
    """
    if limit < 1:
        raise ValueError('limit must be at least 1')

    base_url = 'https://data.ca.gov'

    if fetch is None:
        def fetch(page_url):
            with requests.get(page_url) as response:
                # surface HTTP failures here rather than as a KeyError on 'result' later
                response.raise_for_status()
                return response.json()

    pages = []
    rows = 0
    page_url = f'{base_url}/api/3/action/datastore_search?limit={limit}&resource_id={resource_id}'

    while page_url:
        data = fetch(page_url)
        result = data['result']
        records = result['records']
        rows += len(records)
        pages.append(data)

        next_link = (result.get('_links') or {}).get('next')
        # A short page means the data is exhausted; stopping here avoids an extra
        # request that would only return an empty page.
        if len(records) < limit or not next_link:
            print(rows, 'rows collected total')
            break

        print(rows, 'rows collected')
        page_url = base_url + next_link

    return pages
    


def format_dataset_list(d):
    """Flatten the pages returned by ``get_dataset`` into one list of records.

    Parameters
    ----------
    d : list of dict
        Page payloads; each must hold its rows under ``['result']['records']``.

    Returns
    -------
    dict
        ``{'results': [...]}`` with the records of every page, in page order.
    """
    records = []
    for page in d:
        # extend() grows the list in place; rebuilding it with `+` on every page is quadratic
        records.extend(page['result']['records'])
    return {'results': records}



def dataset_to_df(d):
    """Build a typed DataFrame from the output of ``format_dataset_list``.

    The ``'Vehicles'`` field is cast to int, the columns are then renamed
    positionally, and ``date`` is parsed to datetime.

    NOTE(review): the positional rename assumes each record carries exactly
    eight fields in the order listed below - confirm against the API payload.

    Parameters
    ----------
    d : dict
        Mapping with the records under the ``'results'`` key.

    Returns
    -------
    pandas.DataFrame
    """
    renamed_columns = [
        'year_id', 'date', 'zipcode', 'modelyr',
        'fuel', 'make', 'duty', 'num_vehicles',
    ]

    frame = pd.DataFrame(d['results'])
    vehicle_counts = frame['Vehicles'].astype('int')
    frame['Vehicles'] = vehicle_counts
    frame.columns = renamed_columns
    frame['date'] = pd.to_datetime(frame['date'])
    return frame



def pipeline(resource_ids,limit = 50000, path='datasets/vehicle_fuel_types/', save_output=False):
    """Download, flatten and convert the dataset of every listed year.

    Parameters
    ----------
    resource_ids : dict
        Maps a year label to the resource ID to download for it.
    limit : int, default 50000
        Page size forwarded to ``get_dataset``.
    path : str, default 'datasets/vehicle_fuel_types/'
        Prefix prepended to ``<year>_vehicle_fuel_types.csv`` when saving.
    save_output : bool, default False
        When True, each year's DataFrame is also written to CSV.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``resource_ids``, in iteration order.
    """
    ret_list = []
    for key in resource_ids:
        print(f'working year {key}, resource_id {resource_ids[key]}')
        dataset = get_dataset(resource_ids[key], limit)
        d_list = format_dataset_list(dataset)
        df = dataset_to_df(d_list)
        if save_output:
            out_file = f'{path}{key}_vehicle_fuel_types.csv'
            # to_csv does not create missing folders, so make sure the target exists first
            out_dir = os.path.dirname(out_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            df.to_csv(out_file)
            print(f'{key} dataset saved to {out_file}')
        ret_list.append(df)
        print(f'year {key} idx={len(ret_list)-1} in ret\n')
    return ret_list



In [None]:
# To also save the datasets locally, change save_output to True.
# If you plan to use the API rather than the direct downloads, set it to True so the saved files
# can be used by the datamerge and distributions notebook.
veh_dfs = pipeline(resource_ids,save_output=False)

### !! Local file dataset usage !!

In [None]:
# If the files were downloaded directly from the source rather than fetched through the API,
# uncomment this code (remove the surrounding triple quotes) and do not run the API cells above.
# Replace each placeholder path with the local path of the corresponding dataset.
# NOTE(review): the template assigns column names twice (8 names, then 7); pandas rejects a
# name list whose length differs from the column count, so presumably only one of the two
# loops should be kept - confirm against the downloaded files.

"""
column_names = ['year_id', 'date', 'zipcode', 'modelyr', 'fuel', 'make', 'duty',
       'num_vehicles']

df2019 = pd.read_csv('REPLACE WITH LOCAL 2019 DATASET PATH', dtype={'Zip Code':str, 'Model Year': str,'Vehicles': int})
df2020 = pd.read_csv('REPLACE WITH LOCAL 2020 DATASET PATH', dtype={'Zip Code':str, 'Model Year': str,'Vehicles': int})
df2021 = pd.read_csv('REPLACE WITH LOCAL 2021 DATASET PATH', dtype={'Zip Code':str, 'Model Year': str,'Vehicles': int})
df2022 = pd.read_csv('REPLACE WITH LOCAL 2022 DATASET PATH', dtype={'Zip Code':str, 'Model Year': str,'Vehicles': int})
df2023 = pd.read_csv('REPLACE WITH LOCAL 2023 DATASET PATH', dtype={'Zip Code':str, 'Model Year': str,'Vehicles': int})
df2024 = pd.read_csv('REPLACE WITH LOCAL 2024 DATASET PATH', dtype={'Zip Code':str, 'Model Year': str,'Vehicles': int})
veh_dfs = [df2019,df2020,df2021,df2022,df2023,df2024]
veh_dfs.reverse()

for df_i in veh_dfs:
    df_i.columns = column_names

column_names = ['date', 'zipcode', 'modelyr', 'fuel', 'make', 'duty',
       'num_vehicles']
for df_i in veh_dfs:
    df_i.columns = column_names
"""

In [None]:
# Combined frame holding every year's rows.
df = pd.concat(veh_dfs)

# Per-year handles; relies on veh_dfs being ordered from 2024 (index 0) down to 2019 (index 5).
# NOTE(review): names of the form _<number> are also used by IPython for cell outputs - consider renaming.
_24, _23, _22, _21, _20, _19 = (veh_dfs[position] for position in range(6))

df['date'] = pd.to_datetime(df['date'])
df

In [None]:
# Distinct fuel values found in the combined frame.
fuels = df.fuel.unique()

# NOTE(review): a bare expression is only rendered when it is the last statement of a cell;
# this one is followed by a function definition in the same cell, so it displays nothing.
fuels
def get_classes(df):
    """Return the distinct values of the ``fuel``, ``make`` and ``duty`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``fuel``, ``make`` and ``duty`` columns.

    Returns
    -------
    tuple
        ``(fuels, makes, duties)``, each the result of ``Series.unique()``.
    """
    return tuple(df[column].unique() for column in ('fuel', 'make', 'duty'))

In [None]:
# Distinct fuel / make / duty values of the `_24` frame; note this rebinds `fuels`.
fuels,makes,duties = get_classes(_24)

In [None]:
# Distribution of total vehicle counts per zip code, all fuel types combined.
# NOTE(review): unlike the per-fuel plots, the 'OOS' zipcode is not filtered out here - confirm intended.
curr_df = _24
s = curr_df.groupby('zipcode').num_vehicles.sum()
x_low, x_high = s.min(), s.max()

# (title, histogram options) for each of the three side-by-side panels
panel_specs = [
    ('All Fuels', {'bins': 50}),
    ('All Fuels log', {'bins': 50, 'log': True}),
    (" cumulative", {'bins': 100, 'cumulative': True}),
]

fig, ax = plt.subplots(ncols=3, figsize=(24,5))
for panel, (panel_title, hist_options) in zip(ax, panel_specs):
    panel.hist(s, **hist_options)
    panel.set_title(panel_title)
    panel.grid()
    panel.set_xlim(x_low, x_high)

#plt.savefig('figs/allfuelsfig')


Plotting the distributions of vehicle counts for zip codes for each fuel type:

In [None]:
# One three-panel figure (linear, log, cumulative) per fuel type, saved under figs/.
yr = 2024

# savefig does not create missing folders, so make sure the output directory exists.
os.makedirs('figs', exist_ok=True)

for f in fuels:
    # per-zipcode totals for this fuel, excluding the 'OOS' zipcode
    s = curr_df[(curr_df['fuel'] == f) & (curr_df['zipcode'] != 'OOS')].groupby('zipcode').num_vehicles.sum()
    if s.empty:
        # nothing to plot: min()/max() would be NaN and set_xlim would raise
        continue

    x_low, x_high = s.min(), s.max()
    panel_specs = [
        (f'{yr} {f}', {'bins': 'auto'}),
        (f'{yr} log {f}', {'bins': 'auto', 'log': True}),
        (f'{yr} {f} cumulative', {'bins': 100, 'cumulative': True}),
    ]

    fig, ax = plt.subplots(ncols=3, figsize=(24,5))
    for panel, (panel_title, hist_options) in zip(ax, panel_specs):
        panel.hist(s, **hist_options)
        panel.set_title(panel_title)
        panel.grid()
        panel.set_xlim(x_low, x_high)

    plt.savefig("figs/"+f)
    plt.show()
    # release the figure explicitly so figures do not accumulate across iterations
    plt.close(fig)
    

In [None]:
# Preview the first rows of the `_20` frame.
_20.head()

Gathering the total number of vehicles registered on each date, for each fuel type (Battery Electric among them)

In [None]:
# Total vehicles per (fuel, date) pair, flattened back into ordinary columns.
fuel_date_totals = df.groupby(['fuel','date']).num_vehicles.sum()
tsdf = fuel_date_totals.to_frame().reset_index()
ts_fuels = tsdf['fuel'].unique()

Plotting total registrations over the relevant years for each fuel type

In [None]:
# One sub-frame of tsdf per fuel type, in the order of ts_fuels.
ts_l = [tsdf[tsdf.fuel == fuel_type] for fuel_type in ts_fuels]

# One small line chart per fuel type, with the y-range padded by a factor of 1.2 on each side.
for fuel_ts in ts_l:
    fig, ax = plt.subplots(ncols=1, figsize=(6,4))
    fuel_name = fuel_ts.fuel.unique()[0]
    counts = fuel_ts['num_vehicles']
    ax.set_ylim(counts.min()/1.2, counts.max()*1.2)
    ax.set_title(fuel_name)
    ax.plot(fuel_ts['date'], counts, label=fuel_name)

In [None]:
# GeoJSON geometry of the US states, taken from a public user dataset; refer to the readme for details.
# Replace the placeholder below with the actual source URL before running.
url_usa = 'geojson data source'
with requests.get(url_usa) as response:
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    us_geo = response.json()
    

In [None]:
# Total vehicles per zip code in the `_24` frame, without the 'OOS' zipcode.
zip_totals = _24.groupby('zipcode')['num_vehicles'].sum().reset_index()
geo_df = zip_totals.loc[zip_totals['zipcode'] != 'OOS']
geo_df

In [None]:


#import os
#os.system('jupyter nbconvert --to html vft.ipynb')

