In [None]:
import requests
import csv
import pandas as pd
import json
import os
import geopandas as gpd
import geodatasets
import matplotlib.pyplot as plt
import numpy as np
import re
import datetime
import statsmodels as sm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.discrete.discrete_model import Poisson
import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing
**NOTE:** If vehicle_fuel_types_eda.ipynb (VFT_EDA) has not been run yet, please run that notebook first; the datasets referenced below are generated by it.

The datasets used here are generally produced by the API calls in the VFT_EDA notebook. If you are using the directly downloaded versions instead, see the later cells for the preprocessing that applies to them.

In [None]:
# The datasets were redacted for security reasons at the request of SDG&E
# and are not uploaded to the repo. For inquiries about the datasets, see README.md
# or contact me/the supervisor for this section.

# Replace each placeholder with the local path of the vehicle fuel type dataset for that year.
# If no local datasets exist, see vehicle_fuel_types_eda.ipynb (API or direct download) for guidance.
read_columns = [2,3,4,5,6,7,8]

# One frame per year, read in chronological order with the same column subset.
_2019, _2020, _2021, _2022, _2023, _2024 = (
    pd.read_csv(dataset_path, usecols=read_columns)
    for dataset_path in (
        'TODO:2019 DATASET',
        'TODO:2020 DATASET',
        'TODO:2021 DATASET',
        'TODO:2022 DATASET',
        'TODO:2023 DATASET',
        'TODO:2024 DATASET',
    )
)

# Give every yearly frame the same short column names (assigned in place).
df_list = [_2019, _2020, _2021, _2022, _2023, _2024]
for year_frame in df_list:
    year_frame.columns = ['date', 'zipcode', 'modelyr', 'fuel', 'make', 'duty', 'num_vehicles']

# The 2019 frame has its 'Other' zipcode label rewritten to 'OOS'.
_2019['zipcode'] = _2019['zipcode'].replace('Other', 'OOS')

# Optional sanity check of yearly totals:
#_2019['num_vehicles'].sum(),_2020['num_vehicles'].sum() , _2021['num_vehicles'].sum(),_2022['num_vehicles'].sum(), _2023['num_vehicles'].sum(), _2024['num_vehicles'].sum(),

In [None]:
# Preview the first rows of the 2019 frame after the column rename above.
_2019.head()

In [None]:
#TESTING different preprocessing strategies

# Fuel labels that fix_alt() replaces with the single label 'not battery'.
non_ev_fuels = [
    'Gasoline',
    'Diesel and Diesel Hybrid',
    'Other',
    'Flex-Fuel',
    'Hybrid Gasoline',
    'Natural Gas',
    'Plug-in Hybrid',
    'Hydrogen Fuel Cell',
    'Unk',
]

def fix(df):
    """Sum battery-electric vehicle counts per zip code.

    The ``zipcode`` column of ``df`` is cast to ``str`` in place (the caller's
    frame is modified). Rows whose ``fuel`` equals ``'Battery Electric'`` are
    then selected and ``num_vehicles`` is summed per ``zipcode``.

    Returns a Series named ``num_vehicles`` indexed by ``zipcode``.
    """
    df['zipcode'] = df['zipcode'].astype(str)
    is_battery = df['fuel'] == 'Battery Electric'
    return df.loc[is_battery].groupby('zipcode')['num_vehicles'].sum()

def fix_alt(df):
    """Sum vehicle counts per (zipcode, fuel) after collapsing non-EV fuels.

    Both rewrites below happen in place on the caller's frame:
    ``zipcode`` is cast to ``str`` and every ``fuel`` label listed in
    ``non_ev_fuels`` is replaced with ``'not battery'``.

    Returns a Series of summed ``num_vehicles`` indexed by ``(zipcode, fuel)``.
    """
    df['zipcode'] = df['zipcode'].astype(str)
    df['fuel'] = df['fuel'].replace(non_ev_fuels, 'not battery')
    return df.groupby(['zipcode', 'fuel'])['num_vehicles'].sum()


In [None]:
# Aggregate each yearly frame to vehicle counts per (zipcode, fuel).
# Note that fix_alt also rewrites the zipcode and fuel columns of its input frame.
df19, df20, df21, df22, df23, df24 = (
    fix_alt(year_frame)
    for year_frame in (_2019, _2020, _2021, _2022, _2023, _2024)
)
dfl = [df19, df20, df21, df22, df23, df24]

In [None]:
# One column per year, rows aligned on the (zipcode, fuel) index of the yearly Series.
cols = [2019, 2020, 2021, 2022, 2023, 2024]

df = pd.concat(dfl, axis=1)
df.columns = cols

In [None]:
# Preview the combined frame: one column per year.
df.head()

## IF USING THE DOWNLOADED DATASETS, RUN THE CELLS BELOW. 
If you only plan to use the downloaded datasets, uncomment the last line of the next cell (`#df = df_nonapi`) so the rest of the notebook uses them.

In [None]:
# Testing the non-API (direct download) data for any discrepancies.
# See the VFT_EDA notebook for how to obtain the direct download datasets.
# !! If you are only using one source (API call OR direct download, not both), ignore this cell. !!

_19, _20, _21, _22, _23, _24 = (
    pd.read_csv(dataset_path)
    for dataset_path in (
        'TODO:2019 DATASET',
        'TODO:2020 DATASET',
        'TODO:2021 DATASET',
        'TODO:2022 DATASET',
        'TODO:2023 DATASET',
        'TODO:2024 DATASET',
    )
)
df_list = [_19, _20, _21, _22, _23, _24]

# Same short column names as the API frames (assigned in place).
for dataf in df_list:
    dataf.columns = ['date', 'zipcode', 'modelyr', 'fuel', 'make', 'duty', 'num_vehicles']

# The 2019 frame has its 'Other' zipcode label rewritten to 'OOS'.
_19['zipcode'] = _19['zipcode'].replace('Other', 'OOS')

# Per-year counts by (zipcode, fuel), in chronological order.
t19, t20, t21, t22, t23, t24 = (fix_alt(year_frame) for year_frame in df_list)

df_nonapi = pd.concat([t19, t20, t21, t22, t23, t24], axis=1)
df_nonapi.columns = cols
df_nonapi.head(10)

#df = df_nonapi

## Imputing 
Missing battery-electric counts are imputed, zip code by zip code, from the distribution implied by the number of registered non-EV vehicles in the same zip code.

In [None]:
# Imputing missing Battery Electric counts based on the distribution of all other cars.
#
# For every zip code that has a Battery Electric row with at least one missing year,
# the yearly counts of all fuel rows of that zip code are arranged as
# (rows = years, columns = fuel rows) and passed to IterativeImputer; the imputed
# Battery Electric column is written back into `df`.

year_cols = [2019, 2020, 2021, 2022, 2023, 2024]

# Flatten the (zipcode, fuel) index into columns only if that has not happened yet,
# so re-running this cell does not add a stray 'index' column.
if 'fuel' in df.index.names:
    df = df.reset_index()

df2 = df[df['fuel'] == 'Battery Electric']
for zipcode in df2['zipcode'].unique():  # do not name this `zip`: it would shadow the builtin
    both = df[df['zipcode'] == zipcode]
    is_battery = (both['fuel'] == 'Battery Electric').to_numpy()
    battery = both[is_battery]
    nonbat = both[~is_battery]

    # Only impute when there is a complete non-battery reference series ...
    if len(nonbat) < 1 or nonbat.isnull().values.any():
        continue
    # ... and the battery series actually has a gap to fill.
    if not battery.isnull().values.any():
        continue

    # A fresh imputer per zip code keeps every fit independent of the others.
    imp = IterativeImputer(max_iter=10, random_state=0)
    # Transpose so that each fuel row becomes a feature column and each year a sample.
    res = imp.fit_transform(both[year_cols].to_numpy(dtype=float).T)

    # Select the Battery Electric column by its position within `both` instead of
    # assuming there are exactly two fuel rows with Battery Electric first.
    battery_col = int(np.flatnonzero(is_battery)[0])
    # Write back by label and by year column rather than by positional slice.
    df.loc[battery.index[0], year_cols] = res[:, battery_col]

In [None]:
# Keep only the Battery Electric rows, indexed by zipcode, with one column per year.
df2 = (
    df[df['fuel'] == 'Battery Electric']
    .fillna(0)  # if there are no non-battery cars, fill with 0
    .set_index(['zipcode'])
    .drop(columns='fuel')
)
evs_cum_peryr = df2
evs_cum_peryr.head()

# Poisson distribution fitting
The net change in registrations from one year to the next (new registrations, or a decrease in some cases) is fitted, for each zip code, to a lambda value for a Poisson distribution.

In [None]:
# Year-over-year change in registrations per zip code. The 2019 column is dropped
# because it is the first column and therefore has no previous year to diff against.
ev_regs_peryr = df2.diff(axis=1).drop(columns=2019)

# Regressor: 1..N, one value per remaining year column (1..5 for 2020-2024).
time_index = list(range(1, ev_regs_peryr.shape[1] + 1))

# NOTE(review): exog is the bare time index with no constant column, so the value
# stored below is the fitted coefficient on that index, not a Poisson rate itself.
# Confirm this is the intended "lambda".
zip_lambs = {}
for zipcode, row in ev_regs_peryr.iterrows():
    model = Poisson(endog=row, exog=time_index).fit(disp=0)
    # params may be a label-indexed pandas Series; convert to an array before
    # taking the first element so this does not rely on positional Series[int].
    zip_lambs[zipcode] = np.asarray(model.params)[0]

lambdas_df = pd.DataFrame.from_dict(zip_lambs, orient='index', columns=['lambdas'])
lambdas_df.index = lambdas_df.index.set_names('zip code')

In [None]:
# Preview the fitted value stored for each zip code.
lambdas_df.head()

In [None]:
# Distribution of the fitted lambda values (zip codes without a fitted value are dropped).

plot_df = lambdas_df.dropna()
fig, ax = plt.subplots()
ax.hist(plot_df['lambdas'], bins=20)
plt.show()

In [None]:
# New EV registrations per year across all zips:
# sums the year-over-year change over every zip code and plots one value per year.
ev_regs_peryr.sum().plot()

# Combining AFDC datasets with the vehicle fuel types dataset and visualization
The datasets used here are generated by the AFDC EDA notebook; please run that notebook first. References to the specific datasets are given in the relevant cells.

In [None]:
# The data is pulled from the AFDC alternative fuel stations dataset API in !!geojson!! format.
# The parameters are state = CA and fuel_type = ELEC.
# See the AFDC EDA and exploration notebook for guidance on how to use the AFDC dataset.
# The final URL should look like:
# "https://developer.nrel.gov/api/alt-fuel-stations/v1.geojson?api_key=APIKEY&state=CA&fuel_type=ELEC"

# Prefer an environment variable so the key never has to be saved in the notebook;
# the 'TODO' placeholder is kept as a fallback for editing the key in by hand.
apikey = os.environ.get('NREL_API_KEY', 'TODO')
url = f'https://developer.nrel.gov/api/alt-fuel-stations/v1.geojson?api_key={apikey}&state=CA&fuel_type=ELEC'
url_geo_evonly = url

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports a bad key or API error as an HTTP error here rather
# than as a confusing parsing failure in a later cell.
with requests.get(url_geo_evonly, timeout=60) as response:
    response.raise_for_status()
    geodata = response.json()


In [None]:
# Build a GeoDataFrame from the fetched GeoJSON features, tagged with CRS EPSG:4326.
geodf = gpd.GeoDataFrame.from_features(geodata, crs = 4326)

In [None]:
# Zip code boundaries come from https://gis.data.ca.gov/datasets/CDEGIS::california-zip-codes/about
# Download the geojson and point `fp` at the local file, or use the export link below.
# The file also contains the zip code population, which is used in the visualization.
# NOTE(review): the `se=` parameter in the link looks like an expiry timestamp
# (2024-12-04) — confirm the link still works, otherwise use a local download.

#fp = 'TODO'
fp = 'https://stg-arcgisazurecdataprod3.az.arcgis.com/exportfiles-39966-259/ZipCodes_-1049704744535259894.geojson?sv=2018-03-28&sr=b&sig=8vnjMFNkWmcmve84YoHGzsaI%2F2KEFu4v9KGjuXwrXWs%3D&se=2024-12-04T03%3A52%3A50Z&sp=r'
zip_geodata = gpd.GeoDataFrame.from_file(fp)

# Expose the zip code under the same column name the vehicle data uses ('zipcode').
zip_geodata['zipcode'] = zip_geodata['ZIP_CODE']
zip_geodata = zip_geodata.drop(columns='ZIP_CODE')

In [None]:
# Uses the 2024 vehicle fuel type frame loaded earlier in this notebook
# (referenced in the VFT_EDA notebook as _2024 or _24).
# Result: one row per zipcode holding the summed Battery Electric num_vehicles.
df = (
    _2024[_2024['fuel'] == 'Battery Electric']
    .groupby('zipcode')['num_vehicles']
    .sum()
    .reset_index()
    .set_index('zipcode')
)

In [None]:
# Attach the 2024 Battery Electric counts to the zip code geometries.
# Rows of zip_geodata are matched on their 'zipcode' column against the index of `df`.
newdf = zip_geodata.join(df, on='zipcode')

# Per-capita view: drop rows with any missing value, keep zip codes with a
# population above 10, and take an explicit copy so the new column is added to
# an independent frame instead of a slice of `newdf`.
newdf1 = newdf.dropna()
newdf1 = newdf1[newdf1['POPULATION'] > 10].copy()
# num_vehicles / (POPULATION + 1), rounded to 4 decimals, then scaled by 1000.
newdf1['ev_per_capita'] = (newdf1['num_vehicles'] / (newdf1['POPULATION'] + 1)).round(4) * 1000


In [None]:
# Choropleth of the number of EV registrations in each zip code, lighter = more, darker = less,
# with the EV charging stations drawn on top.
m = newdf.explore(
    column='num_vehicles',
    scheme='stdmean',
    k=6,
    popup=['PO_NAME', 'zipcode', 'num_vehicles'],
)

# Display options for the charging-station layer.
station_layer_kwds = {
    'marker_kwds': {'radius': 3, 'fill': False},
    'color': 'Orange',
    'tooltip': 'street_address',
    'tooltip_kwds': {'labels': False},
    'column': 'fuel_type_code',
}

# Last expression of the cell: draws the stations onto `m` and displays the map.
geodf.explore(m=m, **station_layer_kwds)

In [None]:
# Choropleth of EV registrations per capita in each zip code, lighter = more, darker = less,
# with the EV charging stations drawn on top.
m = newdf1.explore(column='ev_per_capita', scheme='stdmean', k=10)

# Display options for the charging-station layer.
station_layer_kwds = {
    'marker_kwds': {'radius': 3, 'fill': False},
    'color': 'Orange',
    'tooltip': 'street_address',
    'tooltip_kwds': {'labels': False},
    'column': 'fuel_type_code',
}

# Last expression of the cell: draws the stations onto `m` and displays the map.
geodf.explore(m=m, **station_layer_kwds)