In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.data_preprocessing import datetime_processing, userinput_processing, holiday_processing

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score

In [2]:
import os
os.chdir('..')
os.getcwd()

'/Users/varunvenkatesh/Documents/GitHub/ev_charging'

In [3]:
df = pd.read_parquet('data/sessions/caltech/').reset_index(drop=True)
df = datetime_processing(df)
df = userinput_processing(df)
df = holiday_processing(df)

In [4]:
df.head()

Unnamed: 0,_id,sessionID,stationID,spaceID,siteID,clusterID,connectionTime,disconnectTime,kWhDelivered,doneChargingTime,...,doneChargingTimeDay,userID,milesRequested,WhPerMile,minutesAvailable,modifiedAt,paymentRequired,requestedDeparture,kWhRequested,is_holiday
0,5bc9160af9af8b0dad3c0690,2_39_95_27_2018-05-01 13:23:18.374684,2-39-95-27,CA-319,2,39,2018-05-01 13:23:18+00:00,2018-05-02 01:53:41+00:00,13.41,2018-05-01 17:17:07+00:00,...,1.0,,,,,,,,,False
1,5bc9160af9af8b0dad3c0691,2_39_79_380_2018-05-01 13:54:33.599590,2-39-79-380,CA-489,2,39,2018-05-01 13:54:34+00:00,2018-05-01 23:15:35+00:00,8.618,2018-05-01 15:35:39+00:00,...,1.0,,,,,,,,,False
2,5bc9160af9af8b0dad3c0692,2_39_139_28_2018-05-01 14:46:04.441389,2-39-139-28,CA-303,2,39,2018-05-01 14:46:04+00:00,2018-05-02 00:54:07+00:00,13.969,2018-05-01 23:57:02+00:00,...,1.0,,,,,,,,,False
3,5bc9160af9af8b0dad3c0693,2_39_79_378_2018-05-01 14:56:28.908623,2-39-79-378,CA-326,2,39,2018-05-01 14:56:29+00:00,2018-05-01 21:52:45+00:00,6.914,2018-05-01 16:07:56+00:00,...,1.0,,,,,,,,,False
4,5bc9160af9af8b0dad3c0694,2_39_88_24_2018-05-01 15:14:50.443137,2-39-88-24,CA-314,2,39,2018-05-01 15:14:50+00:00,2018-05-01 22:51:30+00:00,13.785,2018-05-01 20:18:12+00:00,...,1.0,,,,,,,,,False


In [5]:
def get_results(y_test, prediction):
    cm = confusion_matrix(y_test,prediction)
    make_classification_plot(cm)
    
    results = {'tpr': cm[1, 1]/np.sum(cm[1]),
               'fpr': cm[0,1]/np.sum(cm[0]),
               'accuracy': accuracy_score(y_test, prediction),
               'precision': precision_score(y_test, prediction),
               'recall': recall_score(y_test, prediction),
        'f1':f1_score(y_test,prediction)}
    return results

In [6]:
def make_classification_plot(cm):
    disp = ConfusionMatrixDisplay(cm)
    disp = disp.plot(include_values=True, cmap='viridis', ax=None, xticks_rotation='horizontal')
    plt.grid(False)
    plt.show()

In [7]:
results = {}

In [8]:
df['connectionTime'].describe()

  df['connectionTime'].describe()


count                         31125
unique                        31081
top       2018-08-28 15:54:35+00:00
freq                              8
first     2018-05-01 13:23:18+00:00
last      2021-09-13 06:31:02+00:00
Name: connectionTime, dtype: object

In [9]:
df['spaceID'].value_counts()

CA-303      1792
CA-305      1399
CA-307      1136
11900388    1126
CA-315      1086
CA-311      1042
CA-309      1022
CA-304       994
CA-327       932
CA-313       908
CA-489       892
CA-321       862
CA-317       846
CA-325       820
CA-306       789
CA-323       759
CA-319       737
CA-314       694
CA-492       683
CA-312       665
CA-326       645
CA-320       615
CA-308       612
CA-490       587
CA-324       581
CA-322       578
CA-316       567
CA-310       561
CA-318       558
CA-494       533
CA-493       529
CA-491       506
CA-498       503
CA-495       458
CA-499       452
CA-496       452
CA-500       432
CA-497       396
CA-502       332
CA-501       330
CA-503       247
CA-212       200
CA-512       175
CA-504       149
CA-510       136
CA-506       124
CA-508       120
CA-513       108
CA-511       104
CA-507        90
CA-509        88
CA-149        76
CA-505        74
CA-213        15
CA-148         8
Name: spaceID, dtype: int64

In [27]:
tmp = df.copy()
tmp = tmp[tmp['spaceID'] == 'CA-303'].sort_index()

In [28]:
y = pd.DataFrame(index=pd.date_range('2018-05-1','2021-09-13', inclusive='both', freq='h', tz=0),columns=['is_available','sessionID'])
y['is_available'] = 1
tmp.reset_index(inplace=True)
for i in list(tmp.index):
    start_ = tmp.loc[i,'connectionTime']
    end_ = tmp.loc[i,'disconnectTime'] 
    session_ = tmp.loc[i,'sessionID']
    y.loc[start_:end_,['is_available','sessionID']] = 0, session_

In [32]:
y.head()

Unnamed: 0,is_available,sessionID
2018-05-01 00:00:00+00:00,1,
2018-05-01 01:00:00+00:00,1,
2018-05-01 02:00:00+00:00,1,
2018-05-01 03:00:00+00:00,1,
2018-05-01 04:00:00+00:00,1,
2018-05-01 05:00:00+00:00,1,
2018-05-01 06:00:00+00:00,1,
2018-05-01 07:00:00+00:00,1,
2018-05-01 08:00:00+00:00,1,
2018-05-01 09:00:00+00:00,1,


In [13]:
y.drop(columns='sessionID', inplace=True)

In [14]:
y['is_available'].value_counts()

1    20671
0     8874
Name: is_available, dtype: int64

In [53]:
caiso_df = pd.read_csv('data/caiso_dataset_5min.csv')
caiso_df['datetime'] = pd.to_datetime(caiso_df['date'] + ' ' + caiso_df['Time'], utc=True)
caiso_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354816 entries, 0 to 354815
Data columns (total 38 columns):
 #   Column                                   Non-Null Count   Dtype              
---  ------                                   --------------   -----              
 0   date                                     354816 non-null  object             
 1   Time                                     354816 non-null  object             
 2   Day ahead forecast                       354768 non-null  float64            
 3   Hour ahead forecast                      354792 non-null  float64            
 4   Current demand                           354761 non-null  float64            
 5   Net demand                               354185 non-null  float64            
 6   Net demand forecast                      8064 non-null    float64            
 7   Solar                                    354471 non-null  float64            
 8   Wind                                     354759 non-nu

In [56]:
caiso_df = caiso_df.set_index('datetime')
caiso_hourly = caiso_df.groupby(pd.Grouper(freq='1h')).mean()
caiso_hourly.index.tz_localize(None)
caiso_hourly.head()

  caiso_hourly = caiso_df.groupby(pd.Grouper(freq='1h')).mean()


Unnamed: 0_level_0,Day ahead forecast,Hour ahead forecast,Current demand,Net demand,Net demand forecast,Solar,Wind,Geothermal,Biomass,Biogas,...,Geothermal CO2,Demand,Net Demand,Day-ahead demand forecast,Day-ahead net demand forecast,Resource adequacy capacity forecast,Net resource adequacy capacity forecast,Reserve requirement,Reserve requirement forecast,Resource adequacy credits
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-05-01 00:00:00+00:00,21013.666667,21496.5,21378.5,17906.0,,0.0,3503.583333,938.083333,318.5,241.916667,...,9.083333,,,,,,,,,
2018-05-01 01:00:00+00:00,20133.083333,20673.916667,20644.583333,17251.333333,,0.0,3418.0,937.666667,314.333333,242.416667,...,10.0,,,,,,,,,
2018-05-01 02:00:00+00:00,19582.666667,20174.083333,20089.166667,16399.916667,,0.0,3703.166667,936.666667,317.166667,242.416667,...,10.0,,,,,,,,,
2018-05-01 03:00:00+00:00,19423.5,19905.083333,19822.25,15958.416667,,0.0,3865.833333,935.833333,320.5,242.083333,...,10.0,,,,,,,,,
2018-05-01 04:00:00+00:00,19879.583333,20325.833333,20321.166667,16600.583333,,0.0,3686.416667,936.0,314.25,242.25,...,10.0,,,,,,,,,


In [46]:
print(len(caiso_hourly))

24


In [57]:
X = pd.DataFrame(index=pd.date_range('2018-05-1','2021-09-13', inclusive='both', freq='h', tz=0),columns=['dow','hour','month'])
X = X.join(caiso_hourly)

In [58]:
X.head()

Unnamed: 0,dow,hour,month,Day ahead forecast,Hour ahead forecast,Current demand,Net demand,Net demand forecast,Solar,Wind,...,Geothermal CO2,Demand,Net Demand,Day-ahead demand forecast,Day-ahead net demand forecast,Resource adequacy capacity forecast,Net resource adequacy capacity forecast,Reserve requirement,Reserve requirement forecast,Resource adequacy credits
2018-05-01 00:00:00+00:00,,,,21013.666667,21496.5,21378.5,17906.0,,0.0,3503.583333,...,9.083333,,,,,,,,,
2018-05-01 01:00:00+00:00,,,,20133.083333,20673.916667,20644.583333,17251.333333,,0.0,3418.0,...,10.0,,,,,,,,,
2018-05-01 02:00:00+00:00,,,,19582.666667,20174.083333,20089.166667,16399.916667,,0.0,3703.166667,...,10.0,,,,,,,,,
2018-05-01 03:00:00+00:00,,,,19423.5,19905.083333,19822.25,15958.416667,,0.0,3865.833333,...,10.0,,,,,,,,,
2018-05-01 04:00:00+00:00,,,,19879.583333,20325.833333,20321.166667,16600.583333,,0.0,3686.416667,...,10.0,,,,,,,,,


In [59]:
X['dow'] = X.index.dayofweek
X['hour'] = X.index.hour
X['month'] = X.index.month
X['connectionTime'] = X.index
X = holiday_processing(X).drop(columns=['connectionTime'])
X.head()

Unnamed: 0,dow,hour,month,Day ahead forecast,Hour ahead forecast,Current demand,Net demand,Net demand forecast,Solar,Wind,...,Demand,Net Demand,Day-ahead demand forecast,Day-ahead net demand forecast,Resource adequacy capacity forecast,Net resource adequacy capacity forecast,Reserve requirement,Reserve requirement forecast,Resource adequacy credits,is_holiday
2018-05-01 00:00:00+00:00,1,0,5,21013.666667,21496.5,21378.5,17906.0,,0.0,3503.583333,...,,,,,,,,,,False
2018-05-01 01:00:00+00:00,1,1,5,20133.083333,20673.916667,20644.583333,17251.333333,,0.0,3418.0,...,,,,,,,,,,False
2018-05-01 02:00:00+00:00,1,2,5,19582.666667,20174.083333,20089.166667,16399.916667,,0.0,3703.166667,...,,,,,,,,,,False
2018-05-01 03:00:00+00:00,1,3,5,19423.5,19905.083333,19822.25,15958.416667,,0.0,3865.833333,...,,,,,,,,,,False
2018-05-01 04:00:00+00:00,1,4,5,19879.583333,20325.833333,20321.166667,16600.583333,,0.0,3686.416667,...,,,,,,,,,,False


In [16]:
print(f'charger CA-303 is available {np.round(y.mean()*100,3)}% of the time')

charger CA-303 is available is_available    69.964
dtype: float64% of the time


In [106]:
X.fillna(0.0, inplace=True)

In [108]:
X.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 29545 entries, 2018-05-01 00:00:00+00:00 to 2021-09-13 00:00:00+00:00
Freq: H
Data columns (total 39 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   dow                                      29545 non-null  int64  
 1   hour                                     29545 non-null  int64  
 2   month                                    29545 non-null  int64  
 3   Day ahead forecast                       29545 non-null  float64
 4   Hour ahead forecast                      29545 non-null  float64
 5   Current demand                           29545 non-null  float64
 6   Net demand                               29545 non-null  float64
 7   Net demand forecast                      29545 non-null  float64
 8   Solar                                    29545 non-null  float64
 9   Wind                                     29545 non-null  floa

In [17]:
X.shape[0] == y.shape[0]

True

# LSTM

In [109]:
# Create hold out test set
bin_numbers = pd.qcut(y['is_available'], q=10, labels=False, duplicates='drop')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, stratify=bin_numbers, random_state=42)
print(f'the training data has an average availability of {np.round(y_train.mean()*100,3)}%')

the training data has an average availability of is_available    69.652
dtype: float64%


  print(f'the training data has an average availability of {np.round(y_train.mean()*100,3)}%')


In [88]:
X_train.shape

(23636, 39)

In [84]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23636 entries, 2018-10-07 10:00:00+00:00 to 2018-07-06 18:00:00+00:00
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_available  23636 non-null  int64 
 1   sessionID     7173 non-null   object
dtypes: int64(1), object(1)
memory usage: 554.0+ KB


In [110]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
tf.random.set_seed(42)

In [111]:
#scaler = MinMaxScaler(feature_range=(0,1))
look_back = 1

In [112]:
model = Sequential()
model.add(LSTM(4, input_shape=(1,39)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [113]:
X_train = np.reshape(X_train.values.astype(float), (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values.astype(float), (X_test.shape[0], 1, X_test.shape[1]))

In [99]:
[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in model.layers]

(None, 1, 39) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
lstm_2 (None, 1, 39) float32
dense_2 (None, 4) float32


[None, None]

In [93]:
X_train.shape

(23636, 1, 39)

In [114]:
y_train = y_train.drop('sessionID', axis=1) 
y_test = y_test.drop('sessionID', axis=1)


In [115]:
model.fit(X_train, y_train, epochs=10, batch_size=8, verbose=2)

Epoch 1/10
2955/2955 - 7s - loss: 0.2167 - 7s/epoch - 2ms/step
Epoch 2/10
2955/2955 - 5s - loss: 0.2115 - 5s/epoch - 2ms/step
Epoch 3/10
2955/2955 - 5s - loss: 0.2114 - 5s/epoch - 2ms/step
Epoch 4/10
2955/2955 - 5s - loss: 0.2115 - 5s/epoch - 2ms/step
Epoch 5/10
2955/2955 - 4s - loss: 0.2114 - 4s/epoch - 1ms/step
Epoch 6/10
2955/2955 - 4s - loss: 0.2115 - 4s/epoch - 1ms/step
Epoch 7/10
2955/2955 - 4s - loss: 0.2115 - 4s/epoch - 1ms/step
Epoch 8/10
2955/2955 - 4s - loss: 0.2115 - 4s/epoch - 1ms/step
Epoch 9/10
2955/2955 - 4s - loss: 0.2115 - 4s/epoch - 1ms/step
Epoch 10/10
2955/2955 - 5s - loss: 0.2114 - 5s/epoch - 2ms/step


<keras.callbacks.History at 0x7fe33512d840>

In [116]:
predictions = model.predict(X_test)



In [117]:
from sklearn.metrics import mean_squared_error

In [118]:
# MSE with CAISO
score = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {score}')

Mean Squared Error: 0.20579487568838548


In [26]:
# MSE without CAISO
score = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {score}')

Mean Squared Error: 0.19237030540507316
