In [1]:
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.data_preprocessing import datetime_processing, userinput_processing, holiday_processing, create_x, create_y

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score

In [2]:
import os
os.chdir('/Users/varunvenkatesh/Documents/Github/ev_charging')
os.getcwd()

'/Users/varunvenkatesh/Documents/GitHub/ev_charging'

In [3]:
df = pd.read_parquet('data/sessions/caltech/').reset_index(drop=True)
df = datetime_processing(df)
df = userinput_processing(df)
df = holiday_processing(df)

In [4]:
df.head()

Unnamed: 0,_id,sessionID,stationID,spaceID,siteID,clusterID,connectionTime,disconnectTime,kWhDelivered,doneChargingTime,...,doneChargingTimeDay,userID,milesRequested,WhPerMile,minutesAvailable,modifiedAt,paymentRequired,requestedDeparture,kWhRequested,is_holiday
0,5bc9160af9af8b0dad3c0690,2_39_95_27_2018-05-01 13:23:18.374684,2-39-95-27,CA-319,2,39,2018-05-01 13:23:18+00:00,2018-05-02 01:53:41+00:00,13.41,2018-05-01 17:17:07+00:00,...,1.0,,,,,,,,,False
1,5bc9160af9af8b0dad3c0691,2_39_79_380_2018-05-01 13:54:33.599590,2-39-79-380,CA-489,2,39,2018-05-01 13:54:34+00:00,2018-05-01 23:15:35+00:00,8.618,2018-05-01 15:35:39+00:00,...,1.0,,,,,,,,,False
2,5bc9160af9af8b0dad3c0692,2_39_139_28_2018-05-01 14:46:04.441389,2-39-139-28,CA-303,2,39,2018-05-01 14:46:04+00:00,2018-05-02 00:54:07+00:00,13.969,2018-05-01 23:57:02+00:00,...,1.0,,,,,,,,,False
3,5bc9160af9af8b0dad3c0693,2_39_79_378_2018-05-01 14:56:28.908623,2-39-79-378,CA-326,2,39,2018-05-01 14:56:29+00:00,2018-05-01 21:52:45+00:00,6.914,2018-05-01 16:07:56+00:00,...,1.0,,,,,,,,,False
4,5bc9160af9af8b0dad3c0694,2_39_88_24_2018-05-01 15:14:50.443137,2-39-88-24,CA-314,2,39,2018-05-01 15:14:50+00:00,2018-05-01 22:51:30+00:00,13.785,2018-05-01 20:18:12+00:00,...,1.0,,,,,,,,,False


In [5]:
df.stationID.value_counts()

2-39-139-28     1792
2-39-131-30     1399
2-39-129-17     1136
2-39-81-4550    1126
2-39-89-25      1086
2-39-125-21     1042
2-39-127-19     1022
2-39-138-29      994
2-39-79-379      932
2-39-123-23      908
2-39-79-380      892
2-39-78-365      862
2-39-91-437      846
2-39-79-377      820
2-39-130-31      789
2-39-78-366      759
2-39-95-27       737
2-39-88-24       694
2-39-79-383      683
2-39-124-22      665
2-39-79-378      645
2-39-78-363      615
2-39-128-18      612
2-39-79-381      587
2-39-79-376      581
2-39-78-360      578
2-39-90-26       567
2-39-126-20      561
2-39-92-436      558
2-39-78-367      533
2-39-78-361      529
2-39-79-382      506
2-39-92-442      503
2-39-78-364      458
2-39-91-441      452
2-39-78-362      452
2-39-90-440      432
2-39-95-444      396
2-39-88-438      332
2-39-89-439      330
2-39-123-557     247
2-39-83-386      200
2-39-138-566     175
2-39-124-558     149
2-39-130-564     136
2-39-126-560     124
2-39-128-562     120
2-39-139-567 

In [6]:
def get_results(y_test, prediction):
    cm = confusion_matrix(y_test,prediction)
    make_classification_plot(cm)
    
    results = {'tpr': cm[1, 1]/np.sum(cm[1]),
               'fpr': cm[0,1]/np.sum(cm[0]),
               'accuracy': accuracy_score(y_test, prediction),
               'precision': precision_score(y_test, prediction),
               'recall': recall_score(y_test, prediction),
        'f1':f1_score(y_test,prediction)}
    return results

In [7]:
def make_classification_plot(cm):
    disp = ConfusionMatrixDisplay(cm)
    disp = disp.plot(include_values=True, cmap='viridis', ax=None, xticks_rotation='horizontal')
    plt.grid(False)
    plt.show()

In [8]:
results = {}

In [9]:
# create x and y 
start = '2018-05-01'
end = '2021-09-13'
spaceID = 'CA-303'
caiso_fp = 'data/caiso_dataset_5min.csv'
sun_fp = 'data/ct_sr_ss.csv'

In [10]:
X = create_x(start, end, caiso_fp, sun_fp)
y = create_y(df, start, end, spaceID)

In [11]:
X.head(30)

Unnamed: 0,dow,hour,month,is_sunny,is_holiday,Day ahead forecast,Hour ahead forecast,Current demand,Net demand,Solar,...,Large hydro,Batteries,Imports,Other,Biogas CO2,Biomass CO2,Natural Gas CO2,Coal CO2,Imports CO2,Geothermal CO2
2018-05-01 00:00:00+00:00,1,0,5,0,False,21013.666667,21496.5,21378.5,17906.0,0.0,...,2079.0,-10.5,8085.166667,0.0,136.083333,151.083333,1945.25,17.833333,3824.166667,9.083333
2018-05-01 01:00:00+00:00,1,1,5,0,False,20133.083333,20673.916667,20644.583333,17251.333333,0.0,...,2084.333333,8.083333,7786.083333,0.0,140.666667,153.416667,1774.666667,17.416667,3820.75,10.0
2018-05-01 02:00:00+00:00,1,2,5,0,False,19582.666667,20174.083333,20089.166667,16399.916667,0.0,...,1948.416667,-18.25,7026.916667,0.0,140.916667,154.916667,1791.166667,15.833333,3458.5,10.0
2018-05-01 03:00:00+00:00,1,3,5,0,False,19423.5,19905.083333,19822.25,15958.416667,0.0,...,1987.833333,-21.833333,6644.833333,0.0,140.916667,157.166667,1770.25,16.416667,3257.0,10.0
2018-05-01 04:00:00+00:00,1,4,5,0,False,19879.583333,20325.833333,20321.166667,16600.583333,0.0,...,2287.75,-8.083333,6972.083333,0.0,140.666667,153.5,1802.416667,17.166667,3387.916667,10.0
2018-05-01 05:00:00+00:00,1,5,5,0,False,21337.333333,21718.916667,21718.666667,18034.833333,0.0,...,2660.583333,-31.166667,7790.583333,0.0,137.166667,150.333333,1890.25,15.583333,3677.75,9.083333
2018-05-01 06:00:00+00:00,1,6,5,0,False,23149.0,23605.916667,23639.25,19678.416667,391.5,...,2771.166667,0.083333,8815.5,0.0,135.0,149.75,2035.75,17.416667,4102.166667,9.0
2018-05-01 07:00:00+00:00,1,7,5,1,False,24480.833333,25012.75,24989.333333,18528.916667,3088.416667,...,2787.333333,-29.416667,7865.666667,0.0,134.166667,151.5,1949.083333,17.166667,3744.666667,9.0
2018-05-01 08:00:00+00:00,1,8,5,1,False,24527.5,25217.166667,25220.916667,16230.166667,5764.083333,...,2472.833333,-4.25,6099.833333,0.0,136.5,156.25,1851.333333,18.5,2983.166667,9.666667
2018-05-01 09:00:00+00:00,1,9,5,1,False,24332.25,25026.916667,25010.25,15238.083333,6660.416667,...,2301.666667,13.833333,5802.583333,0.0,138.833333,159.416667,1666.333333,19.5,2852.666667,10.0


In [12]:
print(f'charger CA-303 is available {np.round(y.mean()*100,3)}% of the time')

charger CA-303 is available is_available    69.964
dtype: float64% of the time


In [13]:
#X.fillna(0.0, inplace=True)

In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 29545 entries, 2018-05-01 00:00:00+00:00 to 2021-09-13 00:00:00+00:00
Freq: H
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dow                  29545 non-null  int64  
 1   hour                 29545 non-null  int64  
 2   month                29545 non-null  int64  
 3   is_sunny             29545 non-null  int64  
 4   is_holiday           29545 non-null  bool   
 5   Day ahead forecast   29545 non-null  float64
 6   Hour ahead forecast  29545 non-null  float64
 7   Current demand       29545 non-null  float64
 8   Net demand           29545 non-null  float64
 9   Solar                29545 non-null  float64
 10  Wind                 29545 non-null  float64
 11  Geothermal           29545 non-null  float64
 12  Biomass              29545 non-null  float64
 13  Biogas               29545 non-null  float64
 14  Small hydro          29545 non-

In [15]:
X.shape[0] == y.shape[0]

True

# LSTM

In [16]:
# Create hold out test set
bin_numbers = pd.qcut(y['is_available'], q=10, labels=False, duplicates='drop')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, stratify=bin_numbers, random_state=42)
print(f'the training data has an average availability of {np.round(y_train.mean()*100,3)}%')

the training data has an average availability of is_available    69.652
dtype: float64%


In [17]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
tf.random.set_seed(42)

2024-03-04 12:41:09.708488: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
#scaler = MinMaxScaler(feature_range=(0,1))
look_back = 1

In [19]:
model = Sequential()
model.add(LSTM(4, input_shape=(1, X_train.shape[1])))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

2024-03-04 12:41:14.967996: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
X_train = np.reshape(X_train.values.astype(float), (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values.astype(float), (X_test.shape[0], 1, X_test.shape[1]))

In [21]:
[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in model.layers]

(None, 1, 28) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
lstm (None, 1, 28) float32
dense (None, 4) float32


[None, None]

In [22]:
X_train.shape

(23636, 1, 28)

In [23]:
y_train = y_train.drop('sessionID', axis=1) 
y_test = y_test.drop('sessionID', axis=1)

In [24]:
model.fit(X_train, y_train, epochs=10, batch_size=8, verbose=2)

Epoch 1/10
2955/2955 - 6s - loss: 0.2264 - 6s/epoch - 2ms/step
Epoch 2/10
2955/2955 - 5s - loss: 0.2118 - 5s/epoch - 2ms/step
Epoch 3/10
2955/2955 - 4s - loss: 0.2117 - 4s/epoch - 1ms/step
Epoch 4/10
2955/2955 - 4s - loss: 0.2117 - 4s/epoch - 1ms/step
Epoch 5/10
2955/2955 - 4s - loss: 0.2116 - 4s/epoch - 1ms/step
Epoch 6/10
2955/2955 - 4s - loss: 0.2117 - 4s/epoch - 1ms/step
Epoch 7/10
2955/2955 - 4s - loss: 0.2117 - 4s/epoch - 1ms/step
Epoch 8/10
2955/2955 - 4s - loss: 0.2117 - 4s/epoch - 1ms/step
Epoch 9/10
2955/2955 - 4s - loss: 0.2117 - 4s/epoch - 1ms/step
Epoch 10/10
2955/2955 - 4s - loss: 0.2116 - 4s/epoch - 1ms/step


<keras.callbacks.History at 0x7fa3009fb310>

In [25]:
predictions = model.predict(X_test)



In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
# MSE with CAISO
score = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {score}')

Mean Squared Error: 0.20587083968340547


# XGBoost

In [28]:
from xgboost import XGBClassifier

In [30]:
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[2]))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[2]))

In [31]:
model = XGBClassifier()
model.fit(X_train, y_train)

In [32]:
predictions = model.predict(X_test)

In [33]:
score = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {score}')

Mean Squared Error: 0.15349466914875615
