In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDRegressor

In [2]:
df = pd.read_csv('./data/50area_dummy_processed.csv')

FileNotFoundError: [Errno 2] No such file or directory: './data/50area_dummy_processed.csv'

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243612 entries, 0 to 243611
Data columns (total 39 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   last_charge_end_time_ts              243612 non-null  int64  
 1   connection_start_time_ts             243612 non-null  int64  
 2   charging_start_time_ts               241602 non-null  float64
 3   charging_start_time_missing          243612 non-null  bool   
 4   charging_end_time_ts                 241602 non-null  float64
 5   charging_end_time_missing            243612 non-null  bool   
 6   connection_end_time_ts               243612 non-null  int64  
 7   expected_departure_time_ts           243612 non-null  int64  
 8   expected_departure_time_missing      243612 non-null  int64  
 9   idle_time_ts                         243612 non-null  int64  
 10  expected_usage_duration_ts           243612 non-null  int64  
 11  expected_usag

In [73]:
X = df.drop('requested_kwh',axis=1)
y = df['requested_kwh']

In [74]:
X_train, X_test, y_train,y_test = train_test_split(X,y, random_state=42)

In [75]:
numerical_feat = X.select_dtypes(include=["int64","float64"]).columns
categorical_feat = [
 'charging_start_time_missing',
 'charging_end_time_missing',
 'expected_departure_time_missing',
 'expected_usage_duration_missing',
 'expected_time_diff_missing',
 'actual_charging_duration_missing',
 'start_delay_duration_missing',
 'post_charge_departure_delay_missing',
 'usage_departure_time_diff_missing',
 'kwh_per_usage_time_missing',
 'evse_type',
 'supports_discharge',
 'scheduled_charge'
]

In [76]:
numerical_transformer = Pipeline([('imputer', SimpleImputer(strategy="median")),
                                  ('scaler', StandardScaler())])
numerical_transformer

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [77]:
categorical_transformer = Pipeline([('imputer', SimpleImputer(strategy="most_frequent")),
                                    ('onehot', OneHotEncoder(sparse_output=True, handle_unknown="ignore"))])
categorical_transformer

0,1,2
,steps,"[('imputer', ...), ('onehot', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [78]:
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_feat),
                                               ('cat', categorical_transformer, categorical_feat)])
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [79]:
linear_model = Pipeline([('preprocessor', preprocessor),
                         ('dimreduce',TruncatedSVD(n_components=100)),
                        ('regressor', SGDRegressor())])
linear_model

0,1,2
,steps,"[('preprocessor', ...), ('dimreduce', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_components,100
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,
,tol,0.0

0,1,2
,loss,'squared_error'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [80]:
linear_model.fit(X_train, y_train)
linear_predictions = linear_model.predict(X_test)
linear_predictions



array([19.12697622, 21.9934304 , 18.48242208, ..., 24.2824888 ,
       23.42231341, 20.33301728], shape=(60903,))

In [81]:
linear_mse = mean_squared_error(y_test, linear_predictions)
linear_mae = np.sqrt(linear_mse)
linear_r2 = r2_score(y_test, linear_predictions)
print(f"{linear_mse}, {linear_mae}, {linear_r2}")

0.27511177822481403, 0.5245109896130051, 0.9966591317294


In [82]:
joblib.dump(linear_model,'li_model.pkl')

['li_model.pkl']