In [1]:
import pandas as pd
import numpy as np
from utils import reduce_memory_usage
from utils import break_datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_percentage_error,mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import joblib
import zipfile
import os
import category_encoders
from sklearn.preprocessing import StandardScaler
import joblib
import zipfile
import os


warnings.filterwarnings('ignore')

# Loading data

In [2]:
# Specify the ZIP file name
zip_filename = "../dataset/filtered.zip"

# Extract the model file from the ZIP archive
with zipfile.ZipFile(zip_filename, "r") as archive:
    # Extract the model file (named "your_model.pkl" in this example)
    archive.extract("filtered.pkl")
    
# Load the model
df = joblib.load("filtered.pkl")  # Replace with "pickle.load" if you used pickle

os.remove("filtered.pkl")


In [4]:
df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,46,0,2016-01-01 00:00:00,53.2397,0,Retail,9045,2016.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,74,0,2016-01-01 00:00:00,43.0013,0,Parking,387638,1997.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,93,0,2016-01-01 00:00:00,52.4206,0,Office,33370,1982.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,105,0,2016-01-01 00:00:00,23.3036,1,Education,50623,,5.0,3.8,,2.4,,1021.0,240.0,3.1
4,106,0,2016-01-01 00:00:00,0.3746,1,Education,5374,,4.0,3.8,,2.4,,1021.0,240.0,3.1


# Site 0 Measurement Correction: kBTU to kWh conversion

In [10]:
df[df['site_id']==0].meter_reading.describe()

count    561983.000000
mean        366.178624
std         422.144648
min           0.068300
25%          97.469600
50%         221.586000
75%         462.435000
max        4521.000000
Name: meter_reading, dtype: float64

In [12]:
df[df['site_id']==0].meter_reading *= 0.293014534

In [13]:
df['log_meter_reading']=np.log1p(df['meter_reading'])
df['log_square_feet']=np.log1p(df['square_feet'])
df= break_datetime(df)
df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,sea_level_pressure,wind_direction,wind_speed,log_meter_reading,log_square_feet,year,weekofyear,dayofweek,month,hour
0,46,0,2016-01-01,53.2397,0,Retail,9045,2016.0,,25.0,...,1019.5,0.0,0.0,3.993413,9.110078,2015,53,5,1,0
1,74,0,2016-01-01,43.0013,0,Parking,387638,1997.0,,25.0,...,1019.5,0.0,0.0,3.784219,12.86783,2015,53,5,1,0
2,93,0,2016-01-01,52.4206,0,Office,33370,1982.0,,25.0,...,1019.5,0.0,0.0,3.978196,10.415443,2015,53,5,1,0
3,105,0,2016-01-01,23.3036,1,Education,50623,,5.0,3.8,...,1021.0,240.0,3.1,3.190624,10.832181,2015,53,5,1,0
4,106,0,2016-01-01,0.3746,1,Education,5374,,4.0,3.8,...,1021.0,240.0,3.1,0.318163,8.589514,2015,53,5,1,0


# Missing Value Imputation

In [14]:
def percent_missing_val(df):

  percent_missing = (df.isnull().sum() * 100) / len(df)
  missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
  return missing_value_df

In [15]:
missing_value_df= percent_missing_val(df)


In [16]:
missing_value_df

Unnamed: 0,column_name,percent_missing
building_id,building_id,0.0
meter,meter,0.0
timestamp,timestamp,0.0
meter_reading,meter_reading,0.0
site_id,site_id,0.0
primary_use,primary_use,0.0
square_feet,square_feet,0.0
year_built,year_built,54.890488
floor_count,floor_count,74.735249
air_temperature,air_temperature,0.398257


As the year built and floor count columns have more than 50% values are missing, so we will drop these two columns.As the year built and floor count columns have more than 50% values are missing, so we will drop these two columns.

In [17]:
df.drop(['year_built', 'floor_count'], axis=1,inplace=True)


For the weather features, fill the missing values with the daily mean value of the corresponding feature in the site

In [18]:
def nan_fillers(df):
  air_temp_df=df.groupby(['site_id', 'dayofweek', 'weekofyear'])['air_temperature'].transform('mean')
  df['air_temperature'].fillna(air_temp_df, inplace=True)

  dew_temp_df=df.groupby(['site_id', 'dayofweek', 'weekofyear'])['dew_temperature'].transform('mean')
  df['dew_temperature'].fillna(dew_temp_df, inplace=True)

  cloud_df=df.groupby(['site_id', 'dayofweek', 'weekofyear'])['cloud_coverage'].transform('mean')
  df['cloud_coverage'].fillna(cloud_df, inplace=True)

  sea_level_df=df.groupby(['site_id', 'dayofweek', 'weekofyear'])['sea_level_pressure'].transform('mean')
  df['sea_level_pressure'].fillna(sea_level_df, inplace=True)

  precip_df=df.groupby(['site_id', 'dayofweek', 'weekofyear'])['precip_depth_1_hr'].transform('mean')
  df['precip_depth_1_hr'].fillna(precip_df, inplace=True)

  wind_dir_df=df.groupby(['site_id', 'dayofweek', 'weekofyear'])['wind_direction'].transform('mean')
  df['wind_direction'].fillna(wind_dir_df, inplace=True)

  wind_speed_df=df.groupby(['site_id', 'dayofweek', 'weekofyear'])['wind_speed'].transform('mean')
  df['wind_speed'].fillna(wind_speed_df, inplace=True)


  return df

In [19]:
df= nan_fillers(df)

In [20]:
df.isnull().sum()

building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
air_temperature             0
cloud_coverage         713295
dew_temperature             0
precip_depth_1_hr     2075909
sea_level_pressure     797538
wind_direction              0
wind_speed                  0
log_meter_reading           0
log_square_feet             0
year                        0
weekofyear                  0
dayofweek                   0
month                       0
hour                        0
dtype: int64

As there were lot of slices where all the values were NAN in 3 columns namely: cloud_coverage, precip_depth_1_hr and sea_level_pressure hence we will impute the rest of the nan values with the median value.

In [21]:
df['cloud_coverage'].fillna(df['cloud_coverage'].median(), inplace=True)
df['sea_level_pressure'].fillna(df['sea_level_pressure'].median(), inplace=True)
df['precip_depth_1_hr'].fillna(df['precip_depth_1_hr'].median(), inplace=True)

# Baseline Model: Decision Tree Regressor

For baseline model, we will use site id, primary use, square feet and air temperature as features and predict the consumption in hourly 
* The building id and site id are highly correlated, so we will use one of them
* The air temperature and dew temperature are highly correlated, so we will use one of them
* square feet has positive correalation with the meter readings, so we will use as a feature


### Train - Test Split

Split data so that first 10 months will be train set and last two months will be test set

In [89]:
df=df.sort_values(by='timestamp')
X_train, X_test= train_test_split(df, test_size=0.20, shuffle=False)

In [90]:
y_train = X_train['log_meter_reading']
y_test = X_test['log_meter_reading']
X_train.drop(['meter_reading', 'log_meter_reading'], axis=1, inplace=True)
X_test.drop(['meter_reading', 'log_meter_reading'], axis=1, inplace=True)

# Label Encoding

In [91]:
label_enc= LabelEncoder()
label_enc.fit(df['primary_use'])
X_train['primary_use']= label_enc.transform(X_train['primary_use'])
X_test['primary_use']= label_enc.transform(X_test['primary_use'])


In [92]:
X_train.head()

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,log_square_feet,year,weekofyear,dayofweek,month,hour
0,46,0,2016-01-01,0,11,9045,25.0,6.0,20.0,-0.138889,1019.5,0.0,0.0,9.110078,2015,53,5,1,0
772,1035,0,2016-01-01,12,6,14585,1.9,4.0,-1.2,0.0,1016.0,200.0,5.0,9.587817,2015,53,5,1,0
469,578,0,2016-01-01,4,4,71994,6.062753,0.957311,-4.979562,0.0,1021.197439,85.513393,2.575774,11.184352,2015,53,5,1,0
770,1033,0,2016-01-01,12,0,118489,1.9,4.0,-1.2,0.0,1016.0,200.0,5.0,11.682584,2015,53,5,1,0
486,597,0,2016-01-01,4,9,189425,6.062753,0.957311,-4.979562,0.0,1021.197439,85.513393,2.575774,12.151754,2015,53,5,1,0


### Quantitative Analysis

Primary Use

In [94]:
features = ['primary_use','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for train set is:  2.0681122133186753
R2 for train set is:  0.093115404824053
MAPE for train set is:  9.7036852687802
MAE for train set is:  1.1291201656555543
MSE for test set is:  2.090089033633079
R2 for test set is:  0.08470957293218406
MAPE for test set is:  9.17016628082704
MAE for test set is:  1.141754078384282


Square Feet

In [95]:
features = ['square_feet','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for train set is:  0.4456045035158885
R2 for train set is:  0.80459867836131
MAPE for train set is:  0.3087783048820495
MAE for train set is:  0.46445148192153735
MSE for test set is:  0.5256835386158965
R2 for test set is:  0.7697930074653784
MAPE for test set is:  0.31998320272163794
MAE for test set is:  0.507152261768257


Air Temperature

In [97]:
features = ['air_temperature','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for train set is:  2.126411958684849
R2 for train set is:  0.06755047626999677
MAPE for train set is:  7.765744770876809
MAE for train set is:  1.1576322931248937
MSE for test set is:  2.418241320711108
R2 for test set is:  -0.05899466270070719
MAPE for test set is:  8.162154518368741
MAE for test set is:  1.234894445059418


Site ID

In [98]:
features = ['site_id','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for train set is:  1.7537193990349464
R2 for train set is:  0.230979297446398
MAPE for train set is:  5.899930234278534
MAE for train set is:  1.0485705299068777
MSE for test set is:  1.7574860882500967
R2 for test set is:  0.23036283794857115
MAPE for test set is:  5.510564623177267
MAE for test set is:  1.0489849745361002


Square Feet & Site ID 

In [100]:
features = ['square_feet','site_id','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for train set is:  0.2985469972626479
R2 for train set is:  0.8690846313802937
MAPE for train set is:  0.2019478356209264
MAE for train set is:  0.3688376915128472
MSE for test set is:  0.38202380734606617
R2 for test set is:  0.8327043833304767
MAPE for test set is:  0.25121241944673783
MAE for test set is:  0.41096812430217133


Square Feet & Site Id & Primary Use

In [101]:
features = ['square_feet','site_id','primary_use','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for train set is:  0.25318957850883833
R2 for train set is:  0.8889742408898123
MAPE for train set is:  0.20222123052468635
MAE for train set is:  0.33944934813318417
MSE for test set is:  0.3346344161472899
R2 for test set is:  0.8534571146308345
MAPE for test set is:  0.23958583282349724
MAE for test set is:  0.3849720209524162


Square Feet & Site Id & Air Temperature

In [102]:
features = ['square_feet','site_id','air_temperature','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for train set is:  0.2985523466919041
R2 for train set is:  0.8690822856105844
MAPE for train set is:  0.20015557390020855
MAE for train set is:  0.3690356872275931
MSE for test set is:  0.3789463309941436
R2 for test set is:  0.8340520697682865
MAPE for test set is:  0.2502674996951189
MAE for test set is:  0.4061110598166549


All Features

In [103]:
features = ['site_id','air_temperature','log_square_feet','primary_use','hour','dayofweek','month']
DTR = DecisionTreeRegressor(max_depth=13)
DTR.fit(X_train[features],y_train)
y_pred_train = DTR.predict(X_train[features])
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

y_pred_test = DTR.predict(X_test[features])
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)


MSE for train set is:  0.25261269834490435
R2 for train set is:  0.8892272076923702
MAPE for train set is:  0.1994748450863307
MAE for train set is:  0.3386288638111573
MSE for test set is:  0.3295261831746415
R2 for test set is:  0.8556941086841309
MAPE for test set is:  0.23986594587757273
MAE for test set is:  0.37964829841665626


# Baseline Model: Linear Regression

### Train - Test Split

In [82]:
df=df.sort_values(by='timestamp')
X_train, X_test= train_test_split(df, test_size=0.20, shuffle=False)
y_train = X_train['log_meter_reading']
y_test = X_test['log_meter_reading']
X_train.drop(['meter_reading', 'log_meter_reading'], axis=1, inplace=True)
X_test.drop(['meter_reading', 'log_meter_reading'], axis=1, inplace=True)
X_train = X_train[features]
X_test = X_test[features]

### One Hot Encoding

In [83]:
categorical_features = ["site_id","primary_use"]

In [84]:
X_train = pd.get_dummies(X_train, columns=categorical_features)
X_test = pd.get_dummies(X_test, columns=categorical_features )

In [85]:
X_train

Unnamed: 0,air_temperature,log_square_feet,hour,dayofweek,month,site_id_0,site_id_1,site_id_2,site_id_3,site_id_4,...,primary_use_Office,primary_use_Other,primary_use_Parking,primary_use_Public services,primary_use_Religious worship,primary_use_Retail,primary_use_Services,primary_use_Technology/science,primary_use_Utility,primary_use_Warehouse/storage
0,25.0,9.110078,0,5,1,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,25.0,10.415443,0,5,1,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
883,-8.3,10.521588,0,5,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,3.8,8.589514,0,5,1,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
892,-8.3,11.063070,0,5,1,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9224569,19.4,10.066201,20,6,10,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
9224584,19.4,10.980774,20,6,10,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
9224589,19.4,12.631801,20,6,10,False,False,False,False,True,...,False,False,True,False,False,False,False,False,False,False
9224595,8.0,11.599763,20,6,10,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [86]:
LR = LinearRegression()
LR.fit(X_train,y_train)

In [87]:
y_pred_train = LR.predict(X_train)
train_error_mse = mean_squared_error(y_train,y_pred_train)
train_error_r2 = r2_score(y_train,y_pred_train)
train_error_mape = mean_absolute_percentage_error(y_train,y_pred_train)
train_error_mae = mean_absolute_error(y_train,y_pred_train)

print("MSE for train set is: ",train_error_mse)

print("R2 for train set is: ",train_error_r2)

print("MAPE for train set is: ",train_error_mape)

print("MAE for train set is: ",train_error_mae)

MSE for train set is:  0.7785738116798853
R2 for train set is:  0.6585872297615933
MAPE for train set is:  6.131172329559948
MAE for train set is:  0.6488169386352591


In [88]:
y_pred_test = LR.predict(X_test)
test_error_mse = mean_squared_error(y_test,y_pred_test)
test_error_r2 = r2_score(y_test,y_pred_test)
test_error_mape = mean_absolute_percentage_error(y_test,y_pred_test)
test_error_mae = mean_absolute_error(y_test,y_pred_test)

print("MSE for test set is: ",test_error_mse)

print("R2 for test set is: ",test_error_r2)

print("MAPE for test set is: ",test_error_mape)

print("MAE for test set is: ",test_error_mae)

MSE for test set is:  0.8133429122538611
R2 for test set is:  0.643828122612006
MAPE for test set is:  5.8504698594495075
MAE for test set is:  0.659385384383699


# Save the models

 Save Decision Tree Regressor

In [65]:
# Specify the zip file name
zip_filename = "../models/DTR_v0.zip"

# Create a ZIP file and add the model object to it
with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as archive:
    # Save the model to a temporary file
    temp_model_filename = "temp_model.pkl"
    joblib.dump(DTR, temp_model_filename)
    
    # Add the temporary model file to the ZIP archive
    archive.write(temp_model_filename, arcname="DTR_v0.pkl")

# Remove the temporary model file
os.remove(temp_model_filename)


Save Linear Regression

In [66]:
# Specify the zip file name
zip_filename = "../models/LR_v0.zip"

# Create a ZIP file and add the model object to it
with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as archive:
    # Save the model to a temporary file
    temp_model_filename = "temp_model.pkl"
    joblib.dump(DTR, temp_model_filename)
    
    # Add the temporary model file to the ZIP archive
    archive.write(temp_model_filename, arcname="LR_v0.pkl")

# Remove the temporary model file
os.remove(temp_model_filename)

Reuse the model by loading

In [67]:
# Specify the ZIP file name
zip_filename = "../models/LR_v0.zip"

# Extract the model file from the ZIP archive
with zipfile.ZipFile(zip_filename, "r") as archive:
    # Extract the model file (named "your_model.pkl" in this example)
    archive.extract("LR_v0.pkl")
    
# Load the model
model = joblib.load("LR_v0.pkl")  # Replace with "pickle.load" if you used pickle

os.remove("LR_v0.pkl")

# You can now use the "model" for predictions or other tasks
y_pred_test = LR.predict(X_test)
test_error = mean_squared_error(y_test,y_pred_test)
print("MSE for test set is: ",test_error)

MSE for test set is:  0.8133500415199731
