In [49]:
import pandas as pd
import numpy as np
from utils import reduce_memory_usage
from utils import break_datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
import joblib
import zipfile
import os

warnings.filterwarnings('ignore')

# Loading data

In [3]:
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
train_csv = '/Users/goksuuzunturk/Desktop/DI 502 Project/FilteredDataset/train.csv'
test_csv = '/Users/goksuuzunturk/Desktop/DI 502 Project/FilteredDataset/test.csv'

# Both files carry an 'Unnamed: 0' column (presumably a saved index) that is discarded on load.
train = pd.read_csv(train_csv).drop(columns='Unnamed: 0')
test = pd.read_csv(test_csv).drop(columns='Unnamed: 0')

In [4]:
# Add log1p-transformed copies of the target and the building size.
for raw_col in ('meter_reading', 'square_feet'):
    train['log_' + raw_col] = np.log1p(train[raw_col])

# break_datetime comes from utils; judging by the output below it appears to add
# hour / dayofweek / month / dayofyear / day / year columns.
train = train.pipe(break_datetime)
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,wind_direction,wind_speed,log_meter_reading,log_square_feet,hour,dayofweek,month,dayofyear,day,year
0,46,0,2016-01-01,53.2397,0,Retail,9045,2016.0,,25.0,...,0.0,0.0,3.993413,9.110078,0,4,1,1,1,2016
1,74,0,2016-01-01,43.0013,0,Parking,387638,1997.0,,25.0,...,0.0,0.0,3.784219,12.86783,0,4,1,1,1,2016
2,93,0,2016-01-01,52.4206,0,Office,33370,1982.0,,25.0,...,0.0,0.0,3.978196,10.415443,0,4,1,1,1,2016
3,105,0,2016-01-01,23.3036,1,Education,50623,,5.0,3.8,...,240.0,3.1,3.190624,10.832181,0,4,1,1,1,2016
4,106,0,2016-01-01,0.3746,1,Education,5374,,4.0,3.8,...,240.0,3.1,0.318163,8.589514,0,4,1,1,1,2016


In [5]:
# Same preparation as for the training frame (the test set has no meter_reading column).
test['log_square_feet'] = test['square_feet'].pipe(np.log1p)
test = test.pipe(break_datetime)
test.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,sea_level_pressure,wind_direction,wind_speed,log_square_feet,hour,dayofweek,month,dayofyear,day,year
0,0,0,0,2017-01-01,0,Education,7432,2008.0,,17.8,...,1021.5,100.0,3.6,8.913685,0,6,1,1,1,2017
1,1,1,0,2017-01-01,0,Education,2720,2004.0,,17.8,...,1021.5,100.0,3.6,7.908755,0,6,1,1,1,2017
2,2,2,0,2017-01-01,0,Education,5376,1991.0,,17.8,...,1021.5,100.0,3.6,8.589886,0,6,1,1,1,2017
3,3,3,0,2017-01-01,0,Education,23685,2002.0,,17.8,...,1021.5,100.0,3.6,10.072639,0,6,1,1,1,2017
4,4,4,0,2017-01-01,0,Education,116607,1975.0,,17.8,...,1021.5,100.0,3.6,11.666573,0,6,1,1,1,2017


# Missing Value Imputation

In [6]:
def percent_missing_val(df):
    """Summarise how much of each column of ``df`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with two columns:
        ``column_name`` and ``percent_missing`` (0-100).
    """
    null_counts = df.isna().sum()
    pct_by_column = null_counts.mul(100).div(len(df))
    return pd.DataFrame({
        'column_name': df.columns,
        'percent_missing': pct_by_column,
    })

In [7]:
# Build the per-column missing-value report for both frames.
missing_value_train, missing_value_test = map(percent_missing_val, (train, test))

In [8]:
missing_value_train

Unnamed: 0,column_name,percent_missing
building_id,building_id,0.0
meter,meter,0.0
timestamp,timestamp,0.0
meter_reading,meter_reading,0.0
site_id,site_id,0.0
primary_use,primary_use,0.0
square_feet,square_feet,0.0
year_built,year_built,54.890488
floor_count,floor_count,74.735249
air_temperature,air_temperature,0.398257


In [9]:
missing_value_test

Unnamed: 0,column_name,percent_missing
row_id,row_id,0.0
building_id,building_id,0.0
meter,meter,0.0
timestamp,timestamp,0.0
site_id,site_id,0.0
primary_use,primary_use,0.0
square_feet,square_feet,0.0
year_built,year_built,53.078556
floor_count,floor_count,75.088464
air_temperature,air_temperature,0.470646


Since more than 50% of the values in the year_built and floor_count columns are missing, we drop these two columns.

In [10]:
# Remove the two mostly-empty columns from both frames (in place).
sparse_cols = ['year_built', 'floor_count']
for frame in (train, test):
    frame.drop(columns=sparse_cols, inplace=True)

For the weather features, fill each missing value with the mean of that feature for the same site on the same calendar day (rows are grouped by site_id, day and month).

In [11]:
def nan_fillers(df):
    """Fill missing weather readings with the per-site, per-calendar-day mean.

    For every weather column, each NaN is replaced by the mean of that column
    over the rows sharing the same ``site_id``, ``day`` and ``month``. When a
    group has no observed value at all its mean is NaN, so those rows stay
    NaN and need a fallback imputation by the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``site_id``, ``day``, ``month`` and the seven weather
        columns listed in ``weather_cols`` below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the weather columns are updated in place.
    """
    group_keys = ['site_id', 'day', 'month']
    weather_cols = [
        'air_temperature',
        'dew_temperature',
        'cloud_coverage',
        'sea_level_pressure',
        'precip_depth_1_hr',
        'wind_direction',
        'wind_speed',
    ]

    for col in weather_cols:
        group_mean = df.groupby(group_keys)[col].transform('mean')
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form is deprecated and does not modify `df`
        # when pandas Copy-on-Write is enabled.
        df[col] = df[col].fillna(group_mean)

    return df

In [12]:
train= nan_fillers(train)

In [13]:
train.isnull().sum()

building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
air_temperature             0
cloud_coverage         713295
dew_temperature             0
precip_depth_1_hr     2075909
sea_level_pressure     797538
wind_direction              0
wind_speed                  0
log_meter_reading           0
log_square_feet             0
hour                        0
dayofweek                   0
month                       0
dayofyear                   0
day                         0
year                        0
dtype: int64

Three columns (cloud_coverage, precip_depth_1_hr and sea_level_pressure) still contain NaNs, because for many site/day groups every value is missing and the group mean is therefore NaN as well. We impute these remaining NaNs with the column median.

In [14]:
# Fallback imputation: fill what the group means could not cover with the column median.
# The result is assigned back because `train[col].fillna(..., inplace=True)` is chained
# in-place mutation: deprecated, and a no-op when pandas Copy-on-Write is enabled.
for col in ['cloud_coverage', 'sea_level_pressure', 'precip_depth_1_hr']:
    train[col] = train[col].fillna(train[col].median())

Proceed with the test data

In [15]:
test=nan_fillers(test)

In [16]:
test.isnull().sum()

row_id                      0
building_id                 0
meter                       0
timestamp                   0
site_id                     0
primary_use                 0
square_feet                 0
air_temperature             0
cloud_coverage         583392
dew_temperature             0
precip_depth_1_hr     3449328
sea_level_pressure    1559280
wind_direction              0
wind_speed                  0
log_square_feet             0
hour                        0
dayofweek                   0
month                       0
dayofyear                   0
day                         0
year                        0
dtype: int64

In [17]:
# Fallback imputation for the test frame, using the test frame's own column medians.
# The result is assigned back because `test[col].fillna(..., inplace=True)` is chained
# in-place mutation: deprecated, and a no-op when pandas Copy-on-Write is enabled.
for col in ['cloud_coverage', 'sea_level_pressure', 'precip_depth_1_hr']:
    test[col] = test[col].fillna(test[col].median())

# Train - CV Split

Split the data chronologically (no shuffling): after sorting by timestamp, the earliest 80% of the rows form the training set and the latest 20% form the cross-validation (CV) set.

In [30]:
# Order rows by time, then cut without shuffling so the CV part is the latest 20% of rows.
train = train.sort_values('timestamp')
X_train, X_cv = train_test_split(train, shuffle=False, test_size=0.20)

In [31]:
# The model is trained on the log1p-transformed reading; pull it out as the target first.
y_train, y_cv = X_train['log_meter_reading'], X_cv['log_meter_reading']

# Then strip both the raw and the log target from the feature frames (in place).
target_cols = ['meter_reading', 'log_meter_reading']
for frame in (X_train, X_cv):
    frame.drop(columns=target_cols, inplace=True)

# Label Encoding

In [32]:
# Fit one encoder on the full training frame so every split shares the same integer mapping.
label_enc = LabelEncoder().fit(train['primary_use'])

# NOTE(review): transform raises on labels not seen during fit; this assumes the test set
# only contains primary_use values that also occur in train.
for frame in (X_train, X_cv, test):
    frame['primary_use'] = label_enc.transform(frame['primary_use'])

In [33]:
X_train.head()

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,log_square_feet,hour,dayofweek,month,dayofyear,day,year
0,46,0,2016-01-01,0,11,9045,25.0,6.0,20.0,-0.138889,1019.5,0.0,0.0,9.110078,0,4,1,1,1,2016
6,108,0,2016-01-01,1,0,81580,3.8,0.0,2.4,0.0,1021.0,240.0,3.1,11.309352,0,4,1,1,1,2016
576,695,0,2016-01-01,5,0,121062,6.913043,0.0,5.434783,0.0,1016.5,123.181818,8.017391,11.704066,0,4,1,1,1,2016
1,74,0,2016-01-01,0,8,387638,25.0,6.0,20.0,-0.138889,1019.5,0.0,0.0,12.86783,0,4,1,1,1,2016
2,93,0,2016-01-01,0,6,33370,25.0,6.0,20.0,-0.138889,1019.5,0.0,0.0,10.415443,0,4,1,1,1,2016


# Baseline Model: Decision Tree Regressor

For the baseline model we use building id, primary use, square feet and air temperature, together with the time features hour, day of week and month, to predict the hourly consumption (the log-transformed meter reading).
* The building id and site id are highly correlated, so we use only one of them
* The air temperature and dew temperature are highly correlated, so we use only one of them
* Square feet has a positive correlation with the meter readings, so we use it as a feature


In [34]:
# Columns fed to the baseline model: building attributes, one weather signal, calendar features.
features = [
    'building_id',
    'air_temperature',
    'square_feet',
    'primary_use',
    'hour',
    'dayofweek',
    'month',
]

In [38]:
# Baseline: an unconstrained decision tree with default hyper-parameters.
# NOTE(review): no random_state is passed, so repeated runs may not reproduce the
# exact same tree; consider fixing a seed. With no depth limit the tree can grow
# very large (see the model-size remark further down).
DTR = DecisionTreeRegressor()
# Fit on the selected feature columns against the log1p-transformed meter reading.
DTR.fit(X_train[features],y_train)

In [42]:
# In-sample error; the target is log1p(meter_reading), so this MSE is on the log scale.
y_pred_train = DTR.predict(X_train[features])
train_error = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
print("MSE for train set is: ",train_error)

MSE for train set is:  0.005864063212601677


In [43]:
# Error on the held-out CV split (labelled "test set" in the message); also on the log scale.
y_pred_cv = DTR.predict(X_cv[features])
test_error = mean_squared_error(y_true=y_cv, y_pred=y_pred_cv)
print("MSE for test set is: ",test_error)

MSE for test set is:  0.20827355679654808


Save the model locally (outside the repository), since the file is too large for GitHub.

In [60]:
# Specify the zip file name
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
zip_filename = "/Users/goksuuzunturk/Desktop/DI 502 Project/models/DTR_v0.zip"

# Make sure the target directory exists so opening the archive for writing cannot fail on it
os.makedirs(os.path.dirname(zip_filename), exist_ok=True)

# The model is first dumped to a temporary pickle, then stored compressed in the ZIP
temp_model_filename = "temp_model.pkl"
try:
    # Save the model to a temporary file
    joblib.dump(DTR, temp_model_filename)

    # Create a ZIP file and add the temporary model file to it as "DTR_v0.pkl"
    with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as archive:
        archive.write(temp_model_filename, arcname="DTR_v0.pkl")
finally:
    # Remove the temporary model file, even when dumping or zipping failed
    if os.path.exists(temp_model_filename):
        os.remove(temp_model_filename)


Reload the saved model from the ZIP archive and reuse it for prediction.

In [61]:
# Specify the ZIP file name
zip_filename = "/Users/goksuuzunturk/Desktop/DI 502 Project/models/DTR_v0.zip"

# Extract the model file (stored as "DTR_v0.pkl") from the ZIP archive into the working directory
with zipfile.ZipFile(zip_filename, "r") as archive:
    archive.extract("DTR_v0.pkl")

# Load the model, then always clean up the extracted pickle
# NOTE: joblib.load unpickles arbitrary objects; only load archives you created or trust.
try:
    model = joblib.load("DTR_v0.pkl")  # Replace with "pickle.load" if you used pickle
finally:
    os.remove("DTR_v0.pkl")

# Evaluate with the *reloaded* model (not the in-memory DTR) to confirm the round trip works
y_pred_cv = model.predict(X_cv[features])
test_error = mean_squared_error(y_cv,y_pred_cv)
print("MSE for test set is: ",test_error)

MSE for test set is:  0.20827355679654808
