In [2]:
# start of the model

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Load the train.csv dataset (in chunks)
chunksize = 10**6  # Read in chunks of 1 million rows
train_data = pd.read_csv('train.csv', chunksize=chunksize)
weather_data = pd.read_csv('weather_train.csv')

# Take a look at the first few rows of both datasets
for chunk in train_data:
    print(chunk.head())
    break  # Only show the first chunk for inspection

print(weather_data.head())

   building_id  meter            timestamp  meter_reading
0            0      0  2016-01-01 00:00:00            0.0
1            1      0  2016-01-01 00:00:00            0.0
2            2      0  2016-01-01 00:00:00            0.0
3            3      0  2016-01-01 00:00:00            0.0
4            4      0  2016-01-01 00:00:00            0.0
   site_id            timestamp  air_temperature  cloud_coverage  \
0        0  2016-01-01 00:00:00             25.0             6.0   
1        0  2016-01-01 01:00:00             24.4             NaN   
2        0  2016-01-01 02:00:00             22.8             2.0   
3        0  2016-01-01 03:00:00             21.1             2.0   
4        0  2016-01-01 04:00:00             20.0             2.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_direction  \
0             20.0                NaN              1019.7             0.0   
1             21.1               -1.0              1020.2            70.0   
2           

In [4]:
# Merge train data with weather data

# Load the building metadata
building_data = pd.read_csv('building_metadata.csv')

# Load the weather data (already done in previous steps, but showing for clarity)
weather_data = pd.read_csv('weather_train.csv')

# For demonstration, use the first chunk of train data (chunk from the for-loop)
train_chunk = next(train_data)

# Merge train data with building metadata using 'building_id'
train_building_merged = pd.merge(train_chunk, building_data, on='building_id', how='left')

# Merge the result with weather data using 'site_id' and 'timestamp'
train_full_merged = pd.merge(train_building_merged, weather_data, on=['site_id', 'timestamp'], how='left')

# Display the first few rows of the merged dataset
print(train_full_merged.head())

   building_id  meter            timestamp  meter_reading  site_id  \
0         1246      0  2016-01-19 03:00:00           49.0       14   
1         1246      1  2016-01-19 03:00:00            0.0       14   
2         1246      3  2016-01-19 03:00:00            0.0       14   
3         1247      0  2016-01-19 03:00:00          194.4       14   
4         1247      1  2016-01-19 03:00:00            0.0       14   

              primary_use  square_feet  year_built  floor_count  \
0                  Office        50327         NaN          NaN   
1                  Office        50327         NaN          NaN   
2                  Office        50327         NaN          NaN   
3  Food sales and service        69876         NaN          NaN   
4  Food sales and service        69876         NaN          NaN   

   air_temperature  cloud_coverage  dew_temperature  precip_depth_1_hr  \
0             -8.3             0.0            -17.8                0.0   
1             -8.3          

In [5]:
# Step 4: Handling Missing Data

# Check for missing values in the merged dataset
print("Missing values before handling:")
print(train_full_merged.isnull().sum())

# Handling missing values
# Numerical columns: 'square_feet', 'year_built', 'floor_count' can be filled with median
train_full_merged['square_feet'].fillna(train_full_merged['square_feet'].median(), inplace=True)
train_full_merged['year_built'].fillna(train_full_merged['year_built'].median(), inplace=True)
train_full_merged['floor_count'].fillna(train_full_merged['floor_count'].median(), inplace=True)

# Categorical column: 'primary_use' can be filled with the most frequent category
train_full_merged['primary_use'].fillna(train_full_merged['primary_use'].mode()[0], inplace=True)

# Verify if missing values are handled
print("\nMissing values after handling:")
print(train_full_merged.isnull().sum())

Missing values before handling:
building_id                0
meter                      0
timestamp                  0
meter_reading              0
site_id                    0
primary_use                0
square_feet                0
year_built            597658
floor_count           825059
air_temperature         3922
cloud_coverage        480936
dew_temperature         4532
precip_depth_1_hr     175470
sea_level_pressure     66217
wind_direction         45872
wind_speed              6367
dtype: int64

Missing values after handling:
building_id                0
meter                      0
timestamp                  0
meter_reading              0
site_id                    0
primary_use                0
square_feet                0
year_built                 0
floor_count                0
air_temperature         3922
cloud_coverage        480936
dew_temperature         4532
precip_depth_1_hr     175470
sea_level_pressure     66217
wind_direction         45872
wind_speed              

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_full_merged['square_feet'].fillna(train_full_merged['square_feet'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_full_merged['year_built'].fillna(train_full_merged['year_built'].median(), inplace=True)
The behavior will change in pandas 3.0. 

In [6]:
# Feature Engineering (step 5 technically..)

# Creating additional features
train_full_merged['log_square_feet'] = np.log1p(train_full_merged['square_feet'])  # Log transform of square feet
train_full_merged['year_built_age'] = 2024 - train_full_merged['year_built']  # Building age as a feature

# Interaction terms (example: product of air temperature and square feet)
train_full_merged['temp_x_sqft'] = train_full_merged['air_temperature'] * train_full_merged['square_feet']

# Dropping irrelevant columns (e.g., 'timestamp' or those filled with too many missing values)
train_full_merged.drop(columns=['timestamp', 'cloud_coverage'], inplace=True)  # Dropping timestamp and cloud_coverage

# Checking the final data after feature engineering
print("\nData after feature engineering:")
print(train_full_merged.head())


Data after feature engineering:
   building_id  meter  meter_reading  site_id             primary_use  \
0         1246      0           49.0       14                  Office   
1         1246      1            0.0       14                  Office   
2         1246      3            0.0       14                  Office   
3         1247      0          194.4       14  Food sales and service   
4         1247      1            0.0       14  Food sales and service   

   square_feet  year_built  floor_count  air_temperature  dew_temperature  \
0        50327      1968.0          3.0             -8.3            -17.8   
1        50327      1968.0          3.0             -8.3            -17.8   
2        50327      1968.0          3.0             -8.3            -17.8   
3        69876      1968.0          3.0             -8.3            -17.8   
4        69876      1968.0          3.0             -8.3            -17.8   

   precip_depth_1_hr  sea_level_pressure  wind_direction  wind_sp

In [7]:
# Now, part 6, "Preparing data for training"

# Define the target variable and the features
target = train_full_merged['meter_reading']
features = train_full_merged.drop(columns=['meter_reading', 'primary_use'])  # Drop target and unnecessary columns

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Display the shape of the training and testing sets
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

Training set shape (X_train, y_train): (800000, 15) (800000,)
Testing set shape (X_test, y_test): (200000, 15) (200000,)


In [None]:
# imputation of missing values (cuz gradident boosting can't handle missing values)

# Fill remaining NaN values in weather-related features with their medians
train_full_merged['air_temperature'].fillna(train_full_merged['air_temperature'].median(), inplace=True)
train_full_merged['dew_temperature'].fillna(train_full_merged['dew_temperature'].median(), inplace=True)
train_full_merged['precip_depth_1_hr'].fillna(train_full_merged['precip_depth_1_hr'].median(), inplace=True)
train_full_merged['sea_level_pressure'].fillna(train_full_merged['sea_level_pressure'].median(), inplace=True)
train_full_merged['wind_direction'].fillna(train_full_merged['wind_direction'].median(), inplace=True)
train_full_merged['wind_speed'].fillna(train_full_merged['wind_speed'].median(), inplace=True)

# Verify again if all missing values have been handled
print("\nMissing values after final handling:")
print(train_full_merged.isnull().sum())

In [10]:
# Recalculate temp_x_sqft after filling missing values for air_temperature and square_feet

# Because there's still 3000 missing values in square_feet, my way is to fill them with the median
train_full_merged['temp_x_sqft'] = train_full_merged['air_temperature'] * train_full_merged['square_feet']

# Verify again if all missing values have been handled
print("\nMissing values after recalculating temp_x_sqft:")
print(train_full_merged.isnull().sum())


Missing values after recalculating temp_x_sqft:
building_id           0
meter                 0
meter_reading         0
site_id               0
primary_use           0
square_feet           0
year_built            0
floor_count           0
air_temperature       0
dew_temperature       0
precip_depth_1_hr     0
sea_level_pressure    0
wind_direction        0
wind_speed            0
log_square_feet       0
year_built_age        0
temp_x_sqft           0
dtype: int64


In [12]:
# Re-impute the missing values in the training and testing sets again

# Check for any NaN values in X_train and X_test
print("\nChecking for NaN values in X_train:")
print(X_train.isnull().sum())
print("\nChecking for NaN values in X_test:")
print(X_test.isnull().sum())

# Reimpute any remaining NaN values if they exist
from sklearn.impute import SimpleImputer

# Imputer to handle NaN values by filling them with the median for numeric features
imputer = SimpleImputer(strategy="median")

# Apply the imputer on X_train and X_test
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Verify that there are no more NaN values
print("\nChecking for NaN values in X_train_imputed:")
print(X_train_imputed.isnull().sum())
print("\nChecking for NaN values in X_test_imputed:")
print(X_test_imputed.isnull().sum())


Checking for NaN values in X_train:
building_id                0
meter                      0
site_id                    0
square_feet                0
year_built                 0
floor_count                0
air_temperature         3171
dew_temperature         3664
precip_depth_1_hr     140385
sea_level_pressure     52948
wind_direction         36677
wind_speed              5168
log_square_feet            0
year_built_age             0
temp_x_sqft             3171
dtype: int64

Checking for NaN values in X_test:
building_id               0
meter                     0
site_id                   0
square_feet               0
year_built                0
floor_count               0
air_temperature         751
dew_temperature         868
precip_depth_1_hr     35085
sea_level_pressure    13269
wind_direction         9195
wind_speed             1199
log_square_feet           0
year_built_age            0
temp_x_sqft             751
dtype: int64

Checking for NaN values in X_train_imputed:
b

In [13]:
# Now proceed with model training
gbr = GradientBoostingRegressor()
voting_reg = VotingRegressor(estimators=[
    ('gbr', gbr), 
    ('dtr', DecisionTreeRegressor())
])

# Train the Gradient Boosting Regressor
print("\nTraining Gradient Boosting Regressor...")
gbr.fit(X_train_imputed, y_train)

# Train the Voting Regressor
print("\nTraining Voting Regressor...")
voting_reg.fit(X_train_imputed, y_train)

# Evaluate Gradient Boosting Regressor
print("\nEvaluating Gradient Boosting Regressor...")
gbr_pred = gbr.predict(X_test_imputed)
gbr_mse = mean_squared_error(y_test, gbr_pred)
print(f"Gradient Boosting MSE: {gbr_mse}")

# Evaluate Voting Regressor
print("\nEvaluating Voting Regressor...")
voting_pred = voting_reg.predict(X_test_imputed)
voting_mse = mean_squared_error(y_test, voting_pred)
print(f"Voting Regressor MSE: {voting_mse}")


Training Gradient Boosting Regressor...

Training Voting Regressor...

Evaluating Gradient Boosting Regressor...
Gradient Boosting MSE: 905031507.5071758

Evaluating Voting Regressor...
Voting Regressor MSE: 614158642.5157735


In [14]:
# Check feature importance for Gradient Boosting Regressor ( because the MSE is higher than the voting regressor)
feature_importance = gbr.feature_importances_
feature_names = X_train.columns

# Sort features by importance
sorted_idx = np.argsort(feature_importance)[::-1]

print("\nFeature Importance (Gradient Boosting Regressor):")
for idx in sorted_idx:
    print(f"{feature_names[idx]}: {feature_importance[idx]}")


Feature Importance (Gradient Boosting Regressor):
square_feet: 0.32763620763906454
meter: 0.3141591934638807
log_square_feet: 0.14522186360261902
wind_speed: 0.1308127285997192
temp_x_sqft: 0.036608945962158544
wind_direction: 0.02787938713740081
sea_level_pressure: 0.014726105912224424
building_id: 0.0027125516351453106
dew_temperature: 0.0001511537874743181
precip_depth_1_hr: 6.763662718313961e-05
air_temperature: 1.3339209890905299e-05
floor_count: 7.065440060764971e-06
site_id: 3.8209831782511335e-06
year_built_age: 0.0
year_built: 0.0


Further model improvement (start with preprocessing, then hyperparameter tuning, random forest clc)

In [None]:
# Check for any remaining NaN values in X_train (just to be sure)
print("Checking for remaining NaN values in X_train:")
print(X_train.isnull().sum())

# Use SimpleImputer to fill any remaining missing values in X_train and X_test
from sklearn.impute import SimpleImputer

# Impute missing values with median strategy for numeric columns
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


In [17]:
from sklearn.impute import SimpleImputer

# Impute the missing weather-related features with the median
imputer = SimpleImputer(strategy='median')

# Impute the weather features (air_temperature, dew_temperature, etc.)
weather_columns = ['air_temperature', 'dew_temperature', 'precip_depth_1_hr', 
                   'sea_level_pressure', 'wind_direction', 'wind_speed', 'temp_x_sqft']
X_train[weather_columns] = imputer.fit_transform(X_train[weather_columns])
X_test[weather_columns] = imputer.transform(X_test[weather_columns])

# Verify that missing values are handled
print("Remaining missing values after imputation:")
print(X_train.isnull().sum())


Remaining missing values after imputation:
building_id           0
meter                 0
site_id               0
square_feet           0
year_built            0
floor_count           0
air_temperature       0
dew_temperature       0
precip_depth_1_hr     0
sea_level_pressure    0
wind_direction        0
wind_speed            0
log_square_feet       0
year_built_age        0
temp_x_sqft           0
dtype: int64


In [18]:
# GBR Hyperparameter tuning using grid search
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0]
}

# Perform grid search for Gradient Boosting Regressor
grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Output the best parameters from grid search
print("Best parameters found by GridSearchCV:")
print(grid_search.best_params_)

# Evaluate the tuned model
tuned_gbr = grid_search.best_estimator_
y_pred = tuned_gbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Tuned Gradient Boosting Regressor MSE: {mse}")

Best parameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Tuned Gradient Boosting Regressor MSE: 590795476.2385654


In [19]:
# importing sklearn library to use the GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor

# X_train and y_train are the full datasets with all missing values handled properly already

# 1. Retrain the Gradient Boosting Regressor (GBR) on the full dataset using the proper hyperparameters
print("Retraining Gradient Boosting Regressor with best hyperparameters on the full dataset...")
gbr = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8)
gbr.fit(X_train, y_train)

# 2. Retrain the Voting Regressor (if using multiple models)
print("Retraining Voting Regressor with Gradient Boosting and Decision Tree on the full dataset...")
decision_tree = DecisionTreeRegressor(max_depth=5)  # Adjust as needed, or use other regressors

# Combine Gradient Boosting Regressor and Decision Tree into a Voting Regressor
voting_regressor = VotingRegressor([('gbr', gbr), ('dt', decision_tree)])
voting_regressor.fit(X_train, y_train)

Retraining Gradient Boosting Regressor with best hyperparameters on the full dataset...
Retraining Voting Regressor with Gradient Boosting and Decision Tree on the full dataset...


In [20]:
from sklearn.ensemble import RandomForestRegressor

# Add Random Forest Regressor to the ensemble
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Update the Voting Regressor with Gradient Boosting, Decision Tree, and Random Forest
voting_regressor = VotingRegressor([('gbr', gbr), ('dt', decision_tree), ('rf', random_forest)])

# Retrain the Voting Regressor with the full dataset
print("Retraining Voting Regressor with Gradient Boosting, Decision Tree, and Random Forest on the full dataset...")
voting_regressor.fit(X_train, y_train)

# Evaluate the Voting Regressor
print("\nEvaluating Voting Regressor with Random Forest...")
y_pred_voting = voting_regressor.predict(X_test)
voting_mse = mean_squared_error(y_test, y_pred_voting)
print("Voting Regressor MSE (with Random Forest):", voting_mse)

# Save the model after evaluation
import joblib
joblib.dump(voting_regressor, 'voting_regressor_with_rf.pkl')
print("Model saved as 'voting_regressor_with_rf.pkl'")


Retraining Voting Regressor with Gradient Boosting, Decision Tree, and Random Forest on the full dataset...

Evaluating Voting Regressor with Random Forest...
Voting Regressor MSE (with Random Forest): 624148109.0073601
Model saved as 'voting_regressor_with_rf.pkl'
