In [16]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from lightgbm import LGBMRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import gc
import warnings
from sklearn.model_selection import TimeSeriesSplit
from sklearn.base import BaseEstimator, RegressorMixin, clone

In [36]:
# Read the training and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_final.csv', 'test_final.csv')
)

In [37]:
# The two columns with null values do not have a high feature importance,
# so try dropping them from both the training and the test frame.
col_to_drop = ['year_built', 'floor_count']

train, test = (
    frame.drop(col_to_drop, axis="columns") for frame in (train_df, test_df)
)

**Per-meter models**

Original code from Kaggle notebook: https://www.kaggle.com/purist1024/strategy-evaluation-what-helps-and-by-how-much by @purist1024

The CatSplitRegressor wrapper class hides the process of splitting the training and validation data up by unique values of a specified column (i.e. "meter"), using a different sub-model for each value, and then re-integrating the resulting predictions.

In [38]:
class CatSplitRegressor(BaseEstimator, RegressorMixin):
    """Train one independent copy of ``model`` per distinct value of column ``col``.

    Parameters
    ----------
    model : estimator
        Template regressor. It is never fitted itself; ``fit`` clones it once
        for every distinct value found in ``X[col]``.
    col : str
        Name of the column of ``X`` whose values partition the rows. The column
        is removed from the data handed to each sub-model.

    Attributes set by ``fit``
    -------------------------
    fitted : dict
        Maps each distinct value of ``col`` to its fitted sub-model.
    feature_importances_ : list
        Unweighted mean of the sub-models' importances, with a ``0`` inserted
        at the position of ``col`` so the length matches ``X.columns``.
    """

    def __init__(self, model, col):
        self.model = model
        self.col = col

    def fit(self, X, y):
        """Fit one clone of the template model on each ``col`` partition of ``X``.

        ``X`` must be a DataFrame containing ``col``. ``y`` may be a pandas
        object (rows are matched to each partition by index label) or any
        array-like of the same length as ``X`` (rows are matched by position).
        Returns ``self``.
        """
        self.fitted = {}
        importances = []
        for val in X[self.col].unique():
            mask = X[self.col] == val
            X1 = X[mask].drop(columns=[self.col])
            if hasattr(y, "reindex_like"):
                # pandas target: align on index labels with the partition.
                y1 = y.reindex_like(X1)
            else:
                # plain array-like target: select the same rows positionally.
                y1 = np.asarray(y)[mask.to_numpy()]
            self.fitted[val] = clone(self.model).fit(X1, y1)
            importances.append(self.fitted[val].feature_importances_)
            # Release the partition before the next one is materialised.
            del X1, y1
        fi = np.average(importances, axis=0)
        # Sub-models never saw `col`; pad with 0 at its position so the
        # importances line up with the columns of X.
        col_index = list(X.columns).index(self.col)
        self.feature_importances_ = [*fi[:col_index], 0, *fi[col_index:]]
        return self

    def predict(self, X):
        """Predict every row of ``X`` with the sub-model fitted for its ``col`` value.

        Returns a float array of length ``len(X)`` in the row order of ``X``.

        Raises
        ------
        KeyError
            If ``X[col]`` contains a value for which no sub-model was fitted.
        """
        result = np.zeros(len(X))
        for val in X[self.col].unique():
            if val not in self.fitted:
                raise KeyError(
                    f"no sub-model was fitted for {self.col!r} value {val!r}")
            # Positional row numbers, so the result is independent of X's index.
            ix = np.flatnonzero((X[self.col] == val).to_numpy())
            result[ix] = self.fitted[val].predict(
                X.iloc[ix].drop(columns=[self.col]))
        return result

    def get_score(self):
        """Return the ``best_score`` attribute of the template model.

        NOTE(review): ``fit`` only fits clones, so ``self.model`` is never
        fitted by this class, and the ``LGBMWrapper`` used in this notebook
        does not define ``best_score``; this call therefore looks like it will
        raise ``AttributeError`` -- confirm the intended behaviour (perhaps a
        per-value score taken from ``self.fitted``).
        """
        return self.model.best_score

In [39]:
# Columns that are declared to LightGBM as categorical features.
categorical_columns = [
    "building_id",
    "site_id",
    "meter",
    "primary_use",
    "day",
    "month",
    "hour",
    "weekday",
    "is_holiday",
    "weekend",
]

**LGBM category awareness**

Original code from Kaggle notebook: https://www.kaggle.com/purist1024/strategy-evaluation-what-helps-and-by-how-much by @purist1024

LightGBM uses some very effective strategies to produce better splits when it knows that a feature is categorical. Since we have a reasonable number of integer-valued features that are, in fact, unordered or cyclical categoricals, we benefit from explicitly declaring them to LightGBM.

Unfortunately, the scikit wrapper from LGBM doesn't handle these declarations gracefully, and prints a warning that we've overridden its default guesses. We include extra code here to silence those warnings.

In [43]:
class LGBMWrapper(BaseEstimator, RegressorMixin):
    """LGBMRegressor wrapper that declares categorical features at fit time.

    Parameters
    ----------
    categorical_feature : list of str or None
        Column names to declare as categorical. Only the names actually
        present in the ``X`` passed to ``fit`` are forwarded, so the list may
        mention columns that have been removed upstream. ``None`` leaves the
        choice to LightGBM.
    **params
        Forwarded unchanged to ``LGBMRegressor``.
    """

    def __init__(self, categorical_feature=None, **params):
        self.model = LGBMRegressor(**params)
        self.categorical_feature = categorical_feature

    def fit(self, X, y):
        """Fit the wrapped regressor on DataFrame ``X`` and target ``y``; returns ``self``."""
        cats = None if self.categorical_feature is None else list(
            X.columns.intersection(self.categorical_feature))
        fit_kwargs = {} if cats is None else {"categorical_feature": cats}
        with warnings.catch_warnings():
            # Silence the warning emitted when the categorical declaration
            # overrides LightGBM's own guess; the filter is scoped to this fit.
            warnings.filterwarnings("ignore",
                                    "categorical_feature in Dataset is overridden".lower())
            self.model.fit(X, y, **fit_kwargs)
        self.feature_importances_ = self.model.feature_importances_
        return self

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        return self.model.predict(X)

    def get_params(self, deep=True):
        """Return the wrapped regressor's parameters plus ``categorical_feature``."""
        return {**self.model.get_params(deep), "categorical_feature": self.categorical_feature}

    def set_params(self, **params):
        """Update parameters and return ``self``.

        ``categorical_feature`` is stored on the wrapper (an explicit ``None``
        resets it); every other key is forwarded to the wrapped regressor.
        """
        if "categorical_feature" in params:
            self.categorical_feature = params.pop("categorical_feature")
        # Must be unpacked: the estimator's set_params accepts keywords only.
        self.model.set_params(**params)
        return self

In [41]:
# Separate the feature matrix from the target, which is modelled on a log1p scale.
features = train.drop(columns=['meter_reading'])
target = np.log1p(train['meter_reading'])

# Drop the reference to the combined frame and force a garbage collection.
del train
gc.collect()

280

In [44]:
# One LightGBM regressor per distinct value of the "meter" column.
model = CatSplitRegressor(
    model=LGBMWrapper(
        categorical_feature=categorical_columns,
        learning_rate=0.06,
        max_depth=12,
        num_leaves=2936,
        min_child_weight=5.54,
        min_split_gain=0.0875,
        reg_alpha=3.9,
        reg_lambda=2.3,
        random_state=0,
        n_jobs=-1,
    ),
    col="meter",
)

model.fit(features, target)

defaultdict(<class 'collections.OrderedDict'>, {})
[51676   585   418 11594  9500 29236 17778  3956 16470  1612 11616  4451
  1693 22825  6605  9062     0  3063     0]
None
defaultdict(<class 'collections.OrderedDict'>, {})
[26919    54    80 12845  9967 16202 17155  7831 20197  3317 26361 11948
  5204 19429 16441 14393     0  2150     0]
None
defaultdict(<class 'collections.OrderedDict'>, {})
[47031   709   179 16444 10825 23214 21166  8879 23977  2881 22751 10238
  3731 21555 13236 13296     0  3161     0]
None
defaultdict(<class 'collections.OrderedDict'>, {})
[31973   312   289 13637  9298 16408 18407  9283 19854  3861 21653 11706
  5239 19544 14689 14906     0  2270     0]
None


CatSplitRegressor(col='meter',
                  model=LGBMWrapper(boosting_type='gbdt',
                                    categorical_feature=['building_id',
                                                         'site_id', 'meter',
                                                         'primary_use', 'day',
                                                         'month', 'hour',
                                                         'weekday',
                                                         'is_holiday',
                                                         'weekend'],
                                    class_weight=None, colsample_bytree=1.0,
                                    importance_type='split', learning_rate=0.06,
                                    max_depth=12, min_child_samples=20,
                                    min_child_weight=5.54,
                                    min_split_gain=0.0875, n_estimators=100,
                                   

In [45]:
# Show the first rows of the test frame to inspect its columns before predicting.
test.head()

Unnamed: 0,row_id,building_id,meter,site_id,primary_use,square_feet,day,month,air_temperature,cloud_coverage,...,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,relative_humidity,feels_like,weekday,weekend,is_holiday
0,0,0,0,0,0,8.913685,6,1,17.8,4.0,...,0.4233,1021.5,100.0,3.6,0,67.5,12.26,6,1,0
1,1,1,0,0,0,7.908755,6,1,17.8,4.0,...,0.4233,1021.5,100.0,3.6,0,67.5,12.26,6,1,0
2,2,2,0,0,0,8.589886,6,1,17.8,4.0,...,0.4233,1021.5,100.0,3.6,0,67.5,12.26,6,1,0
3,3,3,0,0,0,10.072639,6,1,17.8,4.0,...,0.4233,1021.5,100.0,3.6,0,67.5,12.26,6,1,0
4,4,4,0,0,0,11.666573,6,1,17.8,4.0,...,0.4233,1021.5,100.0,3.6,0,67.5,12.26,6,1,0


In [21]:
# Remove the row_id column from the test frame before it is passed to predict.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run once the column is already gone.
test = test.drop(columns=['row_id'], errors='ignore')

In [22]:
# Build the submission frame. row_id is read from the untouched test_df rather
# than from the positional index, so it stays correct even if the file is not
# stored in row_id order. Predictions are mapped back from the log1p scale and
# clipped at zero.
predictions = pd.DataFrame({
    "row_id": test_df["row_id"].to_numpy(),
    "meter_reading": np.clip(np.expm1(model.predict(test)), 0, None)
})

In [24]:
# Sanity check: expect one row per test row and two columns (row_id, meter_reading).
predictions.shape

(41697600, 2)

In [23]:
# Write the submission file without the DataFrame index; floats are formatted to four decimals.
predictions.to_csv("submission6.csv", index=False, float_format="%.4f")

**Score: 1.09**

In [46]:
# Second per-meter configuration with a different set of LightGBM hyper-parameters.
model_2 = CatSplitRegressor(
    model=LGBMWrapper(
        categorical_feature=categorical_columns,
        learning_rate=0.06,
        max_depth=12,
        num_leaves=2150,
        min_child_weight=48,
        min_split_gain=0.0578,
        reg_alpha=3.85,
        reg_lambda=2.26,
        random_state=0,
        n_jobs=-1,
    ),
    col="meter",
)

model_2.fit(features, target)

defaultdict(<class 'collections.OrderedDict'>, {})
[52223   559   390 11452  9227 29548 15749  3474 14338  1125  9434  3744
  1342 22940  5348  7217     0  2242     0]
None
defaultdict(<class 'collections.OrderedDict'>, {})
[26759    54    98 11030 10102 16116 14288  6419 16529  2462 21002  9406
  3948 19995 12574 11518     0  1466     0]
None
defaultdict(<class 'collections.OrderedDict'>, {})
[46232   745   158 13289 10720 22814 17380  6756 18378  1883 16265  7316
  2538 21093  9272  9603     0  2010     0]
None
defaultdict(<class 'collections.OrderedDict'>, {})
[31860   334   277 11421  9395 16723 14908  7446 16206  2630 16242  8730
  3674 19453 10726 11417     0  1515     0]
None


CatSplitRegressor(col='meter',
                  model=LGBMWrapper(boosting_type='gbdt',
                                    categorical_feature=['building_id',
                                                         'site_id', 'meter',
                                                         'primary_use', 'day',
                                                         'month', 'hour',
                                                         'weekday',
                                                         'is_holiday',
                                                         'weekend'],
                                    class_weight=None, colsample_bytree=1.0,
                                    importance_type='split', learning_rate=0.06,
                                    max_depth=12, min_child_samples=20,
                                    min_child_weight=48, min_split_gain=0.0578,
                                    n_estimators=100, n_jobs=-1,
                          

In [48]:
# Make sure row_id is absent before predicting. An earlier cell may already
# have removed it, so errors='ignore' prevents a KeyError when the notebook is
# run top to bottom.
test = test.drop(columns=['row_id'], errors='ignore')

In [49]:
# Build the second submission frame. row_id is read from the untouched test_df
# rather than from the positional index, so it stays correct even if the file
# is not stored in row_id order. Predictions are mapped back from the log1p
# scale and clipped at zero.
predictions_2 = pd.DataFrame({
    "row_id": test_df["row_id"].to_numpy(),
    "meter_reading": np.clip(np.expm1(model_2.predict(test)), 0, None)
})

In [50]:
# Sanity check: expect one row per test row and two columns (row_id, meter_reading).
predictions_2.shape

(41697600, 2)

In [51]:
# Write the submission file without the DataFrame index; floats are formatted to four decimals.
predictions_2.to_csv("submission7.csv", index=False, float_format="%.4f")

**Score: 1.09**