# Libraries, Functions

In [1]:
import gc, os, random, time, datetime
from pathlib import Path
from os.path import join as pjoin

import math
import numpy as np
from itertools import combinations

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

import matplotlib.pyplot as plt

from scipy import stats
from scipy.stats import ks_2samp

from meteocalc import feels_like, Temp

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error

import lightgbm as lgb

from bayes_opt import BayesianOptimization

import warnings
warnings.simplefilter('ignore')

In [2]:
debug = False

In [3]:
malign_building_site0 = [0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
                        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
                        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
                        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
                        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
                        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
                        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
                        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
                       104]

In [4]:
def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

# Data Loading

## Train / Test, Building Data

In [5]:
%%time

train_df = pd.read_csv('train.csv', parse_dates=['timestamp'])
test_df = pd.read_csv('test.csv', parse_dates=['timestamp'])

weather_train_df = pd.read_csv('weather_train.csv', parse_dates=['timestamp'])
weather_test_df = pd.read_csv('weather_test.csv', parse_dates=['timestamp'])

building_meta_df = pd.read_csv('building_metadata.csv')

rows_to_drop = pd.read_csv('rows_to_drop.csv')
faulty_idxs = pd.read_csv('faulty_idxs.csv')
bidencoding = pd.read_csv('bidencoding.csv')

Wall time: 23.7 s


# Functions: Preprocessing & Feature Engineering

## Train / Test

In [6]:
def preprocessDf(df):
    
    # Hourly
    df['hour'] = df['timestamp'].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 23)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 23)
    
    # Weekly
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    df['weekday_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 6)
    df['weekday_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 6)
    df['isweekend'] = np.where(df['dayofweek'] > 4, True, False)
    
    # Monthly
    df['month'] = df['timestamp'].dt.month
    
    # Seasonal
    temp = pd.Series([0,0,0,1,1,1,1,1,1,0,0,0])
    temp.index += 1
    df['season'] = df.month.map(temp)
    
    # Target Log Transformation
    train_df['meter_reading_log1p'] = np.log1p(train_df['meter_reading'])
    
    # Building Target Encoding
    df_group = train_df.groupby(['building_id'])['meter_reading_log1p']
    building_mean = df_group.mean().astype(np.float16)
    building_Q1 = df_group.quantile(0.25).astype(np.float16)
    building_median = df_group.median().astype(np.float16)
    building_Q3 = df_group.quantile(0.75).astype(np.float16)
    building_min = df_group.min().astype(np.float16)
    building_max = df_group.max().astype(np.float16)
    building_std = df_group.std().astype(np.float16)
    df['building__mean'] = df['building_id'].map(building_mean)
    df['building__median'] = df['building_id'].map(building_median)
    df['building__min'] = df['building_id'].map(building_min)
    df['building__max'] = df['building_id'].map(building_max)
    df['building__range'] = df['building__max'] - df['building__min']
    df['building__std'] = df['building_id'].map(building_std)
    df['building__count'] = df['building_id'].map(bidencoding[bidencoding.columns[0]])
    df['building__Q1'] = df['building_id'].map(building_Q1)
    df['building__Q3'] = df['building_id'].map(building_Q3)
    df['building__IQR'] = df['building__Q1'] - df['building__Q3']
    del df_group; gc.collect()
    
    # Hourly Target Encoding
    df_group = train_df.groupby('hour')['meter_reading_log1p']
    hour_mean = df_group.mean().astype(np.float16)
    hour_min = df_group.min().astype(np.float16)
    hour_max = df_group.max().astype(np.float16)
    hour_std = df_group.std().astype(np.float16)
    df['hour_mean'] = df['hour'].map(hour_mean)
    df['hour_min'] = df['hour'].map(hour_min)
    df['hour_max'] = df['hour'].map(hour_max)
    df['hour_std'] = df['hour'].map(hour_std)
    del df_group; gc.collect()

## Weather Data

In [7]:
def preprocessWeather_lag(df, window=3):
    
    group_df = df.groupby('site_id')
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature',
            'precip_depth_1_hr', 'sea_level_pressure',
            'wind_speed', 'feels_like_temperature']
    
    rolled = group_df[cols].rolling(window=window, min_periods=0)
    lag_mean = rolled.mean().reset_index().astype(np.float16)
    lag_max = rolled.max().reset_index().astype(np.float16)
    lag_std = rolled.std().reset_index().astype(np.float16)
    lag_Q1 = rolled.quantile(0.25).reset_index().astype(np.float16)
    lag_skew = rolled.skew().reset_index().astype(np.float16)
    
    for col in cols:
        df[f'{col}_mean_lag{window}'] = lag_mean[col]
        df[f'{col}_max_lag{window}'] = lag_max[col]
        df[f'{col}_std_lag{window}'] = lag_std[col]
        df[f'{col}_Q1_lag{window}'] = lag_Q1[col]
        df[f'{col}_skew_lag{window}'] = lag_skew[col]

In [8]:
def preprocessWeather(df):
    
    # Cyclical Wind Direction
    df['wind_direction_sin'] = np.sin((2 * np.pi * df['wind_direction']) / 360)
    df['wind_direction_cos'] = np.cos((2 * np.pi * df['wind_direction']) / 360)
    
    # Relative Humidity
    E = 6.11 * 10.0 ** (7.5 * df['dew_temperature'] / (237.7 + df['dew_temperature']))
    Es = 6.11 * 10.0 ** (7.5 * df['air_temperature'] / (237.7 + df['air_temperature']))    
    df['relative_humidity'] = (E / Es) * 100
    del E, Es; gc.collect()
    
    # Feels-Like Temperature
    temp = np.zeros((df.air_temperature.shape[0], 2))
    for i,j in enumerate(df.air_temperature.index):
        assert i == j, print('feels like temperature: index doesnt equal range')
        temp[i, 0] = j
        temp[i, 1] = feels_like(Temp(df.air_temperature[i], 'c'), df.relative_humidity[i], df.wind_speed[i]).c
    df['test_1'] = temp[:, 0]
    df['test_2'] = temp[:, 1]
    assert ((df.test_1.reset_index()['index'] == df.test_1).astype(int).sum() == df.test_1.shape[0]), print('oops')
    df['feels_like_temperature'] = df['test_2']
    del df['test_1'], df['test_2']; gc.collect()
    
    # Discomfort Index
    _1 = 0.81 * df['air_temperature']
    _2 = 0.01 * df['relative_humidity']
    _3 = 0.99 * df['air_temperature']
    df['discomfort_index'] = _1 + (_2 * (_3 - 14.3)) + 46.3
    del _1, _2, _3; gc.collect()
    
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature',
            'precip_depth_1_hr', 'sea_level_pressure',
            'wind_direction_sin', 'wind_direction_cos',
            'wind_speed', 'feels_like_temperature', 'site_id']
    
    # Time-Series Interpolation (Linear and 3rd-Order)
    temp_1 = df.groupby('site_id').apply(lambda group: group.interpolate(method='linear', limit_direction='both'))
    temp_2 = df.groupby('site_id').apply(lambda group: group.interpolate(method='polynomial', order=3, limit_direction='both'))
    temp = (temp_1[cols] + temp_2[cols]) / 2
    assert df[cols].shape[1] == temp.shape[1]
    df[cols] = temp
    del temp_1, temp_2
    for col in cols:
        temp[col].fillna(temp[col].median(), inplace=True)
    
    # Oneth Decimal Value
    modify = np.vectorize(math.modf)
    oneth, tenth = modify(df['feels_like_temperature'] / 10)
    df['feel_oneth'] = oneth * 10
    
    # Past Values
    for col in (set(cols) - {'site_id'}):
        df['{}_isnan'.format(col)] = df[col].isnull().astype(int)
        df[f'{col}_shift_12'] = df[col].shift(12)
        df[f'{col}_shift_24'] = df[col].shift(24)
        df[f'{col}_shift_48'] = df[col].shift(48)
        df[f'{col}_shift_72'] = df[col].shift(72)
    
    gc.collect()

## Building Meta-Data

In [9]:
def preprocessBuilding(df):
    df['square_feet'] = np.log1p(df['square_feet'])
    df['sqft-x-yearbuilt'] = df['square_feet'] / df['year_built']
    df['sqft_*_floorcount'] = df['square_feet'] * df['floor_count']
    gc.collect()

# Preprocessing & Feature Engineering

## Weather Data Timestamp Alignment

In [10]:
_dtypes = {
    'site_id': np.uint8,
    'air_temperature': np.float32,
    'cloud_coverage': np.float32,
    'dew_temperature': np.float32,
    'precip_depth_1_hr': np.float32,
    'sea_level_pressure': np.float32,
    'wind_direction': np.float32,
    'wind_speed': np.float32,
}

weather_train = pd.read_csv('weather_train.csv',
                            dtype=_dtypes,
                            parse_dates=['timestamp'])
weather_test = pd.read_csv('weather_test.csv',
                           dtype=_dtypes,
                           parse_dates=['timestamp'])
weather = pd.concat([weather_train, weather_test], ignore_index=True)
del _dtypes, weather_train, weather_test; gc.collect()

_key = ['site_id', 'timestamp']

temp = weather[_key + ['air_temperature']].drop_duplicates(subset=_key).sort_values(by=_key).copy()
temp['temp_rank'] = temp.groupby(['site_id', temp.timestamp.dt.date])['air_temperature'].rank('average')

temp = temp.groupby(['site_id', temp.timestamp.dt.hour])['temp_rank'].mean().unstack(level=1)
temp = pd.Series(temp.values.argmax(axis=1) - 14)
temp.index.name = 'site_id'

In [11]:
def timestamp_align(df, temp):
    
    df['offset'] = df.site_id.map(temp)
    df['timestamp_aligned'] = (df.timestamp - pd.to_timedelta(df.offset, unit='H'))
    df['timestamp'] = df['timestamp_aligned']
    
    del df['timestamp_aligned']; gc.collect()
    return df

In [12]:
%%time
weather_train_df = timestamp_align(weather_train_df, temp)
gc.collect()

Wall time: 69.5 ms


0

In [13]:
%%time
preprocessWeather(weather_train_df)
preprocessWeather_lag(weather_train_df, window=3)
#preprocessWeather_lag(weather_train_df, window=12)
preprocessWeather_lag(weather_train_df, window=48)
#preprocessWeather_lag(weather_train_df, window=72)
#preprocessWeather_lag(weather_train_df, window=120)

Wall time: 17.6 s


## Training Data

In [14]:
%%time

train_df['date'] = train_df['timestamp'].dt.date

try:
    train_df.drop(index=(rows_to_drop['0']), inplace=True)
except KeyError:
    pass

preprocessDf(train_df)

Wall time: 46.3 s


## Building Meta Data

In [15]:
primary_use_list = building_meta_df['primary_use'].unique()
primary_use_dict = {key: value for value, key in enumerate(primary_use_list)}
building_meta_df['primary_use'] = building_meta_df['primary_use'].map(primary_use_dict)
preprocessBuilding(building_meta_df)
del primary_use_list, primary_use_dict; gc.collect()

0

## Reduce Memory Usage

In [16]:
reduce_mem_usage(train_df, use_float16=True)
reduce_mem_usage(building_meta_df, use_float16=True)
reduce_mem_usage(weather_train_df, use_float16=True)
gc.collect()

Memory usage of dataframe is 2842.55 MB
Memory usage after optimization is: 1265.40 MB
Decreased by 55.5%
Memory usage of dataframe is 0.09 MB
Memory usage after optimization is: 0.02 MB
Decreased by 78.0%
Memory usage of dataframe is 78.91 MB
Memory usage after optimization is: 43.19 MB
Decreased by 45.3%


0

## Testing Data

In [17]:
%%time
preprocessDf(test_df)

Wall time: 46.6 s


## Weather Data (Timestamp Alignment & Features)

In [18]:
%%time
weather_test_df = timestamp_align(weather_test_df, temp)
preprocessWeather(weather_test_df)
preprocessWeather_lag(weather_test_df, window=3)
preprocessWeather_lag(weather_test_df, window=12)
preprocessWeather_lag(weather_test_df, window=48)
preprocessWeather_lag(weather_test_df, window=72)
preprocessWeather_lag(weather_test_df, window=120)

Wall time: 39 s


## Reduce Memory Usage

In [19]:
%%time
reduce_mem_usage(test_df, use_float16=True)
reduce_mem_usage(weather_test_df, use_float16=True)
gc.collect()

Memory usage of dataframe is 5209.34 MB
Memory usage after optimization is: 2266.66 MB
Decreased by 56.5%
Memory usage of dataframe is 212.05 MB
Memory usage after optimization is: 158.64 MB
Decreased by 25.2%
Wall time: 7.31 s


0

# Pre-Training Processing

In [20]:
category_cols = ['isweekend', 'building_id', 'site_id', 'primary_use']

In [21]:
common_cols = [
    'air_temperature', 'dew_temperature',
    'hour_cos', 'hour_sin',
    'air_temperature_mean_lag48',
    'weekday_sin', 'weekday_cos', 'dayofweek',
    'feels_like_temperature', 'square_feet', 'sqft_*_floorcount', 'sqft-x-yearbuilt', 'relative_humidity',
    'sea_level_pressure',
    'building__mean', 'building__median', 'building__Q3', 'building__Q1', 'building__min', 'building__IQR', 'building__range', 'building__max', 'building__std', 'building__count',
    'feels_like_temperature_Q1_lag48', 'feels_like_temperature_max_lag48',
    'feels_like_temperature_Q1_lag3', 'feels_like_temperature_max_lag3',
    'hour_mean', 'hour_min', 'hour_max', 'hour_std',
    'row_na'
]

In [22]:
def returnTrains(train_df, target_meter):
    
    target_train_df = train_df[train_df['meter'] == target_meter]
    target_train_df = target_train_df.merge(building_meta_df, on='building_id', how='left')
    target_train_df = target_train_df.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
    
    target_train_df['row_na'] = target_train_df.isna().sum(axis=1)
    target_train_df['feels_like_temperature'] = np.log1p(target_train_df['feels_like_temperature'])
    target_train_df['air_temperature'] = np.log1p(target_train_df['air_temperature'])
    target_train_df['dew_temperature'] = np.log1p(target_train_df['dew_temperature'])
    
    X_train_1 = target_train_df[common_cols + category_cols]
    y_train_1 = target_train_df['meter_reading_log1p'].values

    df = pd.Series(y_train_1).copy()
    temp = np.where(df==0, 0, 1)
    df = pd.DataFrame(X_train_1).copy()
    del X_train_1['building__min']
    
    del target_train_df
    gc.collect()
    return X_train_1, y_train_1, df, temp

In [23]:
def returnTests(test_df, target_meter, train_df):
    
    target_test_df = test_df[test_df['meter'] == target_meter]
    target_test_df = target_test_df.merge(building_meta_df, on='building_id', how='left')
    target_test_df = target_test_df.merge(weather_test_df, on=['site_id', 'timestamp'], how='left')
    
    target_train_df = train_df[train_df['meter'] == target_meter]
    target_train_df = target_train_df.merge(building_meta_df, on='building_id', how='left')
    target_train_df = target_train_df.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
    target_test_df['row_na'] = target_test_df.isna().sum(axis=1)
    target_test_df['feels_like_temperature'] = np.log1p(target_test_df['feels_like_temperature'])
    target_test_df['air_temperature'] = np.log1p(target_test_df['air_temperature'])
    target_test_df['dew_temperature'] = np.log1p(target_test_df['dew_temperature'])

    X_test = target_test_df[common_cols + category_cols]
    return X_test

In [24]:
%%time
X_train_0, y_train_0, x_temp_0, temp_0 = returnTrains(train_df, target_meter=0)
X_train_1, y_train_1, x_temp_1, temp_1 = returnTrains(train_df, target_meter=1)
X_train_2, y_train_2, x_temp_2, temp_2 = returnTrains(train_df, target_meter=2)
X_train_3, y_train_3, x_temp_3, temp_3 = returnTrains(train_df, target_meter=3)

Wall time: 2min 15s


# GBDT (Gradient-Boosting Decision Tree Regression Model): LightGBM

In [33]:
def fitLgbm(train, valid, seed=None,
            objective='regression', metric='huber',
            cat_features=None, num_rounds=3000, lr=0.3, bf=0.1, ff=0.7, l1=0, reg=2, num=32+16, hessian=1e-3, min_data=20, max_depth=-1):
    
    X_train, y_train = train
    X_valid, y_valid = valid
    
    params = {'num_leaves': num,
              'objective': objective,
              'learning_rate': lr,
              "boosting": "gbdt",
              "bagging_freq": 5,
              "bagging_fraction": bf,
              "feature_fraction": ff,
              'min_child_weight': hessian,   
              'min_data_in_leaf': min_data,
              'max_depth': max_depth,
              "metric": metric,
              'reg_alpha': l1,
              'reg_lambda': reg,
              'verbose': -1}
    
    params['seed'] = seed
    d_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
    d_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)
    watchlist = [d_train, d_valid]

    evals_result = {}
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist, evals_result=evals_result,
                      verbose_eval=500, early_stopping_rounds=20)

    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    
    log = {'train/mae': model.best_score['training'][f'{metric}'],
           'valid/mae': model.best_score['valid_1'][f'{metric}']}
    
    return model, y_pred_valid, log, evals_result

In [26]:
def plot_feature_importance(model):
    importance_df = pd.DataFrame(model.feature_importance(),
                                 index=common_cols + category_cols,# + temp_cols,
                                 columns=['importance']).sort_values('importance')
    fig, ax = plt.subplots(figsize=(10, 13))
    importance_df.plot.barh(ax=ax)
    fig.show()

## External Prediction Function

In [27]:
def predictTest(X_test, models, batch_size=1_000_000):
    
    iterations = (X_test.shape[0] + batch_size -1) // batch_size
    
    y_test_pred_total = np.zeros(X_test.shape[0])
    for i, model in enumerate(models):
        print(f'predicting w/ {i}-th model')
        for k in range(iterations):
            y_pred_test = model.predict(X_test[k*batch_size:(k+1)*batch_size], num_iteration=model.best_iteration)
            y_test_pred_total[k*batch_size:(k+1)*batch_size] += y_pred_test

    y_test_pred_total /= len(models)
    return y_test_pred_total

# Train Models

In [45]:
folds = 2
seed = None
kf = KFold(n_splits=folds, shuffle=False)
tscv = TimeSeriesSplit(n_splits=folds)

In [34]:
target_meter = 0
y_valid_pred_total = np.zeros(X_train_0.shape[0])
added_col = ['feels_like_temperature_pct_change_3']

models0 = []
for train_idx, valid_idx in kf.split(X_train_0, y_train_0):
    
    train_data = X_train_0.iloc[train_idx,:], y_train_0[train_idx]
    valid_data = X_train_0.iloc[valid_idx,:], y_train_0[valid_idx]

    print('train:', len(train_idx), '| valid:', len(valid_idx))
    model, y_pred_valid, log, evals_result_0 = fitLgbm(train_data, valid_data,
                                                       seed=seed,
                                                       cat_features=category_cols,
                                                       num_rounds=350*5,#*5,
                                                       lr=0.05773181964275955,#/5,
                                                       bf=0.26796569889874977,
                                                       ff=0.3936272203346429,
                                                       l1=3.5478121904764905,
                                                       reg=28.620086849549573,
                                                       num=int(492.6066450470124),
                                                       hessian=0.02297650760194559,
                                                       min_data=int(167.2740900162622),
                                                       max_depth=int(0.642355829229224))
    models0.append(model)
    gc.collect()
    if debug:
        break

train: 5763682 | valid: 5763683
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[130]	training's huber: 0.0296853	valid_1's huber: 0.0621339
train: 5763683 | valid: 5763682
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[176]	training's huber: 0.0269232	valid_1's huber: 0.0605684


In [46]:
CV = [models0[0].best_score['valid_1']['huber'], models0[1].best_score['valid_1']['huber']]
TR = [models0[0].best_score['training']['huber'], models0[1].best_score['training']['huber']]

In [38]:
target_meter = 1
y_valid_pred_total = np.zeros(X_train_1.shape[0])

models1 = []
for train_idx, valid_idx in kf.split(X_train_1, y_train_1):
    
    train_data = X_train_1.iloc[train_idx,:], y_train_1[train_idx]
    valid_data = X_train_1.iloc[valid_idx,:], y_train_1[valid_idx]

    print('train:', len(train_idx), '| valid:', len(valid_idx))
    model, y_pred_valid, log, evals_result_1 = fitLgbm(train_data, valid_data,
                                                       seed=seed,
                                                       cat_features=category_cols,
                                                       num_rounds=235*5,#*5,
                                                       lr=0.03777627184627892,#/5,
                                                       bf=0.6441781260868563,
                                                       ff=0.6715453909673709,
                                                       l1=12.015431047748624,
                                                       reg=0.5414451954639196,
                                                       num=int(425.1833449814948),
                                                       hessian=0.02432008332439737,
                                                       min_data=int(130.06407988656792),
                                                       max_depth=int(0.2769423857727893))
    models1.append(model)
    gc.collect()
    if debug:
        break

train: 1954459 | valid: 1954460
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[210]	training's huber: 0.126169	valid_1's huber: 0.303959
train: 1954460 | valid: 1954459
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[193]	training's huber: 0.145252	valid_1's huber: 0.293306


In [47]:
CV.append(models1[0].best_score['valid_1']['huber'])
CV.append(models1[1].best_score['valid_1']['huber'])
TR.append(models1[0].best_score['training']['huber'])
TR.append(models1[1].best_score['training']['huber'])

In [40]:
target_meter = 2
y_valid_pred_total = np.zeros(X_train_2.shape[0])

models2 = []
for train_idx, valid_idx in kf.split(X_train_2, y_train_2):
    
    train_data = X_train_2.iloc[train_idx,:], y_train_2[train_idx]
    valid_data = X_train_2.iloc[valid_idx,:], y_train_2[valid_idx]

    print('train:', len(train_idx), '| valid:', len(valid_idx))
    model, y_pred_valid, log, evals_result_2 = fitLgbm(train_data, valid_data,
                                                       seed=seed,
                                                       cat_features=category_cols,
                                                       num_rounds=165*5,#*5,
                                                       lr=0.08646906661180581,#/5,
                                                       bf=0.7189802865959849,
                                                       ff=0.7085556468373223,
                                                       l1=22.81995553164059,
                                                       reg=6.38542313999785,
                                                       num=int(451.26369424324986),
                                                       hessian=0.39439907915778355,
                                                       min_data=int(25.424846207146917),
                                                       max_depth=int(24.84006803870966))
    models2.append(model)
    gc.collect()
    if debug:
        break

train: 1320197 | valid: 1320197
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[63]	training's huber: 0.235449	valid_1's huber: 0.391183
train: 1320197 | valid: 1320197
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[61]	training's huber: 0.22551	valid_1's huber: 0.427714


In [48]:
CV.append(models2[0].best_score['valid_1']['huber'])
CV.append(models2[1].best_score['valid_1']['huber'])
TR.append(models2[0].best_score['training']['huber'])
TR.append(models2[1].best_score['training']['huber'])

In [42]:
target_meter = 3
y_valid_pred_total = np.zeros(X_train_3.shape[0])

models3 = []
for train_idx, valid_idx in kf.split(X_train_3, y_train_3):
    train_data = X_train_3.iloc[train_idx,:], y_train_3[train_idx]
    valid_data = X_train_3.iloc[valid_idx,:], y_train_3[valid_idx]

    print('train:', len(train_idx), '| valid:', len(valid_idx))
    model, y_pred_valid, log, evals_result_3 = fitLgbm(train_data, valid_data,
                                                       seed=seed,
                                                       cat_features=category_cols,
                                                       num_rounds=175*5,#*5,
                                                       lr=0.12281564370554858,#/5,
                                                       bf=0.8421331995739937,
                                                       ff=0.28732134167029566,
                                                       l1=1.510625394641553,
                                                       reg=25.660908491738372,
                                                       num=int(28.514072092238653),
                                                       hessian=0.036216698203522454,
                                                       min_data=int(21.61115352418879),
                                                       max_depth=int(31.719112307783114))
    models3.append(model)
    gc.collect()
    if debug:
        break

train: 576581 | valid: 576582
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[173]	training's huber: 0.399342	valid_1's huber: 0.600572
train: 576582 | valid: 576581
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[162]	training's huber: 0.358418	valid_1's huber: 0.6095


In [49]:
CV.append(models3[0].best_score['valid_1']['huber'])
CV.append(models3[1].best_score['valid_1']['huber'])
TR.append(models3[0].best_score['training']['huber'])
TR.append(models3[1].best_score['training']['huber'])

## Cross-Validation Score

In [51]:
print('meter0: CVbar: {}. TRbar: {}'.format(np.mean(CV[:2]), np.mean(TR[:2])))
print('meter1: CVbar: {}. TRbar: {}'.format(np.mean(CV[2:4]), np.mean(TR[2:4])))
print('meter2: CVbar: {}. TRbar: {}'.format(np.mean(CV[4:6]), np.mean(TR[4:6])))
print('meter3: CVbar: {}. TRbar: {}'.format(np.mean(CV[6:]), np.mean(TR[6:])))
temp = np.mean(CV[:2]) * (11714696 / 19869886)
temp += np.mean(CV[2:4]) * (4182440 / 19869886)
temp += np.mean(CV[4:6]) * (2708713 / 19869886)
temp += np.mean(CV[6:]) * (1264037 / 19869886)
print("Cross Validation Score: {}".format(temp))

meter0: CVbar: 0.06135112777600039. TRbar: 0.028304226044513166
meter1: CVbar: 0.2986326935039836. TRbar: 0.13571045126706535
meter2: CVbar: 0.40944859533243094. TRbar: 0.23047992520925667
meter3: CVbar: 0.6050358110063654. TRbar: 0.3788802606388617
Cross Validation Score: 0.19333727018954966


# Predict

In [65]:
y_test = {}
for i in range(4):
    print(f'Generating meter #{i} test data')
    X_test = returnTests(test_df, target_meter=i, train_df=train_df)
    del X_test['building__min']
    y_test[i] = np.expm1(predictTest(X_test, models0))
    del X_test; gc.collect()

Generating meter #0 test data
predicting w/ 0-th model
predicting w/ 1-th model
Generating meter #1 test data
predicting w/ 0-th model
predicting w/ 1-th model
Generating meter #2 test data
predicting w/ 0-th model
predicting w/ 1-th model
Generating meter #3 test data
predicting w/ 0-th model
predicting w/ 1-th model


In [66]:
name = 'submission.csv'

In [68]:
sample_submission = pd.read_csv('sample_submission.csv')
reduce_mem_usage(sample_submission)

for i in range(4):
    sample_submission.loc[test_df['meter'] == i, 'meter_reading'] = y_test[i]

sample_submission['meter_reading'] = sample_submission['meter_reading'].clip(lower=0)
display(sample_submission)
# sample_submission.to_csv(name, index=False, float_format='%.4f')

Memory usage of dataframe is 636.26 MB
Memory usage after optimization is: 198.83 MB
Decreased by 68.7%


Unnamed: 0,row_id,meter_reading
0,0,194.289928
1,1,87.892822
2,2,9.588913
3,3,317.116782
4,4,1326.458052
...,...,...
41697595,41697595,6.424563
41697596,41697596,4.249002
41697597,41697597,6.953852
41697598,41697598,184.757973
