***Ashrae utility consumption - Kaggle Competition***

This notebook was created in order to participate in the Kaggle competition for the Ashrae utility consumption prediction. The challenge was based on predicting the next year levels of consumption for 4 utilities: hot water, cold water, steam and electricity; based on a 1 year historical data. The data was split in 5 datasets, 2 for weather (train and test), 1 for buildings identification, 1 for train data and 1 for test data. 

This project consisted on several steps on data exploration, cleaning and wrangling as well as several approaches on the modelling phase. Although it wasn't awarded any prizes, it was a very interesting experience and it served many ideas for application in real cases. 

In [0]:
'''
Importing of libraries used and definition of standards
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
import io


import random
from datetime import datetime as dt
from time import time

import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import gc
import os
from google.colab import files

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

import tqdm



pd.set_option('display.max_columns', 500)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 100


#!apt-get -qq install --no-install-recommends nvidia-375
#!apt-get -qq install --no-install-recommends nvidia-opencl-icd-375 nvidia-opencl-dev opencl-headers
#!apt-get -qq install --no-install-recommends git cmake build-essential libboost-dev libboost-system-dev libboost-filesystem-dev
#!pip3 install -qq lightgbm --install-option=--gpu

  import pandas.util.testing as tm


In [0]:
'''
Link to the Google drive repository for importing the datasets
'''
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
'''
Reading the csv files for the training data and transforming them into pandas dataframes
'''
weather_train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle/data/weather_train.csv')

build = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle/data/building_metadata.csv').fillna(-1)
le = LabelEncoder()
build.primary_use = le.fit_transform(build.primary_use)

train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle/data/train.csv')
train_dates = train.timestamp.unique()

In [0]:
'''
Reading the csv files for the testing data and transforming them into pandas dataframes
'''
build = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle/data/building_metadata.csv').fillna(-1)
le = LabelEncoder()
build.primary_use = le.fit_transform(build.primary_use)

weather_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle/data/weather_test.csv')

test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Kaggle/data/test.csv')
test_dates = test.timestamp.unique()

In [0]:
'''
Definition of the function to reduce the memory usage of the datasets so it can be manipulated without major problems.
This function was was based on this kernel https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
'''
def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [0]:
'''
This function is responsible for normalizing the registered values from one of the buildings for which the readings were found completely out of the standards,
during the exploratory data analysis. The way of normalizing the data was based on scaling the readings so they were transformed to the same order as the
remaining dataset.
'''
def regularize_build(df):
    df.loc[df[(df.building_id==1099) & (df.meter==2)].index, 'meter_reading'] = df[
        (df.building_id==1099) & (df.meter==2)]['meter_reading'].values/1e4
    df.loc[df[df.building_id==1099][df.meter==2][
        df[df.building_id==1099][df.meter==2].meter_reading < 10].index, 'meter_reading'] = 0
    return df

In [0]:
'''
This function is responsible for the feature engineering process on the weather dataset. It is comprised of several steps, such as:
- regularizing negative values for some attributes that can't be negative(logically)
- creating a set of features that describes if the values were filled originally
- regularizing the time intervals recorded on the dataset
- filling the missing values with interpolations
'''
def weather_transform(weather, dates):
    weather['wind_direction'] = np.sin(weather.wind_direction*np.pi/180)
    weather.precip_depth_1_hr = weather.precip_depth_1_hr.apply(lambda x: x if x>= 0 else None)
    
    weather['had_air'] = (pd.isna(weather.air_temperature)==False)*1
    weather['had_cloud'] = (pd.isna(weather.cloud_coverage)==False)*1
    weather['had_dew'] = (pd.isna(weather.dew_temperature)==False)*1
    weather['had_precip'] = (pd.isna(weather.precip_depth_1_hr)==False)*1
    weather['had_pressure'] = (pd.isna(weather.sea_level_pressure)==False)*1
    weather['had_wind'] = (pd.isna(weather.wind_direction)==False)*1
    weather['had_speed'] = (pd.isna(weather.wind_speed)==False)*1

    weather = weather.set_index(pd.DatetimeIndex(weather['timestamp'])).groupby(
        'site_id').apply(lambda group: group.reindex(pd.DatetimeIndex(dates),fill_value=np.nan)).drop(
        'site_id', axis=1).reset_index()
    weather = weather.drop('timestamp', axis=1).rename(columns={"level_1": "timestamp"})
    weather = weather.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))

    weather['cloud_coverage'] = weather[['timestamp', 'cloud_coverage']].groupby(['timestamp']).apply(
        lambda group : group.fillna(group.mean()))['cloud_coverage'].values.round()
    weather['precip_depth_1_hr'] = weather[['timestamp', 'precip_depth_1_hr']].groupby(['timestamp']).apply(
        lambda group : group.fillna(group.mean()))['precip_depth_1_hr'].values
    weather['sea_level_pressure'] = weather[['timestamp', 'sea_level_pressure']].groupby(['timestamp']).apply(
        lambda group : group.fillna(group.mean()))['sea_level_pressure'].values
    
    return weather

In [0]:
'''
This function merges all three datasets, sorting the values at the end
'''
def df_merge(data, weather, build, dates):
    data['timestamp'] = pd.to_datetime(data.timestamp)
    
    data = data.set_index(pd.DatetimeIndex(data['timestamp'])).groupby(
            ['meter', 'building_id']).apply(lambda group: group.reindex(pd.DatetimeIndex(dates),fill_value=0)).drop(
            ['meter', 'building_id'], axis=1).reset_index().drop('timestamp',axis=1).rename(columns={"level_2": "timestamp"})
    
    data = pd.merge(pd.merge(data, build, left_on='building_id', right_on='building_id', how='left')
             , weather, left_on=['site_id', 'timestamp'], right_on=['site_id', 'timestamp'], how='left')
    
    data = data.sort_values(['meter', 'building_id', 'timestamp'])
    
    return data

In [0]:
'''
This function creates meaningful date/time features in order to capture sazonality or frequency of the behavior
'''
def time_columns(df):
    df['day_of_year'] = (df['timestamp'].dt.dayofyear-1)
    df['week_of_year'] = df['timestamp'].dt.week-1
    df['month'] = df['timestamp'].dt.month-1
    df['weekend'] = pd.cut(df['timestamp'].dt.weekday, bins=[-0.1, 4, 6], labels =[0, 1])
    df['day_period'] = pd.cut(df['timestamp'].dt.hour, bins=[-0.1, 6, 12, 18, 24], labels=[0, 1, 2, 3])
    df['season'] = pd.cut(df['timestamp'].dt.dayofyear + 11 - 366*(df['timestamp'].dt.dayofyear > 355),
                         bins=[0, 91, 183, 275, 366], labels=[0, 1, 2, 3])
    return df

In [0]:
'''
This function fills the missing values from the sensors readings using a set of of progressive transformations, trying to capture characteristics that
may be particular to seasons or locations.
'''
def df_transform(df):
    df['meter_reading'] = df['meter_reading'].replace(0, np.nan)
    
    df['meter_reading'] = df[['meter', 'building_id', 'season', 'meter_reading']].groupby(
        ['meter', 'building_id', 'season']).apply(lambda group: group.fillna(group.mean()))['meter_reading'].values
    df['meter_reading'] = df[['meter', 'building_id', 'meter_reading']].groupby(
        ['meter', 'building_id']).apply(lambda group: group.fillna(group.mean()))['meter_reading'].values
    
    df['meter_reading'] = df[['meter', 'building_id', 'timestamp', 'meter_reading']].groupby(
        ['meter', 'building_id']).apply(lambda group: group.set_index('timestamp').rolling('3H').median())['meter_reading'].values
    
    df['log_meter_reading'] = np.log1p(df["meter_reading"])
    
    return df

In [0]:
'''
This function creates a deep learning model using a Keras sequential one for predicting the results. The neural network uses a set of embedding layers for 
categorical features.
'''
from keras.models import Model, load_model
from keras.layers import Input, Dropout, Dense, Embedding, SpatialDropout1D, concatenate, BatchNormalization, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.models import Model
from keras.losses import mean_squared_error as mse_loss

from keras import optimizers
from keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

categoricals = ["site_id", "building_id", "primary_use", "cloud_coverage", "hour", "weekday", 'day_of_year', 'week_of_year', 'month', 'day_period', 
                'season',  "meter"]

drop_cols = ["sea_level_pressure", "wind_direction"]

numericals = ["square_feet", "year_built", "air_temperature", "dew_temperature", "precip_depth_1_hr", "floor_count", 'wind_speed']

all_features = categoricals + numericals

def model(dense_dim_1=128, dense_dim_2=64, dense_dim_3=32, dense_dim_4=32, dense_dim_5=16
          dropout1=0.2, dropout2=0.1, dropout3=0.1, dropout4=0.1, lr=0.01):

    #Inputs
    site_id = Input(shape=[1], name="site_id")
    building_id = Input(shape=[1], name="building_id")
    primary_use = Input(shape=[1], name="primary_use")
    cloud_coverage = Input(shape=[1], name="cloud_coverage")
    hour = Input(shape=[1], name="hour")
    weekday = Input(shape=[1], name="weekday")
    day_of_year = Input(shape=[1], name="day_of_year")
    week_of_year = Input(shape=[1], name="week_of_year")
    month = Input(shape=[1], name="month")
    day_period = Input(shape=[1], name="day_period")
    season = Input(shape=[1], name="season")
    meter = Input(shape=[1], name="meter")
    
    square_feet = Input(shape=[1], name="square_feet")
    year_built = Input(shape=[1], name="year_built")
    air_temperature = Input(shape=[1], name="air_temperature")
    dew_temperature = Input(shape=[1], name="dew_temperature")
    precip = Input(shape=[1], name="precip_depth_1_hr")
    floor_count = Input(shape=[1], name="floor_count")
    wind_speed = Input(shape=[1], name="wind_speed")
   
    #Embeddings layers
    emb_site_id = Embedding(16, 8)(site_id)
    emb_building_id = Embedding(1449, 50)(building_id)
    emb_primary_use = Embedding(16, 8)(primary_use)
    emb_cloud_coverage = Embedding(16, 5)(cloud_coverage)
    emb_hour = Embedding(24, 12)(hour)
    emb_weekday = Embedding(7, 3)(weekday)
    emb_day_of_year = Embedding(366, 50)(day_of_year)
    emb_week_of_year = Embedding(52, 26)(week_of_year)
    emb_month = Embedding(12, 6)(month)
    emb_day_period = Embedding(4, 2)(day_period)
    emb_season = Embedding(4, 2)(season)
    emb_meter = Embedding(4, 2)(meter)

    concat_emb = concatenate([
           Flatten() (emb_site_id)
         , Flatten() (emb_building_id)
         , Flatten() (emb_primary_use)
         , Flatten() (emb_cloud_coverage)
         , Flatten() (emb_hour)
         , Flatten() (emb_weekday)
         , Flatten() (emb_day_of_year)
         , Flatten() (emb_week_of_year)
         , Flatten() (emb_month)
         , Flatten() (emb_day_period)
         , Flatten() (emb_season)
         , Flatten() (emb_meter)
    ])
    
    categ = Dropout(dropout1)(Dense(dense_dim_1,activation='relu') (concat_emb))
    categ = BatchNormalization()(categ)
    categ = Dropout(dropout2)(Dense(dense_dim_2,activation='relu') (categ))
    categ = BatchNormalization()(categ)
    categ = Dropout(dropout2)(Dense(dense_dim_3,activation='relu') (categ))

    #main layer
    main_l = concatenate([
          categ
        , square_feet
        , year_built
        , air_temperature
        , dew_temperature
        , precip
        , floor_count
        , wind_speed
    ])
    
    main_l = Dropout(dropout3)(Dense(dense_dim_4,activation='relu') (main_l))
    main_l = BatchNormalization()(main_l)
    main_l = Dropout(dropout4)(Dense(dense_dim_5,activation='relu') (main_l))
    
    #output
    output = Dense(1) (main_l)

    model = Model([site_id,
                   building_id, 
                   primary_use,
                   cloud_coverage,
                   hour,
                   weekday,
                   day_of_year,
                   week_of_year,
                   month,
                   day_period,
                   season,
                   meter,
                   square_feet,
                   year_built,
                   air_temperature,
                   dew_temperature,
                   precip,
                   floor_count,
                   wind_speed], output)

    model.compile(optimizer = Adam(lr=lr),
                  loss= mse_loss,
                  metrics=[root_mean_squared_error])
    return model

'''
This function represents the error function used to approximate the solutions
'''
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=0))

def get_keras_data(df, num_cols, cat_cols):
    cols = num_cols + cat_cols
    X = {col: np.array(df[col]) for col in cols}
    return X

'''
This function trains the model
'''
def train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold, patience=3):
    early_stopping = EarlyStopping(patience=patience, verbose=1)
    model_checkpoint = ModelCheckpoint("model_" + str(fold) + ".hdf5",
                                       save_best_only=True, verbose=1, monitor='val_root_mean_squared_error', mode='min')

    hist = keras_model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs,
                            validation_data=(X_v, y_valid), verbose=1,
                            callbacks=[early_stopping, model_checkpoint])

    keras_model = load_model("model_" + str(fold) + ".hdf5", custom_objects={'root_mean_squared_error': root_mean_squared_error})
    
    return keras_model


from sklearn.model_selection import KFold, StratifiedKFold


In [0]:
'''
This function transforms some of the date/time features in order to normalize their values as well as capturing the periodic behavior
'''
def sen_cos(df, colunas = ['day_of_year', 'week_of_year', 'month', 'day_period', 'season']):
    '''
    df: dataframe com colunas para transformar em sen cos
    colunas: colunas a serem transformadas
    return: dataframe com as colunas sen cos
    '''
    df[colunas] = df[colunas].astype('int64')/df[colunas].astype('int64').max()
    df[[col +'_sen' for col in colunas]] = df[colunas].apply(lambda x: np.sin(2*np.pi*x))
    df[[col +'_cos' for col in colunas]] = df[colunas].apply(lambda x: np.cos(2*np.pi*x))
    
    
    return df.drop(colunas, axis = 1)

In [0]:
'''
Application of the transformations over the training dataset
'''
weather_train = weather_transform(weather_train, train_dates)
print('weather_train ready')
train = regularize_build(train)
df_train = df_merge(train, weather_train, build, train_dates)
print('datasets merged')
df_train = time_columns(df_train)
print('time features created')
df_train = df_transform(df_train)
df_train = sen_cos(df_train, colunas = ['day_of_year', 'week_of_year', 'month', 'day_period', 'season'])
df_train = reduce_mem_usage(df_train, use_float16=False)
print('df_train ready')

del le, weather_train, train
gc.collect()

weather_train ready
datasets merged
time features created
Memory usage of dataframe is 5602.42 MB
Memory usage after optimization is: 2751.37 MB
Decreased by 50.9%
df_train ready


18

In [0]:
'''
Application of the transformations over the training dataset
'''
weather_test = weather_transform(weather_test, test_dates)
print('weather_test ready')
df_test = df_merge(test, weather_test, build, test_dates)
print('datasets merged')
df_test = time_columns(df_test)
print('time features created')
del weather_test, build, test
gc.collect()

weather_test ready
datasets merged
time features created


111

In [0]:
df_test = reduce_mem_usage(df_test, use_float16=False)
df_test = sen_cos(df_test, colunas = ['day_of_year', 'week_of_year', 'month', 'day_period', 'season'])
df_test = reduce_mem_usage(df_test, use_float16=False)
print('df_test ready')

Memory usage of dataframe is 8708.74 MB
Memory usage after optimization is: 3976.59 MB
Decreased by 54.3%
Memory usage of dataframe is 6919.27 MB
Memory usage after optimization is: 5328.63 MB
Decreased by 23.0%
df_test ready


**Xgboost on the entire dataset**

In this section it is attempted to predict the meters values over the entire dataset using the LightXGboost model

In [0]:
categorical_features = [
    "building_id",
    "site_id",
    "primary_use",
    "meter"
    #'cloud_coverage'
    #"cluster"
]

all_features = [col for col in df_train.columns if col not in ["timestamp", 
                                                               #"site_id", 
                                                               "meter_reading",
                                                               "log_meter_reading",
                                                               #"building_id"
                                                              ]]

In [0]:
params = {'boosting_type': 'gbdt',
          'max_depth' : 4,
          'nthread': -1,
          'num_leaves': 50,
          'learning_rate': lambda iter: 0.05 * (0.99 ** iter),
          'random_state' : 501,
          #'device_type' : 'gpu',
          'metric' : 'rmse',
          #'subsample_for_bin': 200,
          #'subsample': 1,
          #'subsample_freq': 1,
          'colsample_bytree': 0.85
          #'reg_alpha': 2,
          #'reg_lambda': 10,
          #'min_split_gain': 0.5,
          #'min_child_weight': 1,
          #'min_child_samples': 5,
          #'scale_pos_weight': 1
          #'num_class' : 1
          }

In [0]:
X_train = df_train.reset_index(drop=True)
y_train = X_train["log_meter_reading"]
y_pred_train = np.zeros(X_train.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
X_train, X_valid = X_train[all_features], X_test[all_features]
y_train, y_valid = y_train, y_test
del X_test, y_test
dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
dvalid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)
print('training')
model = lgb.train(params, train_set=dtrain, 
                  num_boost_round=1000, 
                  valid_sets=[dtrain, dvalid], 
                  verbose_eval=25,
                  early_stopping_rounds=50)
print('finished')
del dtrain, dvalid

model.save_model('/content/drive/My Drive/Colab Notebooks/Kaggle/data/model_train_test.txt')

training
Training until validation scores don't improve for 50 rounds.
[25]	valid_0's rmse: 1.59995
[50]	valid_0's rmse: 1.46331
[75]	valid_0's rmse: 1.3634
[100]	valid_0's rmse: 1.27725
[125]	valid_0's rmse: 1.20958
[150]	valid_0's rmse: 1.15412
[175]	valid_0's rmse: 1.10944
[200]	valid_0's rmse: 1.07295
[225]	valid_0's rmse: 1.03631
[250]	valid_0's rmse: 0.997021
[275]	valid_0's rmse: 0.966245
[300]	valid_0's rmse: 0.936952
[325]	valid_0's rmse: 0.918862
[350]	valid_0's rmse: 0.901057
[375]	valid_0's rmse: 0.889061
[400]	valid_0's rmse: 0.879918
[425]	valid_0's rmse: 0.87179
[450]	valid_0's rmse: 0.864283
[475]	valid_0's rmse: 0.858429
[500]	valid_0's rmse: 0.852156
[525]	valid_0's rmse: 0.847663
[550]	valid_0's rmse: 0.83679
[575]	valid_0's rmse: 0.832279
[600]	valid_0's rmse: 0.823469
[625]	valid_0's rmse: 0.817131
[650]	valid_0's rmse: 0.811124
[675]	valid_0's rmse: 0.804699
[700]	valid_0's rmse: 0.799043
[725]	valid_0's rmse: 0.795052
[750]	valid_0's rmse: 0.790274
[775]	valid_0'

In [0]:
model = lgb.Booster(model_file='/content/drive/My Drive/Colab Notebooks/Kaggle/data/model_train_test.txt')
categorical_features = [
    "building_id",
    "site_id",
    "primary_use",
    "meter"
    #'cloud_coverage'
    #"cluster"
]

all_features = [col for col in df_test.columns if col not in ["timestamp", 
                                                               #"site_id", 
                                                               "meter_reading",
                                                               "log_meter_reading",
                                                               #"building_id"
                                                              ]]

In [0]:
df_pred = df_test.loc[41697600/2:]

In [0]:
results = model.predict(df_pred[all_features], num_iteration=model.best_iteration)

In [0]:
results = np.expm1(results)

In [0]:
df_results = pd.DataFrame({'row_id': df_test.row_id, 'meter_reading':np.nan})

In [0]:
df_results.loc[41697600/2:, 'meter_reading'] = results

In [0]:
df_results.sort_values('row_id').to_csv("/content/drive/My Drive/Colab Notebooks/Kaggle/data/submission_1.csv", index=False, float_format="%.5f")

In [0]:
kf = KFold(n_splits=4, random_state=501)
models = []
for train_index,valid_index in kf.split(df_train):
    X_train = df_train.reset_index(drop=True)
    y_train = X_train["log_meter_reading"]
    y_pred_train = np.zeros(X_train.shape[0])
    
    X_train, X_valid = X_train.loc[train_index, all_features], X_train.loc[valid_index, all_features]
    y_train, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    dvalid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)
    print('training')
    model = lgb.train(params, train_set=dtrain, 
                      num_boost_round=1000, 
                      valid_sets=[dtrain,dvalid], 
                      verbose_eval=25,
                      early_stopping_rounds=50)
    print('finished')
    models.append(model)
    del dtrain, dvalid

training
Training until validation scores don't improve for 50 rounds.
[25]	training's rmse: 1.67444	valid_1's rmse: 1.37563
[50]	training's rmse: 1.52847	valid_1's rmse: 1.27698
[75]	training's rmse: 1.41882	valid_1's rmse: 1.21098
[100]	training's rmse: 1.32697	valid_1's rmse: 1.16722
[125]	training's rmse: 1.25687	valid_1's rmse: 1.13716
[150]	training's rmse: 1.20141	valid_1's rmse: 1.1191
[175]	training's rmse: 1.15662	valid_1's rmse: 1.10333
[200]	training's rmse: 1.11939	valid_1's rmse: 1.09235
[225]	training's rmse: 1.08046	valid_1's rmse: 1.08576
[250]	training's rmse: 1.04829	valid_1's rmse: 1.07913
[275]	training's rmse: 1.02153	valid_1's rmse: 1.07224
[300]	training's rmse: 0.995892	valid_1's rmse: 1.06624
[325]	training's rmse: 0.968458	valid_1's rmse: 1.06272
[350]	training's rmse: 0.947679	valid_1's rmse: 1.06029
[375]	training's rmse: 0.936324	valid_1's rmse: 1.05869
[400]	training's rmse: 0.926915	valid_1's rmse: 1.0575
[425]	training's rmse: 0.918267	valid_1's rmse: 1

In [0]:
del build, df_train, train
gc.collect()

512

In [0]:
results = [0] * len(df_test)
for model in models:
    results = results + np.expm1(model.predict(df_test[all_features], num_iteration=model.best_iteration)) / len(models)
    print('Terminou modelo {}'.format(model))
    del model

In [0]:
results_df = pd.DataFrame({"row_id": X.index, "meter_reading": np.clip(results, 0, a_max=None)})
len(results_df)
results_df.to_csv("/content/drive/My Drive/Colab Notebooks/Kaggle/data/submission_1.csv", index=False, float_format="%.5f")
del results 

**Neural Network over the entire dataset**

In this section the neural network is used in an attempt to predict the meters' values over the entire dataset

In [0]:
'''
The remaining code is used for the actual training of the network. It is based on a standard procedure for training and predictions using randomized
parts of the original dataset
'''
oof = np.zeros(len(train))
batch_size = 1024
epochs = 100
models = []

folds = 4
seed = 666

kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

for fold_n, (train_index, valid_index) in enumerate(kf.split(train, train['building_id'])):
    print('Fold:', fold_n)
    X_train, X_valid = df_train[all_features].iloc[train_index], df_train[all_features].iloc[valid_index]
    y_train, y_valid = df_train['log_meter_reading'].iloc[train_index], df_train['log_meter_reading'].iloc[valid_index]
    X_t = get_keras_data(X_train, numericals, categoricals)
    X_v = get_keras_data(X_valid, numericals, categoricals)
    
    keras_model = model(dense_dim_1=128, dense_dim_2=64, dense_dim_3=32, dense_dim_4=32, dense_dim_5=16, 
                        dropout1=0.2, dropout2=0.1, dropout3=0.1, dropout4=0.1, lr=0.01)
    mod = train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold_n, patience=3)
    models.append(mod)
    print('*'* 50)

from tqdm import tqdm
i=0
res = np.zeros((test.shape[0]),dtype=np.float32)
step_size = 50000
for j in tqdm(range(int(np.ceil(test.shape[0]/step_size)))):
    for_prediction = get_keras_data(test.iloc[i:i+step_size], numericals, categoricals)
    res[i:min(i+step_size,test.shape[0])] = \
       np.expm1(sum([model.predict(for_prediction, batch_size=1024)[:,0] for model in models])/folds)
    i+=step_size

**Xgbook per site**

In this section it is attempted to predict the meter values separatedly for each site ID

In [0]:
cv = 2
models = {}
cv_scores = {"site_id": [], "cv_score": []}

for site_id in tqdm(range(16), desc="site_id"):
    print(cv, "fold CV for site_id:", site_id)
    kf = KFold(n_splits=cv, random_state=seed)
    models[site_id] = []

    X_train_site = df_train[df_train["site_id"]==site_id].reset_index(drop=True)
    y_train_site = X_train_site["log_meter_reading"]
    y_pred_train_site = np.zeros(X_train_site.shape[0])
    
    score = 0

    for fold, (train_index, valid_index) in enumerate(kf.split(X_train_site, y_train_site)):
        X_train, X_valid = X_train_site.loc[train_index, all_features], X_train_site.loc[valid_index, all_features]
        y_train, y_valid = y_train_site.iloc[train_index], y_train_site.iloc[valid_index]

        dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        dvalid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)

        watchlist = [dtrain, dvalid]

        params = {"objective": "regression",
                  "num_leaves": 41,
                  "learning_rate": 0.049,
                  "bagging_freq": 5,
                  "bagging_fraction": 0.51,
                  "feature_fraction": 0.81,
                  "metric": "rmse"
                  }

        model_lgb = lgb.train(params, train_set=dtrain, 
                              num_boost_round=999, 
                              valid_sets=watchlist, 
                              verbose_eval=101, 
                              early_stopping_rounds=50)
        models[site_id].append(model_lgb)

        y_pred_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
        y_pred_train_site[valid_index] = y_pred_valid

        rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
        print("Site Id:", site_id, ", Fold:", fold+1, ", RMSE:", rmse)
        score += rmse / cv
        
        gc.collect()
        
    cv_scores["site_id"].append(site_id)
    cv_scores["cv_score"].append(score)
        
    print("\nSite Id:", site_id, ", CV RMSE:", np.sqrt(mean_squared_error(y_train_site, y_pred_train_site)), "\n")

In [0]:
df_test_sites = []

for site_id in tqdm(range(16), desc="site_id"):
    print("Preparing test data for site_id", site_id)

    X_test_site = df_test[df_test.site_id==site_id][all_features]
    
    row_ids_site = X_test_site.row_id

    X_test_site = X_test_site[all_features]
    y_pred_test_site = np.zeros(X_test_site.shape[0])

    print("Scoring for site_id", site_id)    
    for fold in range(cv):
        model_lgb = models[site_id][fold]
        y_pred_test_site += model_lgb.predict(X_test_site, num_iteration=model_lgb.best_iteration) / cv
        gc.collect()
        
    df_test_site = pd.DataFrame({"row_id": row_ids_site, "meter_reading": y_pred_test_site})
    df_test_sites.append(df_test_site)
    
    print("Scoring for site_id", site_id, "completed\n")
    gc.collect()

In [0]:
submit = pd.concat(df_test_sites)
submit.meter_reading = np.clip(np.expm1(submit.meter_reading), 0, a_max=None)
submit.to_csv("submission_diogo_10_modelos_siteid.csv", index=False)