In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import gc
from sklearn.model_selection import TimeSeriesSplit

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train_df = pd.read_csv('train_final.csv')
test_df = pd.read_csv('test_final.csv')

In [3]:
# The two column with null values do not have a high feature importance, so try dropping them

col_to_drop = ['year_built', 'floor_count']

train = train_df.drop(columns = col_to_drop)
test = test_df.drop(columns = col_to_drop)

In [4]:
target = np.log1p(train["meter_reading"])
features = train.drop('meter_reading', axis = 1)
del train_df
gc.collect()

47

In [5]:
nsplits = 3
folds = TimeSeriesSplit(nsplits)

In [13]:
params = {"boosting": "gbdt",
          'application':'regression',
          'num_boost_round':200, 
          'early_stopping_round':50, 
          'metric':'rmse',
          'num_leaves': 1000,
          'feature_fraction': 0.887,
          'bagging_fraction': 0.942,
          'bagging_freq':11,
          'max_depth': 12,
          'lambda_l1': 3.917,
          'lambda_l2': 2.302,
          'min_split_gain': 0.08753,
          'min_child_weight': 5.54,
          'learning_rate' : 0.066}

In [14]:
categorical_features = ["building_id", "site_id", "meter", "primary_use", "day", "month", "hour","weekday", "is_holiday", "weekend"]
    
X_train = features
y_train = target

pred = np.zeros(len(test_df))

df_fimp = pd.DataFrame()
for tr_idx,va_idx in folds.split(X_train,y_train):
    X_half_1 = X_train.iloc[tr_idx]
    y_half_1 = y_train.iloc[tr_idx]
    X_half_2 = X_train.iloc[va_idx]
    y_half_2 = y_train.iloc[va_idx]

    d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categorical_features, free_raw_data=False)
    d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categorical_features, free_raw_data=False)

    print("Building model with first half and validating on second half:")
    model_half = lgb.train(params, train_set=d_half_1, valid_sets=[d_half_1,d_half_2], verbose_eval=1000)
    gc.collect()
    pred += np.expm1(model_half.predict(test_df, num_iteration=model_half.best_iteration)) / nsplits
    
    df_fimp_1 = pd.DataFrame()
    df_fimp_1["feature"] = X_train.columns.values
    df_fimp_1["importance"] = model_half.feature_importance()
    
    df_fimp = pd.concat([df_fimp, df_fimp_1], axis=0)
    del model_half
    gc.collect()

Building model with first half and validating on second half:
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[200]	training's rmse: 0.469677	valid_1's rmse: 1.13551
Building model with first half and validating on second half:
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[65]	training's rmse: 0.673347	valid_1's rmse: 0.961749
Building model with first half and validating on second half:
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[125]	training's rmse: 0.611574	valid_1's rmse: 0.90993


In [15]:
pred

array([316.14936724, 194.85549608, 266.8343144 , ..., 164.88024808,
       164.88024808, 164.88024808])

In [18]:
submission = pd.read_csv('../../Ashrae/data/sample_submission.csv')

In [19]:
submission.head()

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [20]:
submission['meter_reading'] = pred

In [21]:
submission.head()

Unnamed: 0,row_id,meter_reading
0,0,316.149367
1,1,194.855496
2,2,266.834314
3,3,251.710295
4,4,200.45948


In [22]:
submission.to_csv('submission5.csv', index = False)

Score: 2.65 (bad parameters tuning)