In [2]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import helpers

In [None]:
# Directory holding the input CSV files, relative to the working directory.
data_dir = "data/"

# Load the three input tables as pandas DataFrames. The names on the left are
# bound in the same order as the file names listed on the right.
building, train, weather_train = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('building_metadata.csv', 'train.csv', 'weather_train.csv')
)

In [None]:
# Enrich every meter reading with its building's metadata, then with the weather
# row recorded for the same site and timestamp. Left joins keep all readings,
# including those without a matching weather row.
train = train.merge(building, on="building_id", how="left")
train = train.merge(weather_train, on=["site_id", "timestamp"], how="left")

# The weather table has been folded into `train`; release the standalone copy.
del weather_train

# Take the timestamp column out of the frame and expand it into calendar features.
# NOTE: despite its name, "weekend" stores the day-of-week number returned by
# `.dt.weekday`, not a boolean weekend flag.
stamps = pd.to_datetime(train.pop("timestamp"))
train["hour"] = stamps.dt.hour
train["day"] = stamps.dt.day
train["weekend"] = stamps.dt.weekday
train["month"] = stamps.dt.month
del stamps

# Replace the primary_use values with the integer codes of the fitted encoder `le`.
le = LabelEncoder()
train["primary_use"] = le.fit_transform(train["primary_use"])

# Discard the columns that are not carried forward from this cell.
train = train.drop(
    columns=[
        "precip_depth_1_hr",
        "sea_level_pressure",
        "wind_direction",
        "wind_speed",
        "site_id",
        "floor_count",
    ]
)

train.head()

In [None]:
# Model the target on a log scale: log1p(x) = log(1 + x).
target = np.log1p(train["meter_reading"])

# The raw label must not stay among the columns of the feature frame.
train = train.drop(columns=["meter_reading"])

# `reduce_mem_usage` is neither defined nor imported by name in this notebook,
# so the bare call raised NameError on a fresh kernel. The only visible candidate
# source is the imported `helpers` module, so the call goes through it.
# NOTE(review): confirm that helpers exposes reduce_mem_usage and that it returns
# a (frame, list) pair, as the unpacking below assumes.
train, NAlist = helpers.reduce_mem_usage(train)

In [None]:
# Feature columns handed to the model; any other column of `train` is ignored.
data = ["building_id", "primary_use", "hour", "day", "weekend", "month", "meter","square_feet", "year_built", "air_temperature", "cloud_coverage","dew_temperature"]
num_folds = 3
# shuffle=False keeps each fold a contiguous range of rows in the frame's current order.
kf = KFold(n_splits = num_folds, shuffle = False)
error = 0
models = []
evals_results = []

# Booster settings are identical for every fold, so they are built once.
# NOTE(review): 0.6 is far above LightGBM's default learning rate of 0.1 —
# confirm that this is intentional.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'learning_rate': 0.6,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq' : 4
}

# LightGBM 4.0 removed the `early_stopping_rounds`, `verbose_eval` and
# `evals_result` keyword arguments of lgb.train, so passing them raises TypeError
# on current releases. The callbacks below are what those keywords installed
# internally and are accepted by old and new versions alike.
# `log_evaluation` was called `print_evaluation` in older releases; use whichever
# name this installation provides.
log_every = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # Only the last split is fitted; every earlier split is skipped.
    if i + 1 < num_folds:
        continue
    # Largest training position and smallest validation position of this split.
    print(train_index.max(), test_index.min())
    train_X = train[data].iloc[train_index]
    test_X = train[data].iloc[test_index]
    train_y = target.iloc[train_index]
    test_y = target.iloc[test_index]

    # Rows whose target is not strictly positive are left out of both the
    # fitting set and the validation set.
    train_rows = train_y > 0
    test_rows = test_y > 0
    lgb_train = lgb.Dataset(train_X[train_rows], train_y[train_rows])
    lgb_test = lgb.Dataset(test_X[test_rows], test_y[test_rows])

    # Filled in by the record_evaluation callback while training runs.
    evals_result = {}
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_train, lgb_test],
        callbacks=[
            log_every(25),                        # print metrics every 25 rounds
            lgb.early_stopping(30),               # stop after 30 rounds without improvement
            lgb.record_evaluation(evals_result),  # keep the metric history for plotting
        ],
    )
    models.append(model)
    evals_results.append(evals_result)

In [None]:
# One figure per trained booster: feature importances in the left panel,
# the recorded RMSE history in the right panel.
for model, evals_result in zip(models, evals_results):
    f, axes = plt.subplots(1, 2, figsize=(15, 6))
    ax1, ax2 = axes
    lgb.plot_importance(model, ax=ax1)
    lgb.plot_metric(evals_result, metric='rmse', ax=ax2)

plt.show()

In [None]:
# Rank the features of `model` (the booster left bound by the preceding cells)
# by importance, highest first. Whole (importance, name) tuples are compared,
# so ties on importance are ordered by feature name, also descending.
feature_ranking = list(zip(model.feature_importance(), model.feature_name()))
feature_ranking.sort(reverse=True)
feature_ranking