## Import Packages

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc
import pickle
import seaborn as sns

DATA_PATH = "../ASHRAE/input/"
MODEL_NAME = 'catboost'

  import pandas.util.testing as tm


## Utility Functions

In [2]:

def add_lag_feature(weather_df, window=3):
    """Append per-site rolling statistics of key weather columns, in place.

    For each of ``air_temperature``, ``cloud_coverage`` and
    ``dew_temperature`` four float16 columns are added to ``weather_df``:
    ``{col}_mean_lag{window}``, ``{col}_max_lag{window}``,
    ``{col}_min_lag{window}`` and ``{col}_std_lag{window}``.

    Args:
        weather_df: DataFrame with a ``site_id`` column and the three weather
            columns above. It must have a unique index; it is modified in
            place.
        window: Number of consecutive rows (within one site) per window.

    Returns:
        None. The new columns are written onto ``weather_df``.

    Note:
        Windows follow the current row order within each site, so the
        statistics are only "lags" in time if the rows of a site are in
        chronological order.
    """
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature']
    rolled = weather_df.groupby('site_id')[cols].rolling(window=window, min_periods=0)

    # The grouped rolling result is indexed by (site_id, original row label).
    # Dropping only the site_id level keeps the original labels, so the
    # assignments below align every statistic with the row it was computed
    # for, whatever the row order of ``weather_df`` is.
    stats = {
        'mean': rolled.mean(),
        'max': rolled.max(),
        'min': rolled.min(),
        'std': rolled.std(),
    }
    aligned = {
        stat_name: stat_df.reset_index(level=0, drop=True).astype(np.float16)
        for stat_name, stat_df in stats.items()
    }

    for col in cols:
        for stat_name in ('mean', 'max', 'min', 'std'):
            weather_df[f'{col}_{stat_name}_lag{window}'] = aligned[stat_name][col]
        

# Original code from https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling by @aitude
def fill_weather_dataset(weather_df):
    """Add rows for missing hours, fill weather gaps and append lag features.

    Steps:
      1. Build the full hourly range between the smallest and largest
         timestamp and append a row for every (site_id, hour) that is absent.
      2. Fill missing values of each weather column with the mean of its
         (site_id, day, month) group. For ``cloud_coverage``,
         ``sea_level_pressure`` and ``precip_depth_1_hr`` group means that are
         themselves missing are forward-filled from the previous group first.
      3. Append rolling statistics via ``add_lag_feature(window=3)``.

    Args:
        weather_df: Weather DataFrame with ``site_id``, ``timestamp`` and the
            weather columns filled below. Its ``timestamp`` column is
            converted to ``str`` in place.

    Returns:
        A new DataFrame with a fresh integer index. ``timestamp`` is returned
        as a ``"%Y-%m-%d %H:%M:%S"`` string.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    group_keys = ['site_id', 'day', 'month']

    # Find Missing Dates
    weather_df['timestamp'] = weather_df['timestamp'].astype('str')
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(), time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(), time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    # Collect the missing rows of every site present in the data and
    # concatenate once, instead of re-copying the whole frame per site.
    frames = [weather_df]
    for site_id in sorted(weather_df['site_id'].unique()):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        missing = np.setdiff1d(hours_list, site_hours)
        if len(missing) == 0:
            continue
        new_rows = pd.DataFrame(missing, columns=['timestamp'])
        new_rows['site_id'] = site_id
        frames.append(new_rows)
    weather_df = pd.concat(frames, ignore_index=True)

    # Temporary grouping keys (dropped again before returning).
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["month"] = weather_df["datetime"].dt.month

    # Index by the grouping keys so DataFrame.update can align the fillers.
    weather_df = weather_df.set_index(group_keys)

    # (column, forward-fill the group means before using them?)
    fill_plan = [
        ('air_temperature', False),
        ('cloud_coverage', True),
        ('dew_temperature', False),
        ('sea_level_pressure', True),
        ('wind_direction', False),
        ('wind_speed', False),
        ('precip_depth_1_hr', True),
    ]
    for col, forward_fill in fill_plan:
        filler = weather_df.groupby(group_keys)[col].mean()
        if forward_fill:
            filler = filler.ffill()
        # overwrite=False: only NaN cells of weather_df receive the filler.
        weather_df.update(filler.to_frame(), overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime', 'day', 'month'], axis=1)

    # Add lags
    add_lag_feature(weather_df, window=3)

    return weather_df

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * datetime and categorical columns are left untouched;
    * ``object`` columns become ``category``;
    * signed / unsigned integer columns are cast to the smallest integer type
      of the same signedness that holds their min and max (bounds inclusive);
    * float columns become float16 (only if ``use_float16``), else float32,
      else float64;
    * every other dtype (bool, timedelta, extension dtypes, ...) is left
      untouched.

    Args:
        df: DataFrame to shrink; modified in place.
        use_float16: Allow float16 for float columns whose range fits.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_datetime64_any_dtype(col_type) or isinstance(col_type, pd.CategoricalDtype):
            continue
        if col_type == object:
            df[col] = df[col].astype("category")
            continue
        # Only plain NumPy integer / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in "iu":
            candidates = signed_ints if col_type.kind == "i" else unsigned_ints
            for candidate in candidates:
                info = np.iinfo(candidate)
                if info.min <= c_min and c_max <= info.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached for all-NaN columns, whose min/max are NaN.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(saved_pct))

    return df


def features_engineering(df):
    """Derive calendar features, transform and encode columns for modelling.

    Adds ``hour``, ``dayofweek`` and ``is_holiday`` (1 when the date is in
    the fixed holiday list below), replaces ``square_feet`` by
    ``log1p(square_feet)``, drops unused columns and label-encodes
    ``primary_use``.

    Args:
        df: Merged meter/building/weather frame containing ``timestamp``
            (format ``%Y-%m-%d %H:%M:%S``), ``square_feet``, ``primary_use``
            and the columns listed in ``drop``. The new feature columns are
            written onto this object before the drop creates a new frame.

    Returns:
        A new DataFrame in the same row order as the input.
    """
    # Row order is deliberately left untouched. The previous
    # ``df.sort_values(...)`` / ``df.reset_index(...)`` calls discarded their
    # results, so they had no effect beyond copying the whole frame.

    # Add more features
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["dayofweek"] = df["timestamp"].dt.dayofweek
    df['square_feet'] = np.log1p(df['square_feet'])

    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                "2019-01-01"]
    df["is_holiday"] = (df["timestamp"].dt.date.astype("str").isin(holidays)).astype(int)

    # Remove Unused Columns
    drop = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed", "year_built", "floor_count"]
    df = df.drop(drop, axis=1)
    gc.collect()

    # Encode Categorical Data
    # NOTE(review): the encoder is fitted anew on every call, so train and
    # test get the same codes only if both contain the same set of
    # primary_use values - confirm for the data at hand.
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])

    return df

In [3]:
def save_model_settings_pickle(ms, model_settings_filename):
    """Pickle ``ms`` to ``model_settings_filename``.

    Args:
        ms: Any picklable object (here: the list of fitted fold models).
        model_settings_filename: Destination path; overwritten if it exists.

    Returns:
        None.
    """
    # The context manager closes the file even if pickling fails.
    with open(model_settings_filename, 'wb') as model_save:
        pickle.dump(ms, model_save)
    print('Model settings are saved in ' + model_settings_filename)

    return None


def load_model_settings_pickle(model_settings_filename):
    """Load and return the object pickled in ``model_settings_filename``.

    Args:
        model_settings_filename: Path of a pickle file, e.g. one written by
            ``save_model_settings_pickle``.

    Returns:
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this notebook (or another trusted source) produced.
    with open(model_settings_filename, 'rb') as model_load:
        ms = pickle.load(model_load)
    print('Model settings are uploaded from ' + model_settings_filename)

    return ms

## Load Data

In [4]:
# Raw hourly meter readings.
train_df = pd.read_csv(DATA_PATH + 'train.csv')

# Remove outliers: all of building 1099, plus the meter-0 readings of
# buildings 0-104 up to "2016-05-20" (timestamps are compared as strings).
train_df = train_df[train_df['building_id'] != 1099]
early_low_id_meter0 = (
    (train_df['building_id'] <= 104)
    & (train_df['meter'] == 0)
    & (train_df['timestamp'] <= "2016-05-20")
)
train_df = train_df[~early_low_id_meter0]

building_df = pd.read_csv(DATA_PATH + 'building_metadata.csv')

weather_train_df = pd.read_csv(DATA_PATH + 'weather_train.csv')
weather_train_df["timestamp"] = pd.to_datetime(weather_train_df["timestamp"])

## Timestamp alignment

In [5]:
# https://www.kaggle.com/nz0722/aligned-timestamp-lgbm-by-meter-type

# Estimate, per site, by how many hours its weather timestamps are shifted,
# using the hour of day at which the temperature usually peaks.
weather_test_df = pd.read_csv(DATA_PATH+'weather_test.csv')
weather_test_df["timestamp"] = pd.to_datetime(weather_test_df["timestamp"])
weather = pd.concat([weather_train_df, weather_test_df],ignore_index=True)
del weather_test_df

weather_key = ['site_id', 'timestamp']

# One temperature per (site_id, timestamp), ordered by site and time.
temp_skeleton = weather[weather_key + ['air_temperature']].drop_duplicates(subset=weather_key).sort_values(by=weather_key).copy()

# calculate ranks of hourly temperatures within date/site_id chunks
temp_skeleton['temp_rank'] = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.date])['air_temperature'].rank(method='average')

# create a dataframe of site_ids (rows) x mean rank of temperature for each hour of day (columns, 0-23)
df_2d = temp_skeleton.groupby(['site_id', temp_skeleton.timestamp.dt.hour])['temp_rank'].mean().unstack(level=1)

# Column position of the temperature peak minus 14 gives the alignment gap
# in hours (14 is presumably the expected local hour of the daily peak).
# The Series is indexed by the actual site_id values of df_2d rather than by
# row position, so the mapping stays correct for non-contiguous site ids.
site_ids_offsets = pd.Series(df_2d.values.argmax(axis=1) - 14, index=df_2d.index)
site_ids_offsets.index.name = 'site_id'

def timestamp_align(df):
    """Shift ``df.timestamp`` by the per-site offset, in place.

    Looks up each row's ``site_id`` in the module-level ``site_ids_offsets``
    Series, stores the result in a new ``offset`` column and subtracts that
    many hours from ``timestamp``.

    Args:
        df: DataFrame with a ``site_id`` column and a datetime ``timestamp``
            column; modified in place.

    Returns:
        The same ``df`` object, with ``offset`` added and ``timestamp``
        shifted.
    """
    df['offset'] = df.site_id.map(site_ids_offsets)
    # Lowercase 'h' is the hour unit accepted by every pandas version; the
    # uppercase 'H' alias is deprecated in recent releases.
    df['timestamp'] = df.timestamp - pd.to_timedelta(df.offset, unit='h')
    return df

# The helper frames were only needed to derive the offsets; free them.
del weather, temp_skeleton
gc.collect()

# Shift the training weather timestamps by the per-site offsets.
weather_train_df = timestamp_align(weather_train_df)

## Fill Weather Information

I'm using [this kernel](https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling) to handle missing weather information.

In [6]:
# Add rows for missing hours, fill missing weather values and append the
# rolling lag features.
weather_train_df = fill_weather_dataset(weather_train_df)

## Memory Reduction

In [7]:
# Downcast all three training-side frames, allowing float16 for floats.
train_df, building_df, weather_train_df = (
    reduce_mem_usage(frame, use_float16=True)
    for frame in (train_df, building_df, weather_train_df)
)

Memory usage of dataframe is 757.31 MB
Memory usage after optimization is: 322.24 MB
Decreased by 57.4%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 13.95 MB
Memory usage after optimization is: 6.15 MB
Decreased by 55.9%


In [8]:
# Preview the first rows of the filled training weather frame.
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,offset,air_temperature_mean_lag3,air_temperature_max_lag3,air_temperature_min_lag3,air_temperature_std_lag3,cloud_coverage_mean_lag3,cloud_coverage_max_lag3,cloud_coverage_min_lag3,cloud_coverage_std_lag3,dew_temperature_mean_lag3,dew_temperature_max_lag3,dew_temperature_min_lag3,dew_temperature_std_lag3
0,0,2015-12-31 19:00:00,25.0,6.0,20.0,-0.086975,1019.5,0.0,0.0,5.0,25.0,25.0,25.0,,6.0,6.0,6.0,,20.0,20.0,20.0,
1,0,2015-12-31 20:00:00,24.40625,2.705078,21.09375,-1.0,1020.0,70.0,1.5,5.0,24.703125,25.0,24.40625,0.424316,4.351562,6.0,2.705078,2.330078,20.546875,21.09375,20.0,0.777832
2,0,2015-12-31 21:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0,5.0,24.0625,25.0,22.796875,1.137695,3.568359,6.0,2.0,2.134766,20.734375,21.09375,20.0,0.635254
3,0,2015-12-31 22:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0,5.0,22.765625,24.40625,21.09375,1.650391,2.234375,2.705078,2.0,0.4074707,20.9375,21.09375,20.59375,0.288574
4,0,2015-12-31 23:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609,5.0,21.296875,22.796875,20.0,1.411133,2.0,2.0,2.0,5.960464e-08,20.5625,21.09375,20.0,0.550781


## Merge Data

We need to add building and weather information to the training dataset.

In [9]:
# Attach building metadata, then the per-site hourly weather.
train_df = train_df.merge(building_df, on='building_id', how='left')
train_df = train_df.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
del weather_train_df
gc.collect()

0

## Features Engineering

In [10]:
# Add calendar/holiday features, log-transform square_feet, drop unused
# columns and label-encode primary_use.
train_df = features_engineering(train_df)

In [11]:
# Preview the first 20 rows of the engineered training frame.
train_df.head(20)

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,offset,air_temperature_mean_lag3,air_temperature_max_lag3,air_temperature_min_lag3,air_temperature_std_lag3,cloud_coverage_mean_lag3,cloud_coverage_max_lag3,cloud_coverage_min_lag3,cloud_coverage_std_lag3,dew_temperature_mean_lag3,dew_temperature_max_lag3,dew_temperature_min_lag3,dew_temperature_std_lag3,hour,dayofweek,is_holiday
0,105,0,23.3036,1,0,10.832181,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
1,106,0,0.3746,1,0,8.589514,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
2,106,3,0.0,1,0,8.589514,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
3,107,0,175.184006,1,0,11.487946,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
4,108,0,91.265297,1,0,11.309352,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
5,109,0,80.93,1,0,10.950736,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
6,109,3,0.0,1,0,10.950736,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
7,110,0,86.228302,1,0,10.233331,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
8,111,0,167.391998,1,0,11.681309,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1
9,112,0,10.2748,1,0,10.379939,3.800781,0.0,2.400391,-0.086975,0.0,19.5625,22.796875,15.304688,3.853516,2.705078,2.705078,2.705078,0.0,9.21875,10.0,7.667969,1.34668,0,4,1


## Features & Target Variables

In [12]:
# Target: log1p of the meter reading divided by square_feet (square_feet was
# already log1p-transformed by features_engineering).
reading_per_area = train_df['meter_reading'] / train_df['square_feet']
target = np.log1p(reading_per_area)

# Everything except the raw reading is a model input.
features = train_df.drop(columns=['meter_reading'])
del train_df
gc.collect()

53

In [13]:
import optuna
from optuna import Trial

category_cols = ["building_id", "site_id", "meter", "primary_use", "dayofweek", 'is_holiday']

def objective(trial: Trial, model_name = 'lgb', fast_check=False, target_meter=0, return_info=False):
    """Optuna objective: mean validation RMSE of a 3-fold CV run.

    Trains one model per fold on the module-level ``features`` / ``target``,
    pickles the list of fold models to
    ``DATA_PATH + model_name + '_models.pickle'`` and returns the validation
    RMSE averaged over the trained folds.

    Args:
        trial: Optuna trial used by the ``fit_*`` helpers to sample
            hyper-parameters.
        model_name: ``'lgb'`` or ``'xgb'``; any other value trains CatBoost.
        fast_check: Stop after the first fold.
        target_meter: Unused; kept for interface compatibility.
        return_info: Also return the models and out-of-fold predictions.

    Returns:
        ``valid_score`` or, when ``return_info`` is true, the tuple
        ``(valid_score, models, y_valid_pred, y_valid_true)`` where the last
        two hold the out-of-fold predictions and matching targets of every
        row that was part of a validation fold.
    """
    folds = 3
    seed = 666
    shuffle = False
    # scikit-learn >= 0.24 raises if random_state is set while shuffle is
    # False, so the seed is only forwarded when it has an effect.
    kf = KFold(n_splits=folds, shuffle=shuffle, random_state=seed if shuffle else None)

    X_train, y_train = features, target
    y_valid_pred_total = np.zeros(X_train.shape[0])
    validated = np.zeros(X_train.shape[0], dtype=bool)
    gc.collect()

    models = []
    valid_score = 0
    for train_idx, valid_idx in kf.split(X_train, y_train):

        # KFold yields positions, so both frames are indexed positionally.
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        print('train', len(train_idx), 'valid', len(valid_idx))

        if model_name == 'lgb':
            # LightGBM
            model, y_pred_valid, log = fit_lgbm(trial, train_data, valid_data, cat_features=category_cols,
                                            num_rounds=1000)
        elif model_name == 'xgb':
            # XGBOOST
            model, y_pred_valid, log = fit_xgb(trial, train_data, valid_data, num_rounds=150)

        else:
            model, y_pred_valid, log = fit_catboost(trial, train_data, valid_data, cat_features=category_cols,
                                            num_rounds=1000)

        y_valid_pred_total[valid_idx] = y_pred_valid
        validated[valid_idx] = True
        models.append(model)
        gc.collect()
        valid_score += log["valid/rmse"]

        if fast_check:
            break

    save_model_settings_pickle(models, DATA_PATH + model_name + '_models.pickle')

    valid_score /= len(models)
    if return_info:
        # Return out-of-fold predictions with their matching targets, so the
        # two arrays always have the same length (also with fast_check).
        return valid_score, models, y_valid_pred_total[validated], y_train.iloc[validated]
    else:
        return valid_score

In [14]:
def fit_catboost(trial, train, val, devices=(-1,), seed=None, cat_features=None, num_rounds=1500):
    """Train a CatBoost regressor with early stopping on the validation set.

    Args:
        trial: Optuna trial; samples ``learning_rate``, ``subsample`` and
            ``max_depth``.
        train: ``(X_train, y_train)`` tuple.
        val: ``(X_valid, y_valid)`` tuple, used as the evaluation set.
        devices: ``devices[0] == -1`` trains on CPU; any other value is used
            as the GPU device id.
        seed: Random seed forwarded to CatBoost (``None`` keeps its default).
        cat_features: Categorical feature names or indices for the pools.
        num_rounds: Maximum number of boosting iterations.

    Returns:
        ``(model, y_pred_valid, log)`` where ``log`` holds the best
        ``'train/rmse'`` and ``'valid/rmse'``.
    """
    X_train, y_train = train
    X_valid, y_valid = val

    model_kwargs = {
        'iterations': num_rounds,
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 1e-1),
        'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'random_seed': seed,
    }

    device = devices[0]
    if device != -1:
        # use gpu
        print(f'using gpu device_id {device}...')
        # CatBoost selects the GPU via task_type/devices. Its default GPU
        # bootstrap (Bayesian) does not accept `subsample`, so Bernoulli is
        # requested explicitly.
        model_kwargs.update({'task_type': 'GPU', 'devices': str(device),
                             'bootstrap_type': 'Bernoulli'})

    early_stop = 20
    verbose_eval = 25

    d_train = cb.Pool(X_train, label=y_train, cat_features=cat_features)
    d_valid = cb.Pool(X_valid, label=y_valid, cat_features=cat_features)

    model = cb.CatBoostRegressor(**model_kwargs)

    print('training Catboost:')
    model.fit(d_train,
              eval_set=d_valid,
              verbose=verbose_eval,
              early_stopping_rounds=early_stop)

    # predictions
    y_pred_valid = model.predict(X_valid)

    # With a single eval set CatBoost reports its best scores under
    # 'learn' / 'validation'; the default regression metric is 'RMSE'.
    best_score = model.get_best_score()
    print('best_score', best_score)
    log = {'train/rmse': best_score['learn']['RMSE'],
           'valid/rmse': best_score['validation']['RMSE']}
    return model, y_pred_valid, log

In [15]:
def fit_lgbm(trial, train, val, devices=(-1,), seed=None, cat_features=None, num_rounds=1500):
    """Fit a LightGBM regressor, early-stopped on the validation split.

    ``train`` and ``val`` are ``(X, y)`` tuples. Hyper-parameters are sampled
    from ``trial``. ``devices[0] == -1`` means CPU, any other value is passed
    on as the GPU device id. Returns ``(model, y_pred_valid, log)`` with
    ``log`` holding the best train and validation RMSE.
    """
    features_tr, labels_tr = train
    features_va, labels_va = val

    lgb_params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'objective': 'regression',
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 1e-1),
        "boosting": "gbdt",
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        "bagging_freq": 5,
        "bagging_fraction": trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        "metric": 'rmse',
        "verbosity": -1,
    }

    # Anything other than -1 selects a GPU.
    gpu_id = devices[0]
    if gpu_id != -1:
        print(f'using gpu device_id {gpu_id}...')
        lgb_params['device'] = 'gpu'
        lgb_params['gpu_device_id'] = gpu_id

    lgb_params['seed'] = seed

    train_set = lgb.Dataset(features_tr, label=labels_tr, categorical_feature=cat_features)
    valid_set = lgb.Dataset(features_va, label=labels_va, categorical_feature=cat_features)

    print('training LGB:')
    booster = lgb.train(
        lgb_params,
        train_set=train_set,
        num_boost_round=num_rounds,
        valid_sets=[train_set, valid_set],  # reported as 'training' and 'valid_1'
        verbose_eval=25,
        early_stopping_rounds=20,
    )

    # Predict with the best iteration found by early stopping.
    preds_valid = booster.predict(features_va, num_iteration=booster.best_iteration)

    best = booster.best_score
    print('best_score', best)
    log = {
        'train/rmse': best['training']['rmse'],
        'valid/rmse': best['valid_1']['rmse'],
    }
    return booster, preds_valid, log

In [16]:
def fit_xgb(trial, train, val, seed=None, num_rounds=100):
    """Train an XGBoost model with early stopping on the validation set.

    Args:
        trial: Optuna trial; samples ``lambda``, ``alpha``, ``learning_rate``,
            ``bagging_fraction`` (used as ``subsample``) and
            ``feature_fraction`` (used as ``colsample_bytree``).
        train: ``(X_train, y_train)`` tuple.
        val: ``(X_valid, y_valid)`` tuple, used for early stopping.
        seed: Optional random seed; XGBoost's default is kept when ``None``.
        num_rounds: Maximum number of boosting rounds.

    Returns:
        ``(model, y_pred_valid, log)`` where ``log['valid/rmse']`` is the
        best validation score.
    """
    X_train, y_train = train
    X_valid, y_valid = val

    params = {
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.03, 0.3),
        'subsample': trial.suggest_uniform('bagging_fraction', 0.1, 0.8),
        'colsample_bytree': trial.suggest_uniform('feature_fraction', 0.4, 0.8),
        'eval_metric':'rmse'
    }

    # Honour the seed argument instead of silently ignoring it.
    if seed is not None:
        params['seed'] = seed

    early_stop = 20
    verbose_eval = 25

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_valid, label=y_valid)

    # The last entry of the watchlist drives early stopping.
    watchlist = [(d_train,'train'), (d_valid,'valid')]

    print('training XGB:')
    model = xgb.train(params,
                      d_train,
                      evals=watchlist,
                      num_boost_round=num_rounds,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)

    # predictions, limited to the trees of the best iteration
    y_pred_valid = model.predict(d_valid, ntree_limit=model.best_ntree_limit)

    print('best_iteration', model.best_iteration)
    log = {'valid/rmse': model.best_score}
    return model, y_pred_valid, log

In [None]:
n_trials = 5


def _objective_for_selected_model(trial):
    """Evaluate one trial with the model family chosen by MODEL_NAME."""
    return objective(trial, model_name=MODEL_NAME)


# Default study direction: the returned validation RMSE is minimised.
study = optuna.create_study()
study.optimize(_objective_for_selected_model, n_trials=n_trials)
# study.optimize(objective, n_trials=n_trials)


Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.



train 13234948 valid 6617474
training Catboost:
0:	learn: 1.4763636	test: 1.4907677	best: 1.4907677 (0)	total: 4.23s	remaining: 1h 10m 22s
25:	learn: 1.0850947	test: 1.1575232	best: 1.1575232 (25)	total: 54.1s	remaining: 33m 46s
50:	learn: 0.9730186	test: 1.0748255	best: 1.0748255 (50)	total: 1m 50s	remaining: 34m 22s


In [None]:
# Report the score and sampled parameters of the best trial.
best_trial = study.best_trial
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))

In [None]:
# Tabulate all trials of the study as a DataFrame and display it.
trials_df = study.trials_dataframe()
trials_df

In [None]:
# Plot the objective value over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Re-run the cross-validation with the best parameters found; this also
# re-saves the fold models trained with those parameters.
valid_score, models, y_pred_valid, y_train = objective(optuna.trial.FixedTrial(study.best_params), model_name = MODEL_NAME, fast_check=False, target_meter=0, return_info=True)

# RMSE computed with NumPy: `mean_squared_error` is never imported in this
# notebook, so calling it raised a NameError.
# Assumes y_pred_valid and y_train have the same length.
residuals = np.asarray(y_pred_valid) - np.asarray(y_train)
v_score = np.sqrt(np.mean(residuals ** 2))
print ('v_score=', v_score)

# Compare the distributions of predictions and ground truth.
sns.distplot(y_pred_valid, label='pred')
sns.distplot(y_train, label='ground truth')
plt.legend()
plt.show()

In [None]:
# del train_df
# del weather_train_df
# gc.collect()

## Load Test Data

In [None]:
# Load the test rows. row_id is not a feature, so it is dropped right away;
# the row order of the file is kept.
test_df = pd.read_csv(DATA_PATH + 'test.csv').drop(columns="row_id")
test_df = reduce_mem_usage(test_df)

## Merge Building Data

In [None]:
# Attach building metadata; building_df is not needed afterwards.
test_df = test_df.merge(building_df, on='building_id', how='left')
del building_df
gc.collect()

## Fill Weather Information

In [None]:
# Prepare the test weather exactly like the training weather: align the
# timestamps, fill the gaps, then downcast.
weather_test_df = pd.read_csv(DATA_PATH + 'weather_test.csv')
weather_test_df["timestamp"] = pd.to_datetime(weather_test_df["timestamp"])
weather_test_df = timestamp_align(weather_test_df)
weather_test_df = fill_weather_dataset(weather_test_df)
# use_float16=True matches the training weather frame, so the models see
# test features rounded to the same precision they were trained on.
weather_test_df = reduce_mem_usage(weather_test_df, use_float16=True)


## Merge Weather Data

In [None]:
# Attach the hourly weather of each building's site.
weather_keys = ['timestamp', 'site_id']
test_df = test_df.merge(weather_test_df, how='left', on=weather_keys)
del weather_test_df
gc.collect()

## Features Engineering

In [None]:
# Apply the same feature engineering as for the training data.
test_df = features_engineering(test_df)

In [None]:
# Preview the first 20 rows of the engineered test frame.
test_df.head(20)

## Prediction

In [None]:
# Load the fold models pickled by the last objective() run for MODEL_NAME.
models = load_model_settings_pickle(DATA_PATH + MODEL_NAME + '_models.pickle')

In [None]:
# Average the fold models' predictions. The models predict
# log1p(meter_reading / square_feet), so each prediction is mapped back with
# expm1 and multiplied by square_feet.
square_feet = test_df['square_feet'].values

if MODEL_NAME == 'xgb':
    d_test = xgb.DMatrix(test_df)

# A zero-initialised accumulator replaces the former `results == []` check,
# which compared a NumPy array with a list from the second model on.
results = np.zeros(len(test_df))
for model in models:
    if MODEL_NAME == 'lgb':
        raw_pred = model.predict(test_df, num_iteration=model.best_iteration)
    elif MODEL_NAME == 'xgb':
        raw_pred = model.predict(d_test, ntree_limit=model.best_ntree_limit)
    else:
        # CatBoost (the fallback family in objective()): its predict() takes
        # no `num_iteration` argument, so the fitted model is used as-is.
        raw_pred = model.predict(test_df)
    results += (np.expm1(raw_pred) * square_feet) / len(models)
    gc.collect()

In [None]:
# The predictions are held in `results`; free the test frame and the models.
del test_df, models
gc.collect()

## Submission

In [None]:
# The sample submission supplies row_id in the same order as the test rows.
sample_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
reduce_mem_usage(sample_submission)  # downcasts in place; return value not needed

# Negative predictions are clipped to zero; there is no upper bound.
non_negative_readings = np.clip(results, 0, a_max=None)
sample_submission['meter_reading'] = non_negative_readings
sample_submission.to_csv('submission.csv', index=False, float_format='%.4f')

# results_df = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(results, 0, a_max=None)})
# del row_ids,results
# gc.collect()
# results_df.to_csv("submission.csv", index=False)
# results_df.head(20)