In [4]:
import pandas as pd
import numpy as np
import datetime
from meteocalc import feels_like, Temp
import gc
from sklearn.preprocessing import LabelEncoder


In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [6]:
train_df = import_data('train.csv')
building_df = import_data('building_metadata.csv')
weather_df = import_data('weather_train.csv')

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 2.65 MB
Decreased by 72.4%


In [7]:
train_df = train_df [ train_df['building_id'] != 1099 ]
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
train_df = train_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

In [8]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
103,105,0,2016-01-01,23.3036
104,106,0,2016-01-01,0.3746
105,106,3,2016-01-01,0.0
106,107,0,2016-01-01,175.184006
107,108,0,2016-01-01,91.265297


In [9]:
def fill_weather_dataset(weather_df):
    
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    weather_df = weather_df.set_index(['site_id','month'])
    
    return weather_df
    
weather_df = fill_weather_dataset(weather_df)

In [10]:
weather_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,datetime,day,week
site_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.5,0.0,0.0,2016-01-01 00:00:00,1,53
0,1,2016-01-01 01:00:00,24.40625,,21.09375,-1.0,1020.0,70.0,1.5,2016-01-01 01:00:00,1,53
0,1,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0,2016-01-01 02:00:00,1,53
0,1,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0,2016-01-01 03:00:00,1,53
0,1,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609,2016-01-01 04:00:00,1,53


In [11]:
percip = weather_df.groupby(['site_id','month'])[['precip_depth_1_hr']].mean()
cloud = weather_df.groupby(['site_id','month'])[['cloud_coverage']].mean()
airtemp = weather_df.groupby(['site_id','month'])[['air_temperature']].mean()
dewtemp = weather_df.groupby(['site_id','month'])[['dew_temperature']].mean()
sealevelp = weather_df.groupby(['site_id','month'])[['sea_level_pressure']].mean()
windd = weather_df.groupby(['site_id','month'])[['wind_direction']].mean()
winds = weather_df.groupby(['site_id','month'])[['wind_speed']].mean()

In [12]:
pd.set_option('display.max_rows', None)
# percip
# cloud
# airtemp
# dewtemp
# sealevelp
# windd
# winds

In [13]:
#for i in range(12):
#    percip.loc[(1,i+1)] = percip.loc[(7,i+1)]
#    percip.loc[(5,i+1)] = percip.loc[(7,i+1)]
#    percip.loc[(12,i+1)] = percip.loc[(7,i+1)]
    
#    cloud.loc[(7,i+1)] = cloud.loc[(5,i+1)]
#    cloud.loc[(11,i+1)] = cloud.loc[(5,i+1)]
    
#    sealevelp.loc[(5,i+1)] = sealevelp.loc[(7,i+1)]

In [14]:
weather_info = percip.join(cloud)
weather_info = weather_info.join(airtemp)
weather_info = weather_info.join(dewtemp)
weather_info = weather_info.join(sealevelp)
weather_info = weather_info.join(windd)
weather_info = weather_info.join(winds)
weather_info.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,precip_depth_1_hr,cloud_coverage,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed
site_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,1.796875,3.667969,14.710938,9.210938,1018.0,196.75,3.642578
0,2,0.593262,2.3125,16.140625,8.867188,1020.0,198.0,4.054688
0,3,1.740234,3.265625,21.265625,14.429688,1018.5,174.5,3.632812
0,4,0.197266,3.027344,22.4375,14.59375,1017.5,142.875,3.759766
0,5,1.214844,2.765625,24.734375,17.234375,1016.0,157.375,3.220703


In [15]:
weather_df.update(weather_info,overwrite=False)
weather_df = weather_df.reset_index()
weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
weather_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,1.796875,1019.5,0.0,0.0
1,0,2016-01-01 01:00:00,24.40625,3.667969,21.09375,-1.0,1020.0,70.0,1.5
2,0,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0
3,0,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609


In [16]:
def get_meteorological_features(data):
    
    def calculate_rh(df):
        df['relative_humidity'] = 100 * (np.exp((17.625 * df['dew_temperature']) / (243.04 + df['dew_temperature'])) / np.exp((17.625 * df['air_temperature'])/(243.04 + df['air_temperature'])))
    def calculate_fl(df):
        flike_final = []
        flike = []
        # calculate Feels Like temperature
        for i in range(len(df)):
            at = df['air_temperature'][i]
            rh = df['relative_humidity'][i]
            ws = df['wind_speed'][i]
            flike.append(feels_like(Temp(at, unit = 'C'), rh, ws))
        for i in range(len(flike)):
            flike_final.append(flike[i].f)
        df['feels_like'] = flike_final
        del flike_final, flike, at, rh, ws
    calculate_rh(data)
    calculate_fl(data)
    
    return data

In [17]:
weather_df = get_meteorological_features(weather_df)
weather_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,relative_humidity,feels_like
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,1.796875,1019.5,0.0,0.0,73.8125,77.0
1,0,2016-01-01 01:00:00,24.40625,3.667969,21.09375,-1.0,1020.0,70.0,1.5,81.875,75.93125
2,0,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0,90.1875,73.034375
3,0,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0,97.0,69.96875
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609,100.0,68.0


In [18]:
def features_engineering(df):
    
    # Sort by timestamp
    df.sort_values("timestamp")
    df.reset_index(drop=True)
    
    # Add more features
    df["timestamp"] = pd.to_datetime(df["timestamp"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["dayofweek"] = df["timestamp"].dt.dayofweek
    
    df['month'] = df['timestamp'].dt.month
    df['month'].replace((1, 2, 3, 4), 1, inplace = True)
    df['month'].replace((5, 6, 7, 8), 2, inplace = True)
    df['month'].replace((9, 10, 11, 12), 3, inplace = True)
  
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    # Remove Unused Columns
    drop = ["timestamp"]
    df = df.drop(drop, axis=1)
    gc.collect()
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [19]:
# weather_df.head()
# train_df.head()

In [20]:
weather_df['timestamp'] = pd.to_datetime(weather_df['timestamp'], format="%Y-%m-%d %H:%M:%S")

# merge data
train_df = train_df.merge(building_df, left_on='building_id',right_on='building_id',how='left')
train_df = train_df.merge(weather_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])
del weather_df
gc.collect()

89

In [21]:
# feature engineering
train_df = features_engineering(train_df)

# transform target variable
train_df['square_feet'] = np.log1p(train_df["square_feet"])

In [22]:
train_df = train_df.drop(['month'], axis=1)
train_df = train_df.drop(['sea_level_pressure','wind_direction','wind_speed','year_built','floor_count'], axis=1)
train_df['meter_reading'] = np.log1p(train_df['meter_reading'])
train_df.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,relative_humidity,feels_like,hour,dayofweek
0,105,0,3.190624,1,0,2.470823,3.800781,0.0,2.400391,,90.625,36.935821,0,4
1,106,0,0.318163,1,0,2.26067,3.800781,0.0,2.400391,,90.625,36.935821,0,4
2,106,3,0.0,1,0,2.26067,3.800781,0.0,2.400391,,90.625,36.935821,0,4
3,107,0,5.171529,1,0,2.524764,3.800781,0.0,2.400391,,90.625,36.935821,0,4
4,108,0,4.524668,1,0,2.510359,3.800781,0.0,2.400391,,90.625,36.935821,0,4


## lightGBM randomsearchCV

In [23]:
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [24]:
categorical_features = ["building_id", "site_id", "meter", "primary_use",'dayofweek','hour']

mdl =  lgb.LGBMRegressor(boosting_type= 'gbdt',
          objective = 'regression',
          n_jobs = -1, # Updated from 'nthread'
          silent = 0
          #feature_fraction= 0.85,
          #reg_lambda= 2,
          #max_depth = params['max_depth'],
          #max_bin = params['max_bin'],
          #subsample_for_bin = params['subsample_for_bin'],
          #subsample = params['subsample'],
          #subsample_freq = params['subsample_freq'],
          #min_split_gain = params['min_split_gain'],
          #min_child_weight = params['min_child_weight'],
          #min_child_samples = params['min_child_samples'],
          #scale_pos_weight = params['scale_pos_weight'] 
          ) 

In [25]:
param_test ={
              "num_leaves": sp_randint(6,1280),
              "learning_rate": [0.01, 0.05, 0.1],
              "feature_fraction": [0.4,0.5,0.6,0.7,0.8,0.9,1],
              "reg_lambda": [0,0.1,1,2,7,10],
              "reg_alpha": [0,0.1,1,2,7,10],
              "subsample": [0.6,0.7,0.8,0.9,1],
              "min_child_samples": sp_randint(100, 1000)
            }

In [26]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2)))

from sklearn.metrics.scorer import make_scorer

my_scorer = make_scorer(rmsle, greater_is_better=False)

In [27]:
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100


#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
#clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=mdl, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    cv=3,
    scoring = my_scorer,
    refit=True,
    random_state=314,
    verbose=200)

In [22]:
#del train_df,weather_df, building_df

In [28]:
X_train = train_df.drop(['meter_reading'], axis=1)
y_train = train_df['meter_reading']

gs.fit(X_train, y_train, categorical_feature = categorical_features)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] feature_fraction=0.4, learning_rate=0.05, min_child_samples=813, num_leaves=860, reg_alpha=1, reg_lambda=0, subsample=0.8 


New categorical_feature is ['building_id', 'dayofweek', 'hour', 'meter', 'primary_use', 'site_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[CV]  feature_fraction=0.4, learning_rate=0.05, min_child_samples=813, num_leaves=860, reg_alpha=1, reg_lambda=0, subsample=0.8, score=-0.401, total= 2.6min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.6min remaining:    0.0s
[CV] feature_fraction=0.4, learning_rate=0.05, min_child_samples=813, num_leaves=860, reg_alpha=1, reg_lambda=0, subsample=0.8 


  


[CV]  feature_fraction=0.4, learning_rate=0.05, min_child_samples=813, num_leaves=860, reg_alpha=1, reg_lambda=0, subsample=0.8, score=-0.388, total= 2.7min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.3min remaining:    0.0s
[CV] feature_fraction=0.4, learning_rate=0.05, min_child_samples=813, num_leaves=860, reg_alpha=1, reg_lambda=0, subsample=0.8 
[CV]  feature_fraction=0.4, learning_rate=0.05, min_child_samples=813, num_leaves=860, reg_alpha=1, reg_lambda=0, subsample=0.8, score=-0.394, total= 2.9min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  8.2min remaining:    0.0s
[CV] feature_fraction=0.6, learning_rate=0.01, min_child_samples=954, num_leaves=77, reg_alpha=0, reg_lambda=2, subsample=1 
[CV]  feature_fraction=0.6, learning_rate=0.01, min_child_samples=954, num_leaves=77, reg_alpha=0, reg_lambda=2, subsample=1, score=-0.509, total= 1.8min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  9.9min remaining:    0.0s
[CV] feature_fraction=0.6, learning_rat

[CV]  feature_fraction=0.6, learning_rate=0.05, min_child_samples=634, num_leaves=256, reg_alpha=1, reg_lambda=0.1, subsample=0.8, score=-0.382, total= 2.0min
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 75.4min remaining:    0.0s
[CV] feature_fraction=0.6, learning_rate=0.05, min_child_samples=634, num_leaves=256, reg_alpha=1, reg_lambda=0.1, subsample=0.8 
[CV]  feature_fraction=0.6, learning_rate=0.05, min_child_samples=634, num_leaves=256, reg_alpha=1, reg_lambda=0.1, subsample=0.8, score=-0.365, total= 2.5min
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed: 77.9min remaining:    0.0s
[CV] feature_fraction=0.6, learning_rate=0.05, min_child_samples=634, num_leaves=256, reg_alpha=1, reg_lambda=0.1, subsample=0.8 
[CV]  feature_fraction=0.6, learning_rate=0.05, min_child_samples=634, num_leaves=256, reg_alpha=1, reg_lambda=0.1, subsample=0.8, score=-0.375, total= 2.9min
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 80.8min remaining:    0.0s
[CV] feature_fraction=0

[CV]  feature_fraction=0.5, learning_rate=0.05, min_child_samples=850, num_leaves=1210, reg_alpha=10, reg_lambda=0.1, subsample=0.9, score=-0.373, total= 3.6min
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 148.2min remaining:    0.0s
[CV] feature_fraction=0.9, learning_rate=0.01, min_child_samples=534, num_leaves=573, reg_alpha=0, reg_lambda=2, subsample=0.9 
[CV]  feature_fraction=0.9, learning_rate=0.01, min_child_samples=534, num_leaves=573, reg_alpha=0, reg_lambda=2, subsample=0.9, score=-0.457, total= 3.8min
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed: 152.1min remaining:    0.0s
[CV] feature_fraction=0.9, learning_rate=0.01, min_child_samples=534, num_leaves=573, reg_alpha=0, reg_lambda=2, subsample=0.9 
[CV]  feature_fraction=0.9, learning_rate=0.01, min_child_samples=534, num_leaves=573, reg_alpha=0, reg_lambda=2, subsample=0.9, score=-0.437, total= 4.1min
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 156.2min remaining:    0.0s
[CV] feature_fraction=0.9,

[CV]  feature_fraction=0.6, learning_rate=0.05, min_child_samples=940, num_leaves=423, reg_alpha=1, reg_lambda=1, subsample=0.7, score=-0.358, total= 3.0min
[Parallel(n_jobs=1)]: Done  71 out of  71 | elapsed: 218.0min remaining:    0.0s
[CV] feature_fraction=0.6, learning_rate=0.05, min_child_samples=940, num_leaves=423, reg_alpha=1, reg_lambda=1, subsample=0.7 
[CV]  feature_fraction=0.6, learning_rate=0.05, min_child_samples=940, num_leaves=423, reg_alpha=1, reg_lambda=1, subsample=0.7, score=-0.372, total= 3.0min
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 221.0min remaining:    0.0s
[CV] feature_fraction=0.8, learning_rate=0.1, min_child_samples=149, num_leaves=117, reg_alpha=7, reg_lambda=0, subsample=1 
[CV]  feature_fraction=0.8, learning_rate=0.1, min_child_samples=149, num_leaves=117, reg_alpha=7, reg_lambda=0, subsample=1, score=-0.369, total= 2.0min
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed: 223.0min remaining:    0.0s
[CV] feature_fraction=0.8, learning_

[CV]  feature_fraction=0.8, learning_rate=0.01, min_child_samples=479, num_leaves=160, reg_alpha=7, reg_lambda=7, subsample=0.6, score=-0.489, total= 2.8min
[Parallel(n_jobs=1)]: Done  94 out of  94 | elapsed: 280.6min remaining:    0.0s
[CV] feature_fraction=0.8, learning_rate=0.01, min_child_samples=479, num_leaves=160, reg_alpha=7, reg_lambda=7, subsample=0.6 
[CV]  feature_fraction=0.8, learning_rate=0.01, min_child_samples=479, num_leaves=160, reg_alpha=7, reg_lambda=7, subsample=0.6, score=-0.465, total= 2.5min
[Parallel(n_jobs=1)]: Done  95 out of  95 | elapsed: 283.2min remaining:    0.0s
[CV] feature_fraction=0.8, learning_rate=0.01, min_child_samples=479, num_leaves=160, reg_alpha=7, reg_lambda=7, subsample=0.6 
[CV]  feature_fraction=0.8, learning_rate=0.01, min_child_samples=479, num_leaves=160, reg_alpha=7, reg_lambda=7, subsample=0.6, score=-0.474, total= 3.1min
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 286.3min remaining:    0.0s
[CV] feature_fraction=0.5, lea

[CV]  feature_fraction=0.4, learning_rate=0.1, min_child_samples=969, num_leaves=953, reg_alpha=0, reg_lambda=1, subsample=0.8, score=-0.374, total= 3.3min
[Parallel(n_jobs=1)]: Done 117 out of 117 | elapsed: 356.6min remaining:    0.0s
[CV] feature_fraction=1, learning_rate=0.01, min_child_samples=523, num_leaves=771, reg_alpha=10, reg_lambda=0.1, subsample=0.7 
[CV]  feature_fraction=1, learning_rate=0.01, min_child_samples=523, num_leaves=771, reg_alpha=10, reg_lambda=0.1, subsample=0.7, score=-0.448, total= 4.3min
[Parallel(n_jobs=1)]: Done 118 out of 118 | elapsed: 360.8min remaining:    0.0s
[CV] feature_fraction=1, learning_rate=0.01, min_child_samples=523, num_leaves=771, reg_alpha=10, reg_lambda=0.1, subsample=0.7 
[CV]  feature_fraction=1, learning_rate=0.01, min_child_samples=523, num_leaves=771, reg_alpha=10, reg_lambda=0.1, subsample=0.7, score=-0.430, total= 4.8min
[Parallel(n_jobs=1)]: Done 119 out of 119 | elapsed: 365.6min remaining:    0.0s
[CV] feature_fraction=1, le

[CV]  feature_fraction=0.7, learning_rate=0.05, min_child_samples=755, num_leaves=1027, reg_alpha=10, reg_lambda=1, subsample=1, score=-0.347, total= 3.4min
[Parallel(n_jobs=1)]: Done 140 out of 140 | elapsed: 436.2min remaining:    0.0s
[CV] feature_fraction=0.7, learning_rate=0.05, min_child_samples=755, num_leaves=1027, reg_alpha=10, reg_lambda=1, subsample=1 
[CV]  feature_fraction=0.7, learning_rate=0.05, min_child_samples=755, num_leaves=1027, reg_alpha=10, reg_lambda=1, subsample=1, score=-0.368, total= 4.7min
[Parallel(n_jobs=1)]: Done 141 out of 141 | elapsed: 440.9min remaining:    0.0s
[CV] feature_fraction=1, learning_rate=0.1, min_child_samples=250, num_leaves=1228, reg_alpha=0, reg_lambda=0, subsample=0.6 
[CV]  feature_fraction=1, learning_rate=0.1, min_child_samples=250, num_leaves=1228, reg_alpha=0, reg_lambda=0, subsample=0.6, score=-0.367, total= 4.1min
[Parallel(n_jobs=1)]: Done 142 out of 142 | elapsed: 445.0min remaining:    0.0s
[CV] feature_fraction=1, learning_

[CV]  feature_fraction=0.4, learning_rate=0.01, min_child_samples=955, num_leaves=147, reg_alpha=10, reg_lambda=10, subsample=0.7, score=-0.520, total= 2.1min
[Parallel(n_jobs=1)]: Done 163 out of 163 | elapsed: 510.5min remaining:    0.0s
[CV] feature_fraction=0.4, learning_rate=0.01, min_child_samples=955, num_leaves=147, reg_alpha=10, reg_lambda=10, subsample=0.7 
[CV]  feature_fraction=0.4, learning_rate=0.01, min_child_samples=955, num_leaves=147, reg_alpha=10, reg_lambda=10, subsample=0.7, score=-0.497, total= 2.1min
[Parallel(n_jobs=1)]: Done 164 out of 164 | elapsed: 512.6min remaining:    0.0s
[CV] feature_fraction=0.4, learning_rate=0.01, min_child_samples=955, num_leaves=147, reg_alpha=10, reg_lambda=10, subsample=0.7 
[CV]  feature_fraction=0.4, learning_rate=0.01, min_child_samples=955, num_leaves=147, reg_alpha=10, reg_lambda=10, subsample=0.7, score=-0.507, total= 2.3min
[Parallel(n_jobs=1)]: Done 165 out of 165 | elapsed: 514.9min remaining:    0.0s
[CV] feature_fractio

[CV]  feature_fraction=0.5, learning_rate=0.05, min_child_samples=845, num_leaves=1128, reg_alpha=2, reg_lambda=7, subsample=0.9, score=-0.373, total= 4.0min
[Parallel(n_jobs=1)]: Done 186 out of 186 | elapsed: 584.8min remaining:    0.0s
[CV] feature_fraction=0.7, learning_rate=0.01, min_child_samples=904, num_leaves=1078, reg_alpha=7, reg_lambda=0, subsample=0.9 
[CV]  feature_fraction=0.7, learning_rate=0.01, min_child_samples=904, num_leaves=1078, reg_alpha=7, reg_lambda=0, subsample=0.9, score=-0.465, total= 4.5min
[Parallel(n_jobs=1)]: Done 187 out of 187 | elapsed: 589.3min remaining:    0.0s
[CV] feature_fraction=0.7, learning_rate=0.01, min_child_samples=904, num_leaves=1078, reg_alpha=7, reg_lambda=0, subsample=0.9 
[CV]  feature_fraction=0.7, learning_rate=0.01, min_child_samples=904, num_leaves=1078, reg_alpha=7, reg_lambda=0, subsample=0.9, score=-0.447, total= 4.0min
[Parallel(n_jobs=1)]: Done 188 out of 188 | elapsed: 593.4min remaining:    0.0s
[CV] feature_fraction=0.7

[CV]  feature_fraction=0.8, learning_rate=0.01, min_child_samples=909, num_leaves=201, reg_alpha=1, reg_lambda=0.1, subsample=0.6, score=-0.484, total= 2.6min
[CV] feature_fraction=0.8, learning_rate=0.01, min_child_samples=909, num_leaves=201, reg_alpha=1, reg_lambda=0.1, subsample=0.6 
[CV]  feature_fraction=0.8, learning_rate=0.01, min_child_samples=909, num_leaves=201, reg_alpha=1, reg_lambda=0.1, subsample=0.6, score=-0.462, total= 2.7min
[CV] feature_fraction=0.8, learning_rate=0.01, min_child_samples=909, num_leaves=201, reg_alpha=1, reg_lambda=0.1, subsample=0.6 
[CV]  feature_fraction=0.8, learning_rate=0.01, min_child_samples=909, num_leaves=201, reg_alpha=1, reg_lambda=0.1, subsample=0.6, score=-0.472, total= 2.9min
[CV] feature_fraction=0.4, learning_rate=0.05, min_child_samples=826, num_leaves=454, reg_alpha=7, reg_lambda=7, subsample=0.6 
[CV]  feature_fraction=0.4, learning_rate=0.05, min_child_samples=826, num_leaves=454, reg_alpha=7, reg_lambda=7, subsample=0.6, score=

[CV]  feature_fraction=0.4, learning_rate=0.01, min_child_samples=397, num_leaves=93, reg_alpha=7, reg_lambda=1, subsample=1, score=-0.510, total= 2.0min
[CV] feature_fraction=0.5, learning_rate=0.01, min_child_samples=598, num_leaves=1208, reg_alpha=0, reg_lambda=7, subsample=0.8 
[CV]  feature_fraction=0.5, learning_rate=0.01, min_child_samples=598, num_leaves=1208, reg_alpha=0, reg_lambda=7, subsample=0.8, score=-0.482, total= 3.5min
[CV] feature_fraction=0.5, learning_rate=0.01, min_child_samples=598, num_leaves=1208, reg_alpha=0, reg_lambda=7, subsample=0.8 
[CV]  feature_fraction=0.5, learning_rate=0.01, min_child_samples=598, num_leaves=1208, reg_alpha=0, reg_lambda=7, subsample=0.8, score=-0.463, total= 3.8min
[CV] feature_fraction=0.5, learning_rate=0.01, min_child_samples=598, num_leaves=1208, reg_alpha=0, reg_lambda=7, subsample=0.8 
[CV]  feature_fraction=0.5, learning_rate=0.01, min_child_samples=598, num_leaves=1208, reg_alpha=0, reg_lambda=7, subsample=0.8, score=-0.472,

[CV]  feature_fraction=0.6, learning_rate=0.01, min_child_samples=723, num_leaves=149, reg_alpha=0, reg_lambda=0, subsample=0.6, score=-0.476, total= 2.5min
[CV] feature_fraction=0.6, learning_rate=0.01, min_child_samples=723, num_leaves=149, reg_alpha=0, reg_lambda=0, subsample=0.6 
[CV]  feature_fraction=0.6, learning_rate=0.01, min_child_samples=723, num_leaves=149, reg_alpha=0, reg_lambda=0, subsample=0.6, score=-0.486, total= 2.6min
[CV] feature_fraction=1, learning_rate=0.01, min_child_samples=787, num_leaves=737, reg_alpha=0.1, reg_lambda=10, subsample=0.6 
[CV]  feature_fraction=1, learning_rate=0.01, min_child_samples=787, num_leaves=737, reg_alpha=0.1, reg_lambda=10, subsample=0.6, score=-0.449, total= 4.3min
[CV] feature_fraction=1, learning_rate=0.01, min_child_samples=787, num_leaves=737, reg_alpha=0.1, reg_lambda=10, subsample=0.6 
[CV]  feature_fraction=1, learning_rate=0.01, min_child_samples=787, num_leaves=737, reg_alpha=0.1, reg_lambda=10, subsample=0.6, score=-0.431

[CV]  feature_fraction=0.9, learning_rate=0.01, min_child_samples=286, num_leaves=1218, reg_alpha=2, reg_lambda=10, subsample=0.8, score=-0.444, total= 5.4min
[CV] feature_fraction=0.9, learning_rate=0.01, min_child_samples=286, num_leaves=1218, reg_alpha=2, reg_lambda=10, subsample=0.8 
[CV]  feature_fraction=0.9, learning_rate=0.01, min_child_samples=286, num_leaves=1218, reg_alpha=2, reg_lambda=10, subsample=0.8, score=-0.427, total= 5.2min
[CV] feature_fraction=0.9, learning_rate=0.01, min_child_samples=286, num_leaves=1218, reg_alpha=2, reg_lambda=10, subsample=0.8 
[CV]  feature_fraction=0.9, learning_rate=0.01, min_child_samples=286, num_leaves=1218, reg_alpha=2, reg_lambda=10, subsample=0.8, score=-0.436, total= 5.7min
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 970.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=-1,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective='regression',
                                           random_state=None, reg_alpha=...
                                        'min_child_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FC19CABDA0>,
                                        'num

In [29]:
print(gs.best_params_)

{'feature_fraction': 0.9, 'learning_rate': 0.05, 'min_child_samples': 553, 'num_leaves': 1092, 'reg_alpha': 0, 'reg_lambda': 0.1, 'subsample': 0.7}


In [30]:
best_mdl = gs.best_estimator_
best_mdl

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.9, importance_type='split', learning_rate=0.05,
              max_depth=-1, min_child_samples=553, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=1092,
              objective='regression', random_state=None, reg_alpha=0,
              reg_lambda=0.1, silent=0, subsample=0.7, subsample_for_bin=200000,
              subsample_freq=0)

In [31]:
results = pd.DataFrame(gs.cv_results_)
results.head(150)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feature_fraction,param_learning_rate,param_min_child_samples,param_num_leaves,param_reg_alpha,param_reg_lambda,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,117.348981,4.28495,45.813446,2.578928,0.4,0.05,813,860,1.0,0.0,0.8,"{'feature_fraction': 0.4, 'learning_rate': 0.0...",-0.401045,-0.388379,-0.393592,-0.394338,0.005198,60
1,74.787462,1.728945,28.639091,0.500438,0.6,0.01,954,77,0.0,2.0,1.0,"{'feature_fraction': 0.6, 'learning_rate': 0.0...",-0.509492,-0.484588,-0.494892,-0.496324,0.010218,95
2,242.128626,20.113821,49.342728,1.444292,1.0,0.01,104,934,2.0,7.0,0.6,"{'feature_fraction': 1, 'learning_rate': 0.01,...",-0.444789,-0.42784,-0.43585,-0.43616,0.006923,66
3,144.129398,6.254157,50.515705,2.837559,0.4,0.01,211,835,10.0,1.0,0.9,"{'feature_fraction': 0.4, 'learning_rate': 0.0...",-0.507739,-0.4867,-0.496028,-0.496822,0.008607,96
4,125.322912,2.471132,35.560721,0.834106,0.7,0.01,748,233,1.0,0.1,0.6,"{'feature_fraction': 0.7, 'learning_rate': 0.0...",-0.486628,-0.465884,-0.475263,-0.475925,0.008482,88
5,181.478473,11.375203,52.566665,4.112759,0.9,0.05,289,1020,10.0,2.0,0.9,"{'feature_fraction': 0.9, 'learning_rate': 0.0...",-0.363841,-0.340956,-0.36665,-0.357149,0.011507,2
6,89.225371,8.461391,32.034931,0.256247,0.5,0.05,488,143,2.0,1.0,0.6,"{'feature_fraction': 0.5, 'learning_rate': 0.0...",-0.399425,-0.380525,-0.384536,-0.388162,0.008131,59
7,147.872709,0.81215,48.874631,1.942184,0.9,0.1,224,885,1.0,0.0,0.6,"{'feature_fraction': 0.9, 'learning_rate': 0.1...",-0.366024,-0.338947,-0.369856,-0.358276,0.013756,6
8,107.838387,19.484845,40.636893,1.530686,0.6,0.05,634,256,1.0,0.1,0.8,"{'feature_fraction': 0.6, 'learning_rate': 0.0...",-0.382126,-0.364786,-0.375064,-0.373992,0.00712,57
9,195.050645,9.614584,59.297577,1.491574,0.7,0.05,801,1074,0.0,1.0,0.7,"{'feature_fraction': 0.7, 'learning_rate': 0.0...",-0.365669,-0.346112,-0.367443,-0.359741,0.009665,20


In [32]:
gs.best_score_

-0.35684080701187665

In [46]:
test_df = import_data('test.csv')
building_df = import_data('building_metadata.csv')
weather_df = import_data('weather_test.csv')

Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.65 MB
Decreased by 71.8%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 5.25 MB
Decreased by 72.4%


In [47]:
#test_df = test_df [ test_df['building_id'] != 1099 ]
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
#test_df = test_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

In [48]:
weather_df = fill_weather_dataset(weather_df)

percip = weather_df.groupby(['site_id','month'])[['precip_depth_1_hr']].mean()
cloud = weather_df.groupby(['site_id','month'])[['cloud_coverage']].mean()
airtemp = weather_df.groupby(['site_id','month'])[['air_temperature']].mean()
dewtemp = weather_df.groupby(['site_id','month'])[['dew_temperature']].mean()
sealevelp = weather_df.groupby(['site_id','month'])[['sea_level_pressure']].mean()
windd = weather_df.groupby(['site_id','month'])[['wind_direction']].mean()
winds = weather_df.groupby(['site_id','month'])[['wind_speed']].mean()

In [34]:
pd.set_option('display.max_rows', None)
# percip
# cloud
# airtemp
# dewtemp
# sealevelp
# windd
# winds

Unnamed: 0_level_0,Unnamed: 1_level_0,precip_depth_1_hr
site_id,month,Unnamed: 2_level_1
0,1,0.654297
0,2,0.229858
0,3,0.134399
0,4,0.354004
0,5,1.537109
0,6,2.132812
0,7,2.560547
0,8,1.849609
0,9,2.21875
0,10,0.784668


In [35]:
for i in range(12):
    percip.loc[(1,i+1)] = percip.loc[(7,i+1)]
    percip.loc[(5,i+1)] = percip.loc[(7,i+1)]
    percip.loc[(12,i+1)] = percip.loc[(7,i+1)]
    
    cloud.loc[(7,i+1)] = cloud.loc[(5,i+1)]
    cloud.loc[(11,i+1)] = cloud.loc[(5,i+1)]
    
    sealevelp.loc[(5,i+1)] = sealevelp.loc[(7,i+1)]

In [49]:
weather_info = percip.join(cloud)
weather_info = weather_info.join(airtemp)
weather_info = weather_info.join(dewtemp)
weather_info = weather_info.join(sealevelp)
weather_info = weather_info.join(windd)
weather_info = weather_info.join(winds)
weather_info.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,precip_depth_1_hr,cloud_coverage,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed
site_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,0.654297,2.929688,15.96875,10.429688,1021.5,182.375,3.982422
0,2,0.229858,2.623047,20.5625,14.945312,1021.0,145.125,3.306641
0,3,0.134399,2.160156,19.296875,10.070312,1020.0,171.75,3.962891
0,4,0.354004,2.589844,22.859375,15.226562,1017.0,152.25,4.082031
0,5,1.537109,3.021484,24.390625,18.78125,1016.0,145.625,3.808594


In [50]:
weather_df.update(weather_info,overwrite=False)
weather_df = weather_df.reset_index()
weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
weather_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.796875,4.0,11.703125,0.654297,1021.5,100.0,3.599609
1,0,2017-01-01 01:00:00,17.796875,2.0,12.796875,0.0,1022.0,130.0,3.099609
2,0,2017-01-01 02:00:00,16.09375,0.0,12.796875,0.0,1022.0,140.0,3.099609
3,0,2017-01-01 03:00:00,17.203125,0.0,13.296875,0.0,1022.0,140.0,3.099609
4,0,2017-01-01 04:00:00,16.703125,2.0,13.296875,0.0,1022.5,130.0,2.599609


In [51]:
weather_df = get_meteorological_features(weather_df)
weather_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,relative_humidity,feels_like
0,0,2017-01-01 00:00:00,17.796875,4.0,11.703125,0.654297,1021.5,100.0,3.599609,67.4375,64.034375
1,0,2017-01-01 01:00:00,17.796875,2.0,12.796875,0.0,1022.0,130.0,3.099609,72.5625,64.034375
2,0,2017-01-01 02:00:00,16.09375,0.0,12.796875,0.0,1022.0,140.0,3.099609,80.75,60.96875
3,0,2017-01-01 03:00:00,17.203125,0.0,13.296875,0.0,1022.0,140.0,3.099609,77.875,62.965625
4,0,2017-01-01 04:00:00,16.703125,2.0,13.296875,0.0,1022.5,130.0,2.599609,80.3125,62.065625


In [52]:
weather_df['timestamp'] = pd.to_datetime(weather_df['timestamp'], format="%Y-%m-%d %H:%M:%S")

# merge data
test_df = test_df.merge(building_df, left_on='building_id',right_on='building_id',how='left')
test_df = test_df.merge(weather_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])
del weather_df
gc.collect()

76

In [53]:
# feature engineering
test_df = features_engineering(test_df)

# transform target variable
test_df['square_feet'] = np.log1p(test_df["square_feet"])

test_df = test_df.drop(['month'], axis=1)
test_df = test_df.drop(['sea_level_pressure','wind_direction','wind_speed','year_built','floor_count'], axis=1)
test_df.head()

Unnamed: 0,row_id,building_id,meter,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,relative_humidity,feels_like,hour,dayofweek
0,0,0,0,0,0,2.293916,17.796875,4.0,11.703125,0.654297,67.4375,64.034375,0,6
1,1,1,0,0,0,2.187034,17.796875,4.0,11.703125,0.654297,67.4375,64.034375,0,6
2,2,2,0,0,0,2.260709,17.796875,4.0,11.703125,0.654297,67.4375,64.034375,0,6
3,3,3,0,0,0,2.404477,17.796875,4.0,11.703125,0.654297,67.4375,64.034375,0,6
4,4,4,0,0,0,2.538966,17.796875,4.0,11.703125,0.654297,67.4375,64.034375,0,6


In [None]:
#{'feature_fraction': 1, 
# 'learning_rate': 0.1, 
# 'min_child_samples': 250, 
# 'num_leaves': 1228, 
# 'reg_alpha': 0, 
# 'reg_lambda': 0, 
# 'subsample': 0.6}

# THIS -RMLSE
#{'feature_fraction': 0.9, 
# 'learning_rate': 0.05, 
# 'min_child_samples': 553, 
# 'num_leaves': 1092, 'reg_alpha': 0, 
# 'reg_lambda': 0.1, 'subsample': 0.7}


In [41]:
categorical_features = ["building_id", "site_id", "meter", "primary_use",'dayofweek','hour']


lgb_train = lgb.Dataset(X_train, y_train, categorical_feature = categorical_features)

params = {
    
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': "rmlse",
    'feature_fraction': 0.9,
    'learning_rate': 0.05,
    'min_child_samples': 553,
    'num_leaves': 1092,
    'reg_alpha': 0,
    'reg_lambda': 0.1,
    'subsample': 0.7
}

gbm = lgb.train(params, lgb_train)



In [43]:
categorical_features = ["building_id", "site_id", "meter", "primary_use",'dayofweek','hour']


gbm = lgb.LGBMRegressor(boosting_type='gbdt',
                       subsample=0.7,
                       learning_rate=0.05,
                       min_child_samples=553,
                       num_leaves=1092,
                       reg_alpha=0,
                       reg_lambda=0.1,
                       colsample_bytree=0.9,
                       random_state=314)

# THIS -RMLSE
#{'feature_fraction': 0.9, 
# 'learning_rate': 0.05, 
# 'min_child_samples': 553, 
# 'num_leaves': 1092, 'reg_alpha': 0, 
# 'reg_lambda': 0.1, 'subsample': 0.7}


gbm.fit(X_train, y_train, categorical_feature=categorical_features)

New categorical_feature is ['building_id', 'dayofweek', 'hour', 'meter', 'primary_use', 'site_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=553, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=1092, objective=None,
              random_state=314, reg_alpha=0, reg_lambda=0.1, silent=True,
              subsample=0.7, subsample_for_bin=200000, subsample_freq=0)

In [54]:
X_test.head()

NameError: name 'X_test' is not defined

In [55]:
X_test = test_df.drop(['row_id'],axis=1)
gbm_results = np.expm1(gbm.predict(X_test))

len(gbm_results)

41697600

In [56]:
row_ids = test_df['row_id']

results_df = pd.DataFrame({"row_id": row_ids, "meter_reading":gbm_results})
results_df.head()

Unnamed: 0,row_id,meter_reading
0,0,188.450306
1,1,96.434727
2,2,8.194458
3,3,304.30025
4,4,1399.047062


In [57]:
results_df.to_csv('results_df_rmsle_.csv',index=False)
results.to_csv('rsCV_result_rmsle_.csv',index=False)