In [1]:
import gc
import os
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from IPython.core.display import display, HTML

from math import sqrt
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')




# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

# --- models ---
#from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
#import catboost as cb

In [2]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def memory_reduction(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Handling per column dtype:

    * datetime and categorical columns are left untouched;
    * ``object`` columns are converted to ``category``;
    * signed / unsigned integer columns are downcast to the narrowest
      signed / unsigned integer type whose range contains the column's
      min and max (bounds inclusive);
    * float columns are downcast to ``float32`` (or ``float16`` when
      ``use_float16`` is true) if the value range fits;
    * every other dtype (bool, timedelta, complex, pandas extension
      dtypes, ...) is left untouched.

    A column is never converted to a type that is as wide as, or wider
    than, the one it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default False
        Allow ``float16`` as a target for float columns. Off by default
        because the feather format does not support float16.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            # skip datetime type or categorical type
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Checking the
        # dtype kind (instead of the dtype's string prefix) keeps unsigned
        # integers and bools out of the float branch, and skips dtypes whose
        # min/max cannot be compared with numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # No narrower type is left; keep the column as it is.
                break
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such a column keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'
          .format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [3]:
# Load the merged training table from a feather file. The path is relative to
# the notebook's working directory (presumably a Kaggle input dataset — confirm).
merged_train = pd.read_feather('../input/energy-feature-engineering-2/merged_train.feather')

In [4]:
# sort_values() and reset_index() return new frames, so the result has to be
# assigned back or the sort is silently discarded. A stable sort (mergesort)
# keeps the existing row order within each timestamp.
merged_train = (
    merged_train
    .sort_values("timestamp", kind="mergesort")
    .reset_index(drop=True)
)
merged_train

Unnamed: 0,index,building_id,meter,timestamp,meter_reading_log1p,weekend,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,IsHoliday,month_group,horizsolar,Sensible_Heat,relative_humidity
0,0,105,0,2016-01-01 00:00:00,3.190624,4,1,0,10.832181,3.8,0.0,2.4,,1.0,1,0.0,0.040676,90.575539
1,1,106,0,2016-01-01 00:00:00,0.318163,4,1,0,8.589514,3.8,0.0,2.4,,1.0,1,0.0,0.032254,90.575539
2,2,106,3,2016-01-01 00:00:00,0.000000,4,1,0,8.589514,3.8,0.0,2.4,,1.0,1,0.0,0.032254,90.575539
3,3,107,0,2016-01-01 00:00:00,5.171529,4,1,0,11.487947,3.8,0.0,2.4,,1.0,1,0.0,0.043138,90.575539
4,4,108,0,2016-01-01 00:00:00,4.524668,4,1,0,11.309352,3.8,0.0,2.4,,1.0,1,0.0,0.042468,90.575539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19852315,19852315,1444,0,2016-12-31 23:00:00,2.277267,5,15,3,9.884305,,,,,,3,0.0,,
19852316,19852316,1445,0,2016-12-31 23:00:00,1.762159,5,15,0,8.366138,,,,,,3,0.0,,
19852317,19852317,1446,0,2016-12-31 23:00:00,0.000000,5,15,3,9.329545,,,,,,3,0.0,,
19852318,19852318,1447,0,2016-12-31 23:00:00,5.078761,5,15,1,10.301458,,,,,,3,0.0,,


In [5]:
# List the column names of the loaded frame.
merged_train.columns

Index(['index', 'building_id', 'meter', 'timestamp', 'meter_reading_log1p',
       'weekend', 'site_id', 'primary_use', 'square_feet', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'IsHoliday',
       'month_group', 'horizsolar', 'Sensible_Heat', 'relative_humidity'],
      dtype='object')

In [6]:
# Derive the hour-of-day feature from the timestamp column (requires a
# datetime-typed column, since it goes through the .dt accessor).
merged_train["hour"] = merged_train["timestamp"].dt.hour #converting timestamp to hour
# Month and day-of-week features are currently disabled.
#merged_train["month"] = merged_train["timestamp"].dt.month
#merged_train["dayofweek"] = merged_train["timestamp"].dt.dayofweek

In [7]:
# Remove the leftover 'index' column and the raw timestamp column.
merged_train = merged_train.drop(columns=['index', "timestamp"])

In [8]:
# Check the remaining columns: 'index' and 'timestamp' were dropped and
# 'hour' was added in the preceding cells.
merged_train.columns

Index(['building_id', 'meter', 'meter_reading_log1p', 'weekend', 'site_id',
       'primary_use', 'square_feet', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'IsHoliday', 'month_group',
       'horizsolar', 'Sensible_Heat', 'relative_humidity', 'hour'],
      dtype='object')

In [9]:

# Cross-validation configuration.
folds = 6
seed = 555
shuffle = False
# random_state only has an effect when shuffling; recent scikit-learn versions
# raise ValueError if it is set while shuffle=False, so pass it conditionally.
kf = KFold(n_splits=folds, shuffle=shuffle, random_state=seed if shuffle else None)

In [10]:
# Only the last fold's split is used as the train/validation partition.
# Advance through the splits without building a frame for each one, then
# materialise the train/validation frames once from the final index pair.
for train_idx, valid_idx in kf.split(merged_train):
    pass
train_data = merged_train.iloc[train_idx, :]
valid_data = merged_train.iloc[valid_idx, :]


In [11]:
# (rows, columns) of the training partition.
train_data.shape

(16543600, 17)

In [12]:
# (rows, columns) of the validation partition.
valid_data.shape

(3308720, 17)

In [13]:
# Drop the reference to the full frame; the cells below only use
# train_data and valid_data.
del merged_train

In [14]:
# Split the validation partition into features (xx_valid) and target
# (yy_valid), then release the combined frame.
xx_valid = valid_data.drop(columns=['meter_reading_log1p'])
yy_valid = valid_data.loc[:, 'meter_reading_log1p']
del valid_data

In [15]:
# Preview the validation feature matrix (the target column has been removed).
xx_valid

Unnamed: 0,building_id,meter,weekend,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,IsHoliday,month_group,horizsolar,Sensible_Heat,relative_humidity,hour
16543600,259,0,2,2,0,11.257646,23.299999,5.0,2.8,0.0,0.0,3,0.0,0.030696,26.154922,20
16543601,259,1,2,2,0,11.257646,23.299999,5.0,2.8,0.0,0.0,3,0.0,0.030696,26.154922,20
16543602,260,0,2,2,0,11.751832,23.299999,5.0,2.8,0.0,0.0,3,0.0,0.032043,26.154922,20
16543603,260,1,2,2,0,11.751832,23.299999,5.0,2.8,0.0,0.0,3,0.0,0.032043,26.154922,20
16543604,260,3,2,2,0,11.751832,23.299999,5.0,2.8,0.0,0.0,3,0.0,0.032043,26.154922,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19852315,1444,0,5,15,3,9.884305,,,,,,3,0.0,,,23
19852316,1445,0,5,15,0,8.366138,,,,,,3,0.0,,,23
19852317,1446,0,5,15,3,9.329545,,,,,,3,0.0,,,23
19852318,1447,0,5,15,1,10.301458,,,,,,3,0.0,,,23


In [16]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm
import joblib
import pickle
#filename = 'regressor0.sav'
#pickle.dump(regressor, open(filename, 'wb')) 

In [17]:
import lightgbm as lgb

# Bagging configuration.
n_bags = 15            # number of bootstrap models to train
bag_size = 12000000    # rows drawn with replacement for each model

valid_pred=[]
#test_pred=[]
for i in tqdm(range(n_bags)):
    # Bootstrap sample of the training partition. Seeding each draw with
    # seed + i makes the run reproducible while every bag still differs.
    df=train_data.sample(n=bag_size,replace = True,random_state=seed + i)

    y_train_fit=df['meter_reading_log1p']
    train_fit=df.drop(['meter_reading_log1p'], axis=1)

    #device = 'gpu',gpu_device_id= 1,
    regressor=lgb.LGBMRegressor(num_leaves=255,feature_fraction = 0.9,reg_lambda= 2,metric="rmse",learning_rate = 0.05,boosting_type='gbdt',n_estimators=1500,max_depth=13,objective='regression',n_jobs=-1)

    regressor.fit(train_fit,y_train_fit)

    # Persist the fitted model; the context manager closes the file handle
    # even if pickling fails.
    filename = 'lgb_reg'+ str(i)+'.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(regressor, model_file)
    predict_valid =  regressor.predict(xx_valid)

    #predict_test =  regressor.predict(X_test)
    # RMSE of this bag on the hold-out partition.
    k=sqrt(mean_squared_error(yy_valid, predict_valid))
    print(k)

    valid_pred.append(predict_valid)
    #test_pred.append(predict_test)

    # Release the per-bag objects before the next iteration. The prediction
    # array itself stays alive through valid_pred.
    del train_fit,regressor, k ,df,y_train_fit,predict_valid
     
                                                                                        

  7%|▋         | 1/15 [21:49<5:05:28, 1309.19s/it]

0.9807582780228233


 13%|█▎        | 2/15 [43:26<4:42:52, 1305.56s/it]

0.9805510557813775


 20%|██        | 3/15 [1:05:04<4:20:38, 1303.25s/it]

0.9812798430670716


 27%|██▋       | 4/15 [1:26:44<3:58:44, 1302.24s/it]

0.977617236415025


 33%|███▎      | 5/15 [1:48:31<3:37:17, 1303.76s/it]

0.9812057491660351


 40%|████      | 6/15 [2:10:10<3:15:20, 1302.29s/it]

0.9776525785212831


 47%|████▋     | 7/15 [2:32:12<2:54:25, 1308.21s/it]

0.9767983274020188


 53%|█████▎    | 8/15 [2:53:50<2:32:16, 1305.28s/it]

0.9780162432015006


 60%|██████    | 9/15 [3:15:34<2:10:29, 1304.95s/it]

0.9817892753285583


 67%|██████▋   | 10/15 [3:37:35<1:49:08, 1309.72s/it]

0.9768269633545543


 73%|███████▎  | 11/15 [3:59:06<1:26:55, 1303.99s/it]

0.9781237180446286


 80%|████████  | 12/15 [4:20:50<1:05:11, 1303.99s/it]

0.9777316651633127


 87%|████████▋ | 13/15 [4:42:45<43:34, 1307.39s/it]  

0.9724930386082853


 93%|█████████▎| 14/15 [5:04:30<21:46, 1306.72s/it]

0.9785885459327468


100%|██████████| 15/15 [5:26:21<00:00, 1305.42s/it]

0.974180997943618





In [18]:
import joblib
# Save the list of per-model validation predictions and the matching targets.
# joblib.dump returns the list of written file names; the last call's return
# value is what the cell displays.
joblib.dump(valid_pred, 'valid_pred_nan15__15lgb.pkl') 
joblib.dump(yy_valid, 'yy__valid_nan15_lgb.pkl') 
#joblib.dump(test_pred, 'test_pred2__100lgb.pkl')        

['yy__valid_nan15_lgb.pkl']