In [1]:
import gc
import os
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from IPython.core.display import display, HTML

from math import sqrt
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')



# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

# --- models ---
#from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
#import catboost as cb

In [2]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def memory_reduction(df, use_float16=False):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    Per-column rules, chosen from the column's NumPy dtype kind:

    * signed / unsigned integers: cast to the narrowest integer type of the
      same signedness whose range contains the column's min and max
      (bounds are inclusive, so e.g. 127 still fits ``int8``).
    * floats: cast to the first of ``float16`` (only when ``use_float16`` is
      true), ``float32``, ``float64`` whose finite range contains the
      column's min and max. This can lose precision, and a ``float16``
      column is widened to ``float32`` when ``use_float16`` is false.
      Columns whose min/max are NaN or infinite are left unchanged.
    * ``object``: converted to ``category``.
    * everything else (bool, datetime, timedelta, categorical and other
      pandas extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default False
        Allow ``float16`` as a target type for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float32, np.float64)
    if use_float16:
        float_types = (np.float16,) + float_types

    for col in df.columns:
        col_type = df[col].dtype

        # Categorical, tz-aware datetime and nullable dtypes are pandas
        # extension dtypes, not NumPy dtypes: leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Dispatch on the dtype kind instead of the dtype's name so that
        # unsigned integers are not mistaken for floats.
        if col_type.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        else:
            # bool, datetime64, timedelta64, complex, ...: nothing to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            limits = limits_of(candidate)
            # NaN or infinite extremes fail every comparison, so such
            # columns fall through the loop and keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                if col_type != candidate:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'
          .format(end_mem))
    # Guard the percentage against a frame that reports zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [3]:
# Load the merged test-set feature table from the feather file under ../input.
merged_test = pd.read_feather('../input/energy-feature-engineering-2/merged_test.feather')

In [4]:
merged_test.shape

(41697600, 18)

In [5]:
# Derive the hour-of-day feature from the timestamp column (the timestamp
# column itself is dropped in the next cell).
merged_test["hour"] = merged_test["timestamp"].dt.hour

In [6]:
# Remove the columns that are not passed to the models. After this cell
# row_id is no longer available from merged_test.
merged_test = merged_test.drop(['index','row_id','timestamp'],axis=1)

In [7]:
merged_test.columns

Index(['building_id', 'meter', 'weekend', 'site_id', 'primary_use',
       'square_feet', 'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'IsHoliday', 'month_group', 'horizsolar',
       'Sensible_Heat', 'relative_humidity', 'hour'],
      dtype='object')

In [8]:
# memory_reduction rewrites the columns of the frame it is given and returns
# that same frame, so X_test and merged_test refer to one object here.
X_test = memory_reduction(merged_test,use_float16=False)

Memory usage of dataframe is 2028.06 MB
Memory usage after optimization is: 1749.70 MB
Decreased by 13.7%


In [9]:
# Only the merged_test name is removed; the frame stays alive through X_test,
# which was bound to the object returned by memory_reduction.
del merged_test

In [10]:
X_test.shape

(41697600, 16)

In [11]:
X_test.columns

Index(['building_id', 'meter', 'weekend', 'site_id', 'primary_use',
       'square_feet', 'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'IsHoliday', 'month_group', 'horizsolar',
       'Sensible_Heat', 'relative_humidity', 'hour'],
      dtype='object')

In [12]:
def pred(X_test, models, batch_size=2000000):
    """Run ``models.predict`` over ``X_test`` in consecutive row batches.

    ``X_test`` is sliced as ``X_test[start:stop]`` in chunks of at most
    ``batch_size`` rows; each chunk's predictions are written into the
    matching positions of a zero-initialised float array of length
    ``X_test.shape[0]``, which is returned. The number of batches is
    printed before the loop starts.
    """
    n_rows = X_test.shape[0]
    # Ceiling division: the last batch may be shorter than batch_size.
    iterations = (n_rows + batch_size - 1) // batch_size
    print('iterations', iterations)

    y_test_pred_total = np.zeros(n_rows)

    for k in tqdm(range(iterations)):
        start = k * batch_size
        stop = start + batch_size
        batch_pred = models.predict(X_test[start:stop])
        y_test_pred_total[start:stop] += batch_pred

    return y_test_pred_total

In [13]:
import pickle

# test_pred collects one prediction array per saved base model (models 0-7),
# in model order.
test_pred = []
for i in tqdm(range(0,8)):
    print("predicting model :",i)

    filename = "../input/training-15modelsforstacking/lgb_reg"+ str(i) +".sav"
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load model files that come from a trusted source.
    # The with-block closes the file handle as soon as the model is loaded.
    with open(filename,'rb') as model_file:
        regressor = pickle.load(model_file)

    # Predict through pred() (row batches) rather than a single
    # regressor.predict(X_test) call over the whole frame.
    predict_test = pred(X_test,regressor)
    test_pred.append(predict_test)
       
                   

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

predicting model : 0
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


predicting model : 1
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


predicting model : 2
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


predicting model : 3
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


predicting model : 4
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


predicting model : 5
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


predicting model : 6
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))


predicting model : 7
iterations 21


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))





In [14]:
import joblib      
# Persist the list of per-model prediction arrays to the working directory.
# NOTE(review): the file name says "1to7", but test_pred is filled by a loop
# over range(0,8), i.e. models 0-7 - confirm what downstream loaders expect
# before renaming.
joblib.dump(test_pred, 'test_pred_1to7_lgb.pkl')                            

['test_pred_1to7_lgb.pkl']

In [15]:
import joblib 
#valid_pred = joblib.load('../input/training-10models/newvalid_pred_new10__10lgb.pkl') 
#yy_valid =joblib.load('../input/training-10models/newyy__valid_new10_lgb.pkl')
#test_pred =joblib.load('../input/testing-10models-1/newtest_pred_new10__10lgb.pkl')

In [16]:
#print(len(test_pred))  

In [17]:
#valid_pre = np.array(valid_pred)
#valid_pre  = np.transpose(valid_pre)
#valid_pre.shape

In [18]:
#import lightgbm as lgb

#regressor_meta=lgb.LGBMRegressor(num_leaves=31,boosting_type='gbdt', n_estimators=500,max_depth=5,objective='regression',n_jobs=-1)

#regressor_meta.fit(valid_pre,yy_valid)

In [19]:
#test_pred = np.array(test_pred)
#test_pre  = np.transpose(test_pred)
#test_pre.shape

In [20]:
#predict_y =  regressor_meta.predict(test_pre)

In [21]:
#pred = np.expm1(predict_y)         

In [22]:
#len(pred)

In [23]:
#sample_submission = pd.read_feather("../input/ashrae-feather-format-for-fast-loading/sample_submission.feather")
#sample_submission = memory_reduction(sample_submission)            

In [24]:
#del test_pred,valid_pred,predict_y  

In [25]:
#row_ids = merged_test["row_id"]

In [26]:
#sample_submission = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(pred, 0, a_max=None)})
#sample_submission['meter_reading'] = sample_submission['meter_reading'].astype('float32')
#sample_submission['row_id'] = sample_submission['row_id'].astype('int32')

In [27]:
#sample_submission.head(10) 

In [28]:
#sample_submission.to_csv('testing_10models.csv',index=False)

In [29]:
from IPython.display import FileLink
#FileLink(r'./testing_10models.csv')                   

In [30]:
#sample_submission.shape                                                                  