In [None]:
import pandas as pd 
import numpy as np
from tqdm import tqdm_notebook 
import gc

In [None]:
import os

# Root directory of the competition data. It defaults to the original author's
# local checkout; set the ENERGY_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
path = os.environ.get('ENERGY_DATA_DIR', 'C:/Users/choco/Desktop/Github/Dacon/Energy Bigdata Utilization/')
# os.path.join tolerates a root given with or without a trailing separator.
input_dir = os.path.join(path, "input")

# Every input file is read from <path>/input/.
train = pd.read_csv(os.path.join(input_dir, "train2.csv"))
test = pd.read_csv(os.path.join(input_dir, "test2.csv"))
submission = pd.read_csv(os.path.join(input_dir, "submission_1002.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "test_df.csv"))

## Make a submission

In [None]:
import datetime

# The 24 hourly timestamps of the forecast day: 2018-07-01 00:00 .. 23:00.
date_list_day = [datetime.datetime(2018,7,1,0) + datetime.timedelta(hours=x) for x in range(24)]
unique_meters = test_df['meter_id'].unique()

# Build both columns in one vectorised step instead of growing arrays with
# np.append inside a loop, which re-copies the whole array on every iteration.
# Row layout is unchanged: for each meter, in order of first appearance in
# test_df, the full block of timestamps.
# The explicit datetime64 dtype means the `time` column does not rely on
# pandas inferring datetimes from an object array.
date_list = np.tile(np.array(date_list_day, dtype='datetime64[ns]'), len(unique_meters))
meter_id = np.repeat(unique_meters, len(date_list_day))

sub_df = pd.DataFrame()
sub_df['time'] = date_list
sub_df['meter_id'] = meter_id

In [None]:
import datetime

# The 240 hourly timestamps covering the 10 forecast days that start at
# 2018-07-01 00:00.
date_list_day = [datetime.datetime(2018,7,1,0) + datetime.timedelta(hours=x) for x in range(24*10)]
unique_meters = test_df['meter_id'].unique()

# Build both columns in one vectorised step instead of growing arrays with
# np.append inside a loop, which re-copies the whole array on every iteration.
# Row layout is unchanged: for each meter, in order of first appearance in
# test_df, the full block of timestamps.
# The explicit datetime64 dtype matters here: this frame is never passed
# through pd.to_datetime, yet `.dt` accessors are used on its `time` column later.
date_list = np.tile(np.array(date_list_day, dtype='datetime64[ns]'), len(unique_meters))
meter_id = np.repeat(unique_meters, len(date_list_day))

sub_df_day = pd.DataFrame()
sub_df_day['time'] = date_list
sub_df_day['meter_id'] = meter_id

In [None]:
import datetime

# The 24 * 153 hourly timestamps covering the 153 days that start at
# 2018-07-01 00:00 (July through November 2018).
date_list_month = [datetime.datetime(2018,7,1,0) + datetime.timedelta(hours=x) for x in range(24 * 153)]
unique_meters = test_df['meter_id'].unique()

# Build both columns in one vectorised step instead of growing arrays with
# np.append inside a loop, which re-copies the whole array on every iteration.
# Row layout is unchanged: for each meter, in order of first appearance in
# test_df, the full block of timestamps.
# The explicit datetime64 dtype means the `time` column does not rely on
# pandas inferring datetimes from an object array.
date_list = np.tile(np.array(date_list_month, dtype='datetime64[ns]'), len(unique_meters))
meter_id = np.repeat(unique_meters, len(date_list_month))

sub_df_month = pd.DataFrame()
sub_df_month['time'] = date_list
sub_df_month['meter_id'] = meter_id

## Feature Engineering

In [None]:
# Add calendar features, in place, to the historical frame and to the hourly
# submission skeleton. `time` is coerced to datetime first so the `.dt`
# accessor is available.
for df in (test_df, sub_df):
    df['time'] = pd.to_datetime(df['time'])
    timestamps = df['time'].dt
    # Each feature column is named after the datetime component it holds.
    for component in ('year', 'month', 'day', 'hour', 'weekday'):
        df[component] = getattr(timestamps, component)

In [None]:
# Stack history on top of the hourly submission skeleton so the lag can reach
# from the skeleton rows back into the history.
n_history_rows = test_df.shape[0]
all_df = pd.concat([test_df, sub_df], axis=0, sort=False)

# y_hr: the value of `y` 24 rows earlier within the same meter.
# NOTE(review): this is a 24-hour lag only if each meter's rows are hourly,
# contiguous and time-ordered — confirm against test_df.
lagged_y = all_df.groupby('meter_id')['y'].shift(periods=24)
all_df['y_hr'] = lagged_y

# Split the stacked frame back into its two parts by position.
test_df_hour = all_df.iloc[:n_history_rows]
sub_df_hour = all_df.iloc[n_history_rows:]

In [None]:
# Stack history on top of the 10-day submission skeleton so the lag can reach
# from the skeleton rows back into the history.
n_history_rows = test_df.shape[0]
all_df = pd.concat([test_df, sub_df_day], axis=0, sort=False)

# y_hr: the value of `y` 24 * 10 rows earlier within the same meter.
# NOTE(review): this is a 10-day lag only if each meter's rows are hourly,
# contiguous and time-ordered — confirm against test_df.
lagged_y = all_df.groupby('meter_id')['y'].shift(periods=24 * 10)
all_df['y_hr'] = lagged_y

# Split the stacked frame back into its two parts by position.
# sub_df_day is rebound to the lag-augmented slice.
test_df_day = all_df.iloc[:n_history_rows]
sub_df_day = all_df.iloc[n_history_rows:]

In [None]:
# Stack history on top of the 153-day submission skeleton so the lag can reach
# from the skeleton rows back into the history.
n_history_rows = test_df.shape[0]
all_df = pd.concat([test_df, sub_df_month], axis=0, sort=False)

# y_hr: the value of `y` 24 * 153 rows earlier within the same meter.
# NOTE(review): this is a 153-day lag only if each meter's rows are hourly,
# contiguous and time-ordered — confirm against test_df.
lagged_y = all_df.groupby('meter_id')['y'].shift(periods=24 * 153)
all_df['y_hr'] = lagged_y

# Split the stacked frame back into its two parts by position.
# sub_df_month is rebound to the lag-augmented slice.
test_df_month = all_df.iloc[:n_history_rows]
sub_df_month = all_df.iloc[n_history_rows:]

In [None]:
# Drop the last stacked frame and ask the garbage collector to run.
del all_df
gc.collect()

## External data - weather 

## Make a Validation Set

In [None]:
# Split the hourly frame that carries the 24-row lag feature `y_hr`.
# The original split the raw `test_df`, which silently dropped `y_hr` from the
# hourly model and left `test_df_hour` unused. The day and month splits already
# use their lag-augmented frames, so this makes the three consistent.
# Rows before the cutoff train the model; rows at or after it validate it.
hour_cutoff = '2018-06-30 00:00:00'
train_hour = test_df_hour[test_df_hour['time'] < hour_cutoff].reset_index(drop=True)
valid_hour = test_df_hour[test_df_hour['time'] >= hour_cutoff].reset_index(drop=True)

In [None]:
# Time-based hold-out for the day model: rows before the cutoff train the
# model, rows at or after it validate it.
day_cutoff = '2018-06-21 00:00:00'
is_before_day_cutoff = test_df_day['time'] < day_cutoff
is_from_day_cutoff = test_df_day['time'] >= day_cutoff

train_day = test_df_day[is_before_day_cutoff].reset_index(drop=True)
valid_day = test_df_day[is_from_day_cutoff].reset_index(drop=True)

In [None]:
# Time-based hold-out for the month model: rows before the cutoff train the
# model, rows at or after it validate it.
month_cutoff = '2018-01-01 00:00:00'
is_before_month_cutoff = test_df_month['time'] < month_cutoff
is_from_month_cutoff = test_df_month['time'] >= month_cutoff

train_month = test_df_month[is_before_month_cutoff].reset_index(drop=True)
valid_month = test_df_month[is_from_month_cutoff].reset_index(drop=True)

## LightGBM - Hour

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# LightGBM settings used by the hour and day models: gradient-boosted trees
# on a regression objective, evaluated with MAPE.
params = dict(
    num_leaves=20,
    objective='regression',
    max_depth=-1,
    learning_rate=0.001,
    boosting_type='gbdt',
    bagging_seed=11,
    metric='mape',
    verbosity=-1,
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
)

In [None]:
# Every column except the timestamp and the target is a model input.
features = [column for column in train_hour.columns if column not in ('time', 'y')]

# Training matrix and target.
X = train_hour[features].reset_index(drop=True)
y = train_hour['y']

# Validation matrix and target.
V = valid_hour[features].reset_index(drop=True)
y_valid = valid_hour['y']

# Rows to predict for the hourly submission.
sub = sub_df_hour[features].reset_index(drop=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace string meter ids with integer codes in the train, validation and
# submission matrices. The encoder is fitted on the train and submission ids.
for col in tqdm_notebook(['meter_id']):
    if X[col].dtype != 'object':
        continue  # already numeric, nothing to encode

    train_ids = list(X[col].astype(str).values)
    sub_ids = list(sub[col].astype(str).values)
    valid_ids = list(V[col].astype(str).values)

    le = LabelEncoder()
    le.fit(train_ids + sub_ids)

    X[col] = le.transform(train_ids)
    sub[col] = le.transform(sub_ids)
    V[col] = le.transform(valid_ids)

In [None]:
from time import time

# Book-keeping containers: validation scores and the feature-name table.
mape = []
feature_importances = pd.DataFrame()
feature_importances['feature'] = X.columns

training_start_time = time()
start_time = time()

# Train on the history before the cutoff and monitor MAPE on the hold-out;
# `meter_id` is handled as a categorical feature.
trn_data = lgb.Dataset(X, label=y, categorical_feature=['meter_id'])
val_data = lgb.Dataset(V, label=y_valid, categorical_feature=['meter_id'])
clf = lgb.train(
    params,
    trn_data,
    2500,
    valid_sets=[trn_data, val_data],
    verbose_eval=100,
    early_stopping_rounds=50,
)

# 'valid_1' is the second entry of valid_sets, i.e. the hold-out data.
mape.append(clf.best_score['valid_1']['mape'])

elapsed = datetime.timedelta(seconds=time() - training_start_time)
separator = '-' * 30
print(separator)
print('Training has finished.')
print('Total training time is {}'.format(str(elapsed)))
print(separator)

In [None]:
# Refit on train + validation using the number of rounds found above.
# best_iteration is only populated when early stopping actually fires; fall
# back to the number of rounds that were trained so the refit never asks for
# zero rounds.
best_iter = clf.best_iteration or clf.current_iteration()
clf = lgb.LGBMRegressor(**params, num_boost_round=best_iter)
# Pass categorical_feature explicitly: the validated booster treated
# `meter_id` as categorical, and without this the refit would treat the
# label-encoded ids as an ordinary numeric column.
clf.fit(pd.concat([X,V],axis=0), pd.concat([y,y_valid],axis=0),
        categorical_feature=['meter_id'])

In [None]:
# Predict the hourly submission rows and store the result in their `y` column.
hour_predictions = clf.predict(sub)
sub_df_hour['y'] = hour_predictions

In [None]:
# Pivot the long hourly predictions into one row per meter.
# A single groupby pass replaces one boolean-mask scan of the whole frame per
# meter. sort=False keeps meters in order of first appearance, the same order
# `unique()` produced. The loop variable is no longer called `meter_id`, so it
# does not overwrite the module-level `meter_id` array built earlier.
meter_id_list = []
output_list = []
for meter, predictions in sub_df_hour.groupby('meter_id', sort=False)['y']:
    meter_id_list.append(meter)
    output_list.append(predictions.values)

# First column is the meter id, followed by one column per forecast hour.
submission_hour = pd.concat(
    [pd.DataFrame({'meter_id': meter_id_list}), pd.DataFrame(output_list)], axis=1)
submission_hour.columns = ['meter_id'] + ['X2018_7_1_'+str(i+1)+'h' for i in range(24)]

# Order rows by the integer obtained after dropping the first character of the
# meter id (assumes ids are a one-character prefix plus a number — TODO confirm).
submission_hour['meter_id_2'] = submission_hour['meter_id'].str[1:].astype(int)
# reset_index matches the day and month submission cells.
submission_hour = submission_hour.sort_values(by='meter_id_2').reset_index(drop=True)
del submission_hour['meter_id_2']
submission_hour

## LightGBM - Day

In [None]:
def create_new_columns(name,aggs):
    """Return flat column names for a grouped aggregation.

    For every source column in ``aggs`` and every aggregation listed for it,
    one name of the form ``<name>_<column>_<aggregation>`` is produced, in the
    order the columns and aggregations appear in ``aggs``.
    """
    column_names = []
    for source_column, functions in aggs.items():
        for function in functions:
            column_names.append('_'.join([name, source_column, function]))
    return column_names

# Per-day aggregates: the daily sum of `y` (used as the target downstream) and
# summary statistics of the lag column `y_hr`.
aggs = {
    'y': ['sum'],
    'y_hr': ['sum','max','min','mean','var'],
}
# Disabled: max/min/mean/var aggregates of the weather columns
# '기온(°C)' (temperature) and '풍속(m/s)' (wind speed).

new_columns = create_new_columns('day',aggs)


def _aggregate_by_day(frame):
    """Add a `date` column to `frame` in place and aggregate per (date, meter_id)."""
    frame['date'] = frame['time'].dt.date
    grouped = frame.groupby(['date','meter_id']).agg(aggs)
    # Flatten the (column, aggregation) header into the prepared names.
    grouped.columns = new_columns
    return grouped.reset_index(drop=False)


train_day_group_df = _aggregate_by_day(train_day)
valid_day_group_df = _aggregate_by_day(valid_day)
sub_df_day_group_df = _aggregate_by_day(sub_df_day)

In [None]:
# Model inputs: every aggregated column except the date key and the target.
features = [column for column in train_day_group_df.columns
            if column not in ('date', 'y', 'day_y_sum')]

# Training matrix and target (daily sum of y).
X = train_day_group_df[features].reset_index(drop=True)
y = train_day_group_df['day_y_sum']

# Validation matrix and target.
V = valid_day_group_df[features].reset_index(drop=True)
y_valid = valid_day_group_df['day_y_sum']

# Rows to predict for the daily submission.
sub = sub_df_day_group_df[features].reset_index(drop=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace string meter ids with integer codes in the train, validation and
# submission matrices. The encoder is fitted on the train and submission ids.
for col in tqdm_notebook(['meter_id']):
    if X[col].dtype != 'object':
        continue  # already numeric, nothing to encode

    train_ids = list(X[col].astype(str).values)
    sub_ids = list(sub[col].astype(str).values)
    valid_ids = list(V[col].astype(str).values)

    le = LabelEncoder()
    le.fit(train_ids + sub_ids)

    X[col] = le.transform(train_ids)
    sub[col] = le.transform(sub_ids)
    V[col] = le.transform(valid_ids)

In [None]:
from time import time

# Book-keeping containers: validation scores and the feature-name table.
mape = []
feature_importances = pd.DataFrame()
feature_importances['feature'] = X.columns

training_start_time = time()
start_time = time()

# Train on the daily aggregates before the cutoff and monitor MAPE on the
# hold-out; `meter_id` is handled as a categorical feature.
trn_data = lgb.Dataset(X, label=y, categorical_feature=['meter_id'])
val_data = lgb.Dataset(V, label=y_valid, categorical_feature=['meter_id'])
clf = lgb.train(
    params,
    trn_data,
    2500,
    valid_sets=[trn_data, val_data],
    verbose_eval=100,
    early_stopping_rounds=50,
)

# 'valid_1' is the second entry of valid_sets, i.e. the hold-out data.
mape.append(clf.best_score['valid_1']['mape'])

elapsed = datetime.timedelta(seconds=time() - training_start_time)
separator = '-' * 30
print(separator)
print('Training has finished.')
print('Total training time is {}'.format(str(elapsed)))
print(separator)

In [None]:
# Refit on train + validation using the number of rounds found above.
# best_iteration is only populated when early stopping actually fires; fall
# back to the number of rounds that were trained so the refit never asks for
# zero rounds.
best_iter = clf.best_iteration or clf.current_iteration()
clf = lgb.LGBMRegressor(**params, num_boost_round=best_iter)
# Pass categorical_feature explicitly: the validated booster treated
# `meter_id` as categorical, and without this the refit would treat the
# label-encoded ids as an ordinary numeric column.
clf.fit(pd.concat([X,V],axis=0), pd.concat([y,y_valid],axis=0),
        categorical_feature=['meter_id'])

In [None]:
# Predict the daily totals for the submission rows and store them in `y`.
day_predictions = clf.predict(sub)
sub_df_day_group_df['y'] = day_predictions

In [None]:
# Sanity-check the day-model inputs; show only the first rows instead of
# rendering the whole frame.
sub.head()

In [None]:
# Pivot the long daily predictions into one row per meter.
# A single groupby pass replaces one boolean-mask scan of the whole frame per
# meter. sort=False keeps meters in order of first appearance, the same order
# `unique()` produced. The loop variable is no longer called `meter_id`, so it
# does not overwrite the module-level `meter_id` array built earlier.
meter_id_list = []
output_list = []
for meter, predictions in sub_df_day_group_df.groupby('meter_id', sort=False)['y']:
    meter_id_list.append(meter)
    output_list.append(predictions.values)

# First column is the meter id, followed by one column per forecast day.
submission_day = pd.concat(
    [pd.DataFrame({'meter_id': meter_id_list}), pd.DataFrame(output_list)], axis=1)
submission_day.columns = ['meter_id'] + ['X2018_7_'+str(i+1)+'_d' for i in range(10)]

# Order rows by the integer obtained after dropping the first character of the
# meter id (assumes ids are a one-character prefix plus a number — TODO confirm).
submission_day['meter_id_2'] = submission_day['meter_id'].str[1:].astype(int)
submission_day = submission_day.sort_values(by='meter_id_2').reset_index(drop=True)
del submission_day['meter_id_2']
submission_day

## LightGBM - Month

In [None]:
def create_new_columns(name,aggs):
    """Return flat column names for a grouped aggregation.

    For every source column in ``aggs`` and every aggregation listed for it,
    one name of the form ``<name>_<column>_<aggregation>`` is produced, in the
    order the columns and aggregations appear in ``aggs``.
    """
    column_names = []
    for source_column, functions in aggs.items():
        for function in functions:
            column_names.append('_'.join([name, source_column, function]))
    return column_names

# Per-month aggregates: the monthly sum of `y` (used as the target downstream)
# and summary statistics of the lag column `y_hr`.
aggs = {
    'y': ['sum'],
    'y_hr': ['sum','max','min','mean','var'],
}
# Disabled: max/min/mean/var aggregates of the weather columns
# '기온(°C)' (temperature) and '풍속(m/s)' (wind speed).

new_columns = create_new_columns('month',aggs)


def _aggregate_by_month(frame):
    """Aggregate `frame` per (year_month, meter_id) without modifying it.

    The `year_month` key is derived here from `time` as a 'YYYY-MM' string.
    Nothing in this notebook creates that column, so the submission rows
    either lacked it entirely or carried NaN keys that groupby drops.
    'YYYY-MM' strings sort chronologically, so each meter's rows come out in
    calendar order.
    """
    keyed = frame.assign(
        year_month=pd.to_datetime(frame['time']).dt.strftime('%Y-%m'))
    grouped = keyed.groupby(['year_month','meter_id']).agg(aggs)
    # Flatten the (column, aggregation) header into the prepared names.
    grouped.columns = new_columns
    return grouped.reset_index(drop=False)


train_month_group_df = _aggregate_by_month(train_month)
valid_month_group_df = _aggregate_by_month(valid_month)
sub_df_month_group_df = _aggregate_by_month(sub_df_month)

# Without any submission rows there is nothing to predict downstream.
assert not sub_df_month_group_df.empty, "monthly submission aggregation produced no rows"

In [None]:
# Model inputs: every aggregated column except the period keys and the target.
features = [column for column in train_month_group_df.columns
            if column not in ('date', 'y', 'month_y_sum', 'year_month')]

# Training matrix and target (monthly sum of y).
X = train_month_group_df[features].reset_index(drop=True)
y = train_month_group_df['month_y_sum']

# Validation matrix and target.
V = valid_month_group_df[features].reset_index(drop=True)
y_valid = valid_month_group_df['month_y_sum']

# Rows to predict for the monthly submission.
sub = sub_df_month_group_df[features].reset_index(drop=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace string meter ids with integer codes in the train, validation and
# submission matrices. The encoder is fitted on the train and submission ids.
for col in tqdm_notebook(['meter_id']):
    if X[col].dtype != 'object':
        continue  # already numeric, nothing to encode

    train_ids = list(X[col].astype(str).values)
    sub_ids = list(sub[col].astype(str).values)
    valid_ids = list(V[col].astype(str).values)

    le = LabelEncoder()
    le.fit(train_ids + sub_ids)

    X[col] = le.transform(train_ids)
    sub[col] = le.transform(sub_ids)
    V[col] = le.transform(valid_ids)

In [None]:
# LightGBM settings for the month model. This rebinds `params`: DART boosting
# with depth-3 trees on a regression objective, evaluated with MAPE.
params = dict(
    num_leaves=20,
    objective='regression',
    max_depth=3,
    learning_rate=0.0001,
    boosting_type='dart',
    metric='mape',
    verbosity=-1,
    random_state=47,
)

In [None]:
from time import time

# Book-keeping containers: validation scores and the feature-name table.
mape = []
feature_importances = pd.DataFrame()
feature_importances['feature'] = X.columns

training_start_time = time()
start_time = time()

# Train on the monthly aggregates before the cutoff and monitor MAPE on the
# hold-out; `meter_id` is handled as a categorical feature.
trn_data = lgb.Dataset(X, label=y, categorical_feature=['meter_id'])
val_data = lgb.Dataset(V, label=y_valid, categorical_feature=['meter_id'])
clf = lgb.train(
    params,
    trn_data,
    2500,
    valid_sets=[trn_data, val_data],
    verbose_eval=100,
    early_stopping_rounds=50,
)

# 'valid_1' is the second entry of valid_sets, i.e. the hold-out data.
mape.append(clf.best_score['valid_1']['mape'])

elapsed = datetime.timedelta(seconds=time() - training_start_time)
separator = '-' * 30
print(separator)
print('Training has finished.')
print('Total training time is {}'.format(str(elapsed)))
print(separator)

In [None]:
# Refit on train + validation using the number of rounds found above.
# best_iteration is only populated when early stopping actually fires.
# NOTE(review): the month params use DART boosting, for which LightGBM is
# expected to disable early stopping, leaving best_iteration at 0 — confirm
# for the installed version. Fall back to the number of rounds that were
# trained so the refit never asks for zero rounds.
best_iter = clf.best_iteration or clf.current_iteration()
clf = lgb.LGBMRegressor(**params, num_boost_round=best_iter)
# Pass categorical_feature explicitly: the validated booster treated
# `meter_id` as categorical, and without this the refit would treat the
# label-encoded ids as an ordinary numeric column.
clf.fit(pd.concat([X,V],axis=0), pd.concat([y,y_valid],axis=0),
        categorical_feature=['meter_id'])

In [None]:
# Predict the monthly totals for the submission rows and store them in `y`.
month_predictions = clf.predict(sub)
sub_df_month_group_df['y'] = month_predictions

In [None]:
# Pivot the long monthly predictions into one row per meter.
# A single groupby pass replaces one boolean-mask scan of the whole frame per
# meter. sort=False keeps meters in order of first appearance, the same order
# `unique()` produced. The loop variable is no longer called `meter_id`, so it
# does not overwrite the module-level `meter_id` array built earlier.
meter_id_list = []
output_list = []
for meter, predictions in sub_df_month_group_df.groupby('meter_id', sort=False)['y']:
    meter_id_list.append(meter)
    output_list.append(predictions.values)

# First column is the meter id, followed by one column per forecast month
# (months 7 through 11 of 2018).
submission_month = pd.concat(
    [pd.DataFrame({'meter_id': meter_id_list}), pd.DataFrame(output_list)], axis=1)
submission_month.columns = ['meter_id'] + ['X2018_'+str(i+7)+'_m' for i in range(5)]

# Order rows by the integer obtained after dropping the first character of the
# meter id (assumes ids are a one-character prefix plus a number — TODO confirm).
submission_month['meter_id_2'] = submission_month['meter_id'].str[1:].astype(int)
submission_month = submission_month.sort_values(by='meter_id_2').reset_index(drop=True)
del submission_month['meter_id_2']
submission_month

## Submission

In [None]:
# Join the hourly, daily and monthly forecasts on meter id. Left joins keep
# exactly the meters present in the hourly table.
submission_total = (
    submission_hour
    .merge(submission_day, how='left', on='meter_id')
    .merge(submission_month, how='left', on='meter_id')
)
submission_total.to_csv("submission_total.csv", index=False)