In [22]:
import pandas as pd
import numpy as np
import os
import glob
import random

import warnings
warnings.filterwarnings("ignore")

## Baseline

In [23]:
train = pd.read_csv('./data/train/train.csv')

In [24]:
submission = pd.read_csv('./data/sample_submission.csv')

In [25]:
def cal_ghi(df):
    df['GHI']=0
    for i in range(len(df)):
        hour = df.loc[i,'Hour']
        if 8 <= hour <= 16:
            degree = (hour-7) * np.pi/18
            GHI = df.loc[i,'DNI'] + df.loc[i,'DHI']*(np.cos(90-degree))  # DHI +DNI*cos(Θ)
            if GHI<0:
                GHI=0
            df.loc[i,'GHI']=GHI

In [26]:
# data를 넣어주면 
def preprocess_data(data, is_train=True):
    
    temp = data.copy()
    temp = temp[['Hour', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T']]

    if is_train==True:          
    
        temp['Target1'] = temp['TARGET'].shift(-48).fillna(method='ffill')   # 뒤에서 48개의 값을 지우고 가장 마지막 값으로 채운다
        temp['Target2'] = temp['TARGET'].shift(-48*2).fillna(method='ffill') # 뒤에서 96개의 값을 지우고 가장 마지막 값으로 채운다
        temp = temp.dropna()  # 결측치 제거
        
        return temp.iloc[:-96] # 아 이건 예측값이 0이 되버리니까 제거해버리는구나 ㅇㅋㅇㅋ

    elif is_train==False:
        
        temp = temp[['Hour', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T']]
                              
        return temp.iloc[-48:, :]


df_train = preprocess_data(train)
df_train.iloc[:48]

Unnamed: 0,Hour,TARGET,DHI,DNI,WS,RH,T,Target1,Target2
0,0,0.0,0,0,1.5,69.08,-12,0.0,0.0
1,0,0.0,0,0,1.5,69.06,-12,0.0,0.0
2,1,0.0,0,0,1.6,71.78,-12,0.0,0.0
3,1,0.0,0,0,1.6,71.75,-12,0.0,0.0
4,2,0.0,0,0,1.6,75.2,-12,0.0,0.0
5,2,0.0,0,0,1.5,69.29,-11,0.0,0.0
6,3,0.0,0,0,1.5,72.56,-11,0.0,0.0
7,3,0.0,0,0,1.4,72.55,-11,0.0,0.0
8,4,0.0,0,0,1.3,74.62,-11,0.0,0.0
9,4,0.0,0,0,1.3,74.61,-11,0.0,0.0


In [27]:
cal_ghi(df_train)

In [28]:
df_train = df_train[['TARGET', 'WS', 'RH', 'T','Target1','Target2']]

In [29]:
df_test = []

for i in range(81):
    file_path = './data/test/' + str(i) + '.csv'
    temp = pd.read_csv(file_path)
    temp = preprocess_data(temp, is_train=False)
    df_test.append(temp)

X_test = pd.concat(df_test)
X_test.shape

(3888, 7)

In [32]:
df_train.columns

Index(['TARGET', 'WS', 'RH', 'T', 'Target1', 'Target2'], dtype='object')

In [33]:
X_test

Unnamed: 0,TARGET,WS,RH,T
288,0.0,0.8,80.92,-2.8
289,0.0,0.9,81.53,-2.9
290,0.0,1.0,79.91,-3.0
291,0.0,0.9,79.91,-3.0
292,0.0,0.9,77.20,-3.0
...,...,...,...,...
331,0.0,0.8,63.35,13.7
332,0.0,0.7,64.82,13.1
333,0.0,0.7,66.10,12.8
334,0.0,0.6,67.64,12.4


In [35]:
from sklearn.model_selection import train_test_split
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(df_train.iloc[:, :-2], df_train.iloc[:, -2], test_size=0.3, random_state=0)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(df_train.iloc[:, :-2], df_train.iloc[:, -1], test_size=0.3, random_state=0)

In [37]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [38]:
from lightgbm import LGBMRegressor

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    
    # (a) Modeling  
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.027, subsample=0.7)                   
                         
                         
    model.fit(X_train, Y_train, eval_metric = ['quantile'], 
          eval_set=[(X_valid, Y_valid)], early_stopping_rounds=300, verbose=500)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [39]:
# Target 예측

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):

    LGBM_models=[]
    LGBM_actual_pred = pd.DataFrame()

    for q in quantiles:
        print(q)
        pred , model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        LGBM_actual_pred = pd.concat([LGBM_actual_pred,pred],axis=1)

    LGBM_actual_pred.columns=quantiles
    
    return LGBM_models, LGBM_actual_pred

In [35]:
# Target1
models_1, results_1 = train_data(X_train_1, Y_train_1, X_valid_1, Y_valid_1, X_test)
results_1.sort_index()[:48]

0.1
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[91]	valid_0's quantile: 1.45668
0.2
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[129]	valid_0's quantile: 2.36278
0.3
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[181]	valid_0's quantile: 2.81087
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.94861
Early stopping, best iteration is:
[222]	valid_0's quantile: 2.93696
0.5
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.85568
Early stopping, best iteration is:
[261]	valid_0's quantile: 2.84832
0.6
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.61713
Early stopping, best iteration is:
[277]	valid_0's quantile: 2.61197
0.7
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantil

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Target2
models_2, results_2 = train_data(X_train_2, Y_train_2, X_valid_2, Y_valid_2, X_test)
results_2.sort_index()[:48]

0.1
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[136]	valid_0's quantile: 1.47208
0.2
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.40072
Early stopping, best iteration is:
[253]	valid_0's quantile: 2.38952
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.87877
Early stopping, best iteration is:
[226]	valid_0's quantile: 2.86344
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 3.0084
Early stopping, best iteration is:
[238]	valid_0's quantile: 2.99554
0.5
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.93617
Early stopping, best iteration is:
[281]	valid_0's quantile: 2.92691
0.6
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.70794
Early stopping, best iteration is:
[289]	valid_0's quantile: 2.7058
0.7
Training until val

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
submission

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
submission.to_csv('./data/t,rh,ws,target.csv', index=False)