In [None]:
# Mount Google Drive at /content/drive so the project files referenced below can be read (Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os
import glob
import random

import warnings
# Suppress every warning for the whole session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

# Project root on the mounted Drive; later cells build file paths by concatenating onto this string.
data_path = '/content/drive/MyDrive/SolarGen/'

## Baseline

In [None]:
# Read the training table and the sample-submission template from the project folder on Drive.
train = pd.read_csv(data_path + './data/train/train.csv')
submission = pd.read_csv(data_path + './data/sample_submission.csv')

In [None]:
import sys
import matplotlib.pyplot as plt
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, 
    MaxAbsScaler, PowerTransformer
)

class ScalingFactor:
    """Bundle a scaler object with the column names it is meant to be applied to."""

    def __init__(self, scaler, features):
        # Both values are stored untouched; consumers read .scaler and .features.
        self.scaler, self.features = scaler, features

def classify_data_byDHI(data):
    '''
    Split the rows of `data` by their 'DHI' value.

    Rows with DHI below machine epsilon go to the first result, rows with
    DHI at or above machine epsilon go to the second. A row whose DHI
    satisfies neither comparison (e.g. NaN) appears in neither result.

    Returns a tuple (zero_dhi, nonzero_dhi).
    '''
    eps = sys.float_info.epsilon
    dhi = data['DHI']

    below_eps = dhi < eps
    at_or_above_eps = dhi >= eps

    return data[below_eps], data[at_or_above_eps]

def scaling(data, scaler, features, is_train=True):
    """
    Return a copy of `data` in which the `features` columns have been
    passed through `scaler`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed in `features`.
    scaler : object
        Anything exposing ``fit(X)`` and ``transform(X)``.
    features : list of str
        Column names to scale.
    is_train : bool, default True
        When True the scaler is fitted on the selected columns before
        transforming; when False the scaler is used as it is.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's `data` is left unmodified.
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is never written to.
    scaled_frame = data.copy()
    selected = scaled_frame[features]

    # Nothing to fit or transform (no rows or no columns): hand back the copy
    # instead of passing an empty array to the scaler.
    if selected.empty:
        return scaled_frame

    if is_train:
        scaler.fit(selected)
    scaled_frame[features] = scaler.transform(selected)

    return scaled_frame

def preprocess_data(data, scale_factors, is_train=True):
    """
    Select the model columns, build targets (training) or cut the last
    window (test), split by DHI and apply the configured scalers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Hour', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T'.
    scale_factors : iterable
        Objects with `.scaler` and `.features`; each is applied in order to
        both subsets via `scaling`. May be empty (no scaling).
    is_train : bool, default True
        Truthy: add 'Target1' / 'Target2' ('TARGET' shifted 48 and 96 rows
        ahead) and drop the final 96 rows.
        Falsy: keep only the last 48 rows and subtract 288 from the index
        labels.

    Returns
    -------
    tuple of pandas.DataFrame
        (zero_dhi, nonzero_dhi) as produced by `classify_data_byDHI`.
    """
    # Explicit copy: columns are added / the index is rewritten below and the
    # caller's frame must stay untouched.
    temp = data[['Hour', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T']].copy()

    if is_train:
        # Look-ahead targets. The shift leaves NaN in the trailing rows;
        # forward-filling keeps dropna() from discarding them so that the
        # explicit iloc[:-96] below is what removes the rows without a real
        # future value.
        temp['Target1'] = temp['TARGET'].shift(-48).ffill()
        temp['Target2'] = temp['TARGET'].shift(-48 * 2).ffill()
        temp = temp.dropna()
        temp = temp.iloc[:-96]
    else:
        # Last 48 rows only, relabelled by subtracting 48*6.
        # NOTE(review): this yields labels 0..47 only when the input has
        # exactly 48*7 rows with a default integer index - confirm for the
        # test files.
        temp = temp.iloc[-48:].copy()
        temp.index -= 48 * 6

    zero_dhi, nonzero_dhi = classify_data_byDHI(temp)

    # NOTE(review): with is_train=True each scaler is fitted on zero_dhi and
    # then fitted again on nonzero_dhi, so afterwards it only remembers the
    # non-zero fit; a later is_train=False call transforms BOTH subsets with
    # that fit. Harmless while scale_factors is empty - revisit before
    # enabling scalers.
    for factor in scale_factors:
        zero_dhi = scaling(zero_dhi, factor.scaler, factor.features, is_train)
        nonzero_dhi = scaling(nonzero_dhi, factor.scaler, factor.features, is_train)

    return zero_dhi, nonzero_dhi

# Scaler configuration consumed by preprocess_data. The list is empty, so no scaling is applied;
# the commented lines are alternative scaler / column pairings that can be switched on.
scale_factors = []
# scale_factors.append(ScalingFactor(PowerTransformer(), ['DHI', 'DNI']))
# scale_factors.append(ScalingFactor(QuantileTransformer(output_distribution='normal'), ['DNI']))
# scale_factors.append(ScalingFactor(StandardScaler(), ['RH', 'T']))
# scale_factors.append(ScalingFactor(MinMaxScaler(), ['DHI', 'DNI', 'WS', 'RH', 'T']))

# Build the training subsets (DHI zero / DHI non-zero) and report their shapes.
zero_dhi, nonzero_dhi = preprocess_data(train, scale_factors)
print(zero_dhi.shape, nonzero_dhi.shape)

In [None]:
# Visual check of the non-zero-DHI training rows: every column except the first,
# once as per-column line plots and once as 100-bin histograms (one row of 8 panels each).
nonzero_dhi.iloc[:, 1:].plot(subplots=True, layout=(1,8), figsize=(30,4))
nonzero_dhi.iloc[:, 1:].hist(bins = 100, layout=(1,8), figsize=(30,4))

In [None]:
days = 48 * 2
features = ['TARGET', 'DHI', 'DNI']

# Label-based slices (index label <= `days`), taken once and reused for every feature.
zero_head = zero_dhi.loc[:days]
nonzero_head = nonzero_dhi.loc[:days]

for feature in features:
    fig, ax = plt.subplots(figsize=(16,3))
    # Zero-DHI rows in black first, non-zero-DHI rows in yellow on top.
    for subset, colour, tag in ((zero_head, 'k', "DHI = 0"), (nonzero_head, 'y', "DHI != 0")):
        ax.scatter(subset.index, subset[feature], color=colour, label=tag)
    ax.legend()
    ax.set_xlabel('Index', fontsize=16)
    ax.set_ylabel(feature, fontsize=16)

In [None]:
# Distribution of 'Hour' for the zero-DHI rows (black) and non-zero-DHI rows (yellow).
f = plt.figure(figsize=(12,4))
ax = f.add_subplot(111)
# 25 edges -> 24 unit-wide bins [0,1), [1,2), ..., [23,24].
# With only the edges 0..23 there are 23 bins and, because the last histogram
# bin is closed on both sides, values 22 and 23 would be counted together.
# (Assumes 'Hour' holds values in 0..23, as the tick range below suggests.)
hour_edges = list(range(25))
ax.hist(zero_dhi['Hour'], hour_edges, rwidth=0.8, color='k')
ax.hist(nonzero_dhi['Hour'], hour_edges, rwidth=0.8, color='y', alpha=0.8)
ax.set_xticks(list(range(24)))
plt.show()

In [None]:
# Per-file test subsets, collected in file order.
X_zero_dhi_test = []
X_nonzero_dhi_test = []

# Test files are named 0.csv ... 80.csv.
for i in range(81):
    file_path = data_path + './data/test/' + str(i) + '.csv'
    temp = pd.read_csv(file_path)
    # is_train=False: preprocess_data keeps the last 48 rows and shifts their index labels down by 288.
    zero, nonzero = preprocess_data(temp, scale_factors, is_train=False)
    # Offset the labels by 100 per file so that, after concatenation, sorting by index
    # restores "file by file, row by row" order.
    # NOTE(review): this only keeps files apart if the shifted labels stay within 0..99 - confirm file length.
    zero.index += i * 100
    nonzero.index += i * 100
    X_zero_dhi_test.append(zero)
    X_nonzero_dhi_test.append(nonzero)

# One frame per DHI class across all test files.
X_zero_dhi_test_total = pd.concat(X_zero_dhi_test)
X_nonzero_dhi_test_total = pd.concat(X_nonzero_dhi_test)

# Visual check: line plots for the first file, histograms over all files (all columns except the first).
X_nonzero_dhi_test[0].iloc[:, 1:].plot(subplots=True, layout=(1,6), figsize=(30,4))
X_nonzero_dhi_test_total.iloc[:, 1:].hist(bins = 100, layout=(1,6), figsize=(30,4))

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 train/validation split of the non-zero-DHI rows with a fixed seed.
# Inputs are all columns except the last two; the label is the second-to-last column
# for the *_1 split and the last column for the *_2 split (Target1 / Target2 as added by preprocess_data).
# Both calls use random_state=0 on the same frame.
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(nonzero_dhi.iloc[:, :-2], nonzero_dhi.iloc[:, -2], test_size=0.3, random_state=0)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(nonzero_dhi.iloc[:, :-2], nonzero_dhi.iloc[:, -1], test_size=0.3, random_state=0)

In [None]:
# Quantile levels: train_data fits one model per entry and uses the entries as prediction column labels.
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [None]:
from lightgbm import LGBMRegressor

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Fit one LightGBM quantile regressor and predict on `X_test`.

    Parameters
    ----------
    q : float
        Quantile level, passed to LightGBM as `alpha` of the 'quantile' objective.
    X_train, Y_train : training inputs and labels.
    X_valid, Y_valid : validation set used for early stopping (300 rounds).
    X_test : inputs to predict.

    Returns
    -------
    (pred, model) : (pandas.Series, LGBMRegressor)
        `pred` holds the predictions rounded to 2 decimals with a default
        0..n-1 index; `model` is the fitted estimator.
    """
    # (a) Modeling
    # `bagging_fraction` is an alias of `subsample`; passing both (same value)
    # only produced a warning, so a single spelling is kept.
    # NOTE(review): with the default subsample_freq (0) LightGBM does not
    # perform bagging, so subsample=0.7 currently has no effect - confirm intent.
    model = LGBMRegressor(objective='quantile', alpha=q,
                          n_estimators=10000, learning_rate=0.027, subsample=0.7)

    fit_kwargs = {'eval_metric': ['quantile'], 'eval_set': [(X_valid, Y_valid)]}
    try:
        # LightGBM 4 removed `early_stopping_rounds` / `verbose` from fit();
        # the callbacks below are the supported replacement and print the same
        # progress / "best iteration" messages.
        from lightgbm import early_stopping, log_evaluation
    except ImportError:
        # Old LightGBM releases without these callbacks: keep the legacy keywords.
        fit_kwargs.update(early_stopping_rounds=300, verbose=500)
    else:
        fit_kwargs['callbacks'] = [early_stopping(300), log_evaluation(500)]

    model.fit(X_train, Y_train, **fit_kwargs)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [None]:
# Target prediction
import sys
from io import StringIO

def get_best_quantiles(data):
    """
    Extract the best validation loss of each of the nine quantile models
    from captured training output.

    The text is scanned for header lines containing "best iteration"
    (case-insensitive, so both "Early stopping, best iteration is:" and
    "Did not meet early stopping. Best iteration is:" are recognised); the
    loss is the number after the last ':' on the line that follows each header.

    Parameters
    ----------
    data : str
        Captured stdout of the nine training runs, in quantile order.

    Returns
    -------
    dict
        {0.1: loss, 0.2: loss, ..., 0.9: loss}. Keys are rounded to one
        decimal so that they compare equal to the literals 0.1 ... 0.9.

    Raises
    ------
    IndexError
        If fewer than nine "best iteration" headers with a following line
        are present.
    ValueError
        If the text after the last ':' on a result line is not a number.
    """
    lines = data.splitlines()

    # Index of the line FOLLOWING each header; a header that is the very last
    # line has no result line and is skipped.
    best_indexes = [
        i + 1
        for i, line in enumerate(lines)
        if "best iteration" in line.lower() and i + 1 < len(lines)
    ]

    if len(best_indexes) < 9:
        raise IndexError(
            "expected 9 'best iteration' entries in the training log, found %d"
            % len(best_indexes)
        )

    best_quantiles = {}
    for i in range(9):
        # round() avoids keys such as 0.30000000000000004 produced by 3 * 0.1.
        key = round((i + 1) * 0.1, 1)
        val = float(lines[best_indexes[i]].split(":")[-1].strip())
        best_quantiles[key] = val

    return best_quantiles

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Train one quantile model per entry of the global `quantiles` and assemble
    the full test-set prediction table.

    Parameters
    ----------
    X_train, Y_train, X_valid, Y_valid :
        Forwarded unchanged to `LGBM`.
    X_test : pandas.DataFrame
        Non-zero-DHI test rows to predict; its index is used to label the
        predictions.

    Returns
    -------
    (LGBM_models, LGBM_actual_pred, LGBM_losses)
        LGBM_models : list of fitted models, one per quantile.
        LGBM_actual_pred : DataFrame indexed by the union of `X_test.index`
            and the global `X_zero_dhi_test_total.index`, sorted by index,
            one column per quantile. Zero-DHI rows are filled with 0.0.
        LGBM_losses : dict returned by `get_best_quantiles` for the captured
            training output.
    """
    LGBM_models = []
    per_quantile_preds = []

    # Zero-DHI rows are not modelled; they always get a prediction of 0.0.
    zero_pred = pd.Series(0.0, index=X_zero_dhi_test_total.index)

    # Capture everything printed during training so the best losses can be
    # parsed afterwards. try/finally guarantees stdout is restored even if a
    # fit fails - otherwise the notebook would stay silent from then on.
    old_stdout = sys.stdout
    sys.stdout = my_stdout = StringIO()
    try:
        for q in quantiles:
            print(q)

            pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
            LGBM_models.append(model)
            # Label predictions with the rows they were computed for.
            pred.index = X_test.index

            pred_result = pd.concat([pred, zero_pred], axis=0).sort_index()
            per_quantile_preds.append(pred_result)
    finally:
        sys.stdout = old_stdout

    results_out = my_stdout.getvalue()
    LGBM_losses = get_best_quantiles(results_out)

    # Single concat instead of growing the frame inside the loop.
    LGBM_actual_pred = pd.concat(per_quantile_preds, axis=1)
    LGBM_actual_pred.columns = quantiles

    return LGBM_models, LGBM_actual_pred, LGBM_losses

In [None]:
# Target1
# Fit one model per quantile on the Target1 split, then preview the first 48 rows of the index-sorted predictions.
models_1, results_1, loss_1 = train_data(X_train_1, Y_train_1, X_valid_1, Y_valid_1, X_nonzero_dhi_test_total)
results_1.sort_index()[:48]

In [None]:
# First 48 rows by position (train_data already returns the rows sorted by index).
results_1.iloc[:48]

In [None]:
# Print the parsed best validation loss per quantile for Target1, followed by their mean.
for key, val in loss_1.items():
    print("[%.1f]"%key + "\t%f"%val)
print("[avg]\t%.6f"%np.average(list(loss_1.values())))

In [None]:
# Target2
# Fit one model per quantile on the Target2 split, then preview the first 48 rows of the index-sorted predictions.
models_2, results_2, loss_2 = train_data(X_train_2, Y_train_2, X_valid_2, Y_valid_2, X_nonzero_dhi_test_total)
results_2.sort_index()[:48]

In [None]:
# Print the parsed best validation loss per quantile for Target2, followed by their mean.
for key, val in loss_2.items():
    print("[%.1f]"%key + "\t%f"%val)
print("[avg]\t%.6f"%np.average(list(loss_2.values())))

In [None]:
# Write the predictions into the submission template: rows whose id contains "Day7" take the
# Target1 results, rows whose id contains "Day8" take the Target2 results, for all columns from "q_0.1" onward.
# The assignment uses .values, i.e. it is positional.
# NOTE(review): assumes the submission rows are in the same order as the index-sorted results - confirm.
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.values
submission

In [None]:
# Preview the first 48 rows of the filled submission.
submission.iloc[:48]

In [None]:
# submission.to_csv(data_path + './data/submission.csv', index=False)