In [9]:
from google.colab import drive
# Mount Google Drive at /content/drive so the notebook can read files stored under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta, time
from tqdm import tqdm
# Raise the pandas display limits so that up to 1000 rows and 1000 columns are rendered.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [6]:
from warnings import filterwarnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas SettingWithCopyWarning and library deprecation warnings.
filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import seaborn as sns
%matplotlib inline
import statsmodels.api as sm

In [10]:
# Load the merged dataset from Drive.
# header=0 replaces the file's first row with our own column names in a single
# step; previously that row was read as data and sliced off afterwards, which
# forced every column to be parsed as strings.
tot_df = pd.read_csv(
    '/content/drive/MyDrive/KDT/tot_df.csv',
    names=['forecast_time', 'solar_result', 'temp', 'rain_r', 'hum', 'wind', 'wins', 'sky'],
    header=0,
    # Keep the timestamp as text so the '%Y%m%d%H' format below applies verbatim.
    dtype={'forecast_time': str},
)
tot_df['forecast_time'] = pd.to_datetime(tot_df['forecast_time'], format='%Y%m%d%H')
# Skip the first six rows. The copy makes `data` an independent frame, so adding
# columns to it later does not write into a view of `tot_df`.
data = tot_df.iloc[6:].copy()
data.head()

FileNotFoundError: ignored

# 01. 학습 데이터 전처리 (Training data preprocessing)

In [None]:
# Derive calendar features from the timestamp column of `data` itself.
# `assign` returns a new frame, so this cell is safe to re-run and never writes
# into a slice of `tot_df`; the `.dt` accessor is the vectorized equivalent of
# applying `lambda x: x.hour` (etc.) row by row.
forecast_ts = data['forecast_time']
data = data.assign(
    hour=forecast_ts.dt.hour,
    year=forecast_ts.dt.year,
    month=forecast_ts.dt.month,
    day=forecast_ts.dt.day,
)

In [None]:
# Keep the target column and everything to its right, cast to float,
# then show the number of missing values per column.
dataset = data.loc[:, 'solar_result':].astype(float)
dataset.isna().sum()

In [None]:
# Positional 70/30 split of `data`: the first 70% of rows become `train`, the rest `test`.
split_at = round(len(data) * 0.7)
train, test = data.iloc[:split_at], data.iloc[split_at:]

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `dataset`.
plt.figure(figsize=(10, 8))
corr_matrix = dataset.corr()
sns.heatmap(data=corr_matrix, annot=True, fmt='.2f', linewidths=.5, cmap='Blues')

In [None]:
# Show the column names of `dataset` (used to pick the pairplot variables).
dataset.columns

In [None]:
# One scatter panel per feature, each plotted against the target `solar_result`.
feature_cols = ['temp', 'rain_r', 'hum', 'wind', 'wins', 'sky',
                'hour', 'year', 'month', 'day']
sns.pairplot(dataset, x_vars=feature_cols, y_vars=['solar_result'])

# LR

In [None]:
from statsmodels.formula.api import ols
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Create a min-max scaler with default settings.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on every row and column of `dataset` (the target column included).
# NOTE(review): `scaler` is not used to transform anything in the cells visible here;
# if it is used later, fitting before the train/test split leaks test statistics — confirm intent.
scaler.fit(dataset)

In [None]:
# Positional 70/30 split of the float-typed `dataset`; this replaces the earlier
# `train`/`test` that were cut from `data`.
holdout_start = round(len(dataset) * 0.7)
train, test = dataset.iloc[:holdout_start], dataset.iloc[holdout_start:]

In [None]:
# Carve a validation set out of the 70% training portion (85% train / 15% validation).
# Both boundaries are computed from `dataset` before any slicing: previously
# `validation` was sliced from the already-truncated `train`, which made it a
# subset of the training rows (leakage), and re-running the cell shrank `train`
# a little more each time.
n_trainval = round(len(dataset) * 0.7)
n_train = round(n_trainval * 0.85)
train = dataset.iloc[:n_train]
validation = dataset.iloc[n_train:n_trainval]

In [None]:
# Target is 'solar_result'; features are every column from 'temp' to the right.
y = train['solar_result']
X = train.loc[:, 'temp':]
# The formula refers to the notebook-level variables `y` and `X` by name.
res = ols('y~X', data=train).fit()

In [None]:
# Show the feature names, in the order they appear in the design matrix `X`.
X.columns

In [None]:
# Display the fitted OLS regression summary.
res.summary() # exclude wind, wins, day

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

In [None]:
# Fit a plain linear regression on the training features and predict the validation rows.
X_test = validation.loc[:, 'temp':]
lm = linear_model.LinearRegression()
model = lm.fit(X, y)  # `fit` returns the estimator itself, so `model` is `lm`
predictions = model.predict(X_test)
predictions[:5]

In [None]:
# Scatter of true validation targets against the linear model's predictions.
y_test = validation['solar_result']
plt.figure(figsize=(12, 12))
plt.scatter(y_test, predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")

In [None]:
# Score the linear model on the validation features/targets (R^2 for scikit-learn regressors).
model.score(X_test, y_test)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Ridge regression with regularization strength alpha=10, fitted on the training split.
model_ridge_alpha_10 = Ridge(alpha=10).fit(X, y)

# Report the score on the training rows and on the held-out rows.
ridge_train_score = model_ridge_alpha_10.score(X, y)
ridge_holdout_score = model_ridge_alpha_10.score(X_test, y_test)
print(f"훈련 세트 점수: {ridge_train_score:f}")
print(f"테스트 세트 점수: {ridge_holdout_score:f}")

# Ensemble

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Create and train the model.
# random_state is fixed so that repeated runs produce the same trees and scores.
model_gradient = GradientBoostingRegressor(n_estimators=500, random_state=42)
model_gradient.fit(X, y)

# Compare the score on the training rows with the score on the held-out rows.
print("훈련 세트 점수: {:f}".format(model_gradient.score(X, y)))
print("테스트 세트 점수: {:f}".format(model_gradient.score(X_test, y_test)))

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize the features, then fit a support vector regressor (C=1000, epsilon=0.2).
SupportVectorRegModel = make_pipeline(
    StandardScaler(),
    SVR(C=1000.0, epsilon=0.2),
)
SupportVectorRegModel.fit(X, y)

# Compare the score on the training rows with the score on the held-out rows.
svr_train_score = SupportVectorRegModel.score(X, y)
svr_holdout_score = SupportVectorRegModel.score(X_test, y_test)
print(f"훈련 세트 점수: {svr_train_score:f}")
print(f"테스트 세트 점수: {svr_holdout_score:f}")

In [None]:
# Plot actual vs. predicted output of the gradient boosting model on the held-out rows.
predictions = model_gradient.predict(X_test)
# The x positions continue the row count after the training rows. The original
# referenced an undefined `y_train` (the training target is `y`) and built the
# positions in a variable named `list`, which shadowed the builtin.
sample_index = np.arange(len(y), len(y) + len(y_test))

plt.figure(figsize=(40,12))
plt.xlabel('index', fontsize=18)
plt.ylabel('발전량', fontsize=18)
plt.plot(sample_index, y_test, color='blue', marker='^', label='실제 발전량')
plt.plot(sample_index, predictions, color='red', marker='o', label='예상 발전량')
plt.legend(fontsize='xx-large')
# plt.savefig('./predictions_1.png') # test = 0.1
# plt.savefig('./predictions_2.png') # test = 0.2
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Create and train the model.
# random_state is fixed: a random forest bootstraps rows and samples features,
# so without a seed every run reports different scores.
model_randomForest = RandomForestRegressor(n_estimators=10, random_state=42)
model_randomForest.fit(X, y)

# Compare the score on the training rows with the score on the held-out rows.
print("훈련 세트 점수: {:f}".format(model_randomForest.score(X, y)))
print("테스트 세트 점수: {:f}".format(model_randomForest.score(X_test, y_test)))

In [None]:
# Scatter of true held-out targets against the random forest's predictions.
rf_predictions = model_randomForest.predict(X_test)
predictions = rf_predictions
plt.figure(figsize=(12, 12))
plt.scatter(y_test, predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")

# LightGBM (LGB)

LightGBM Custom Metric

In [None]:
# Rebuild the train/validation split for LightGBM directly from `dataset`
# (first 70% of rows, of which 85% train and 15% validation).
# The previous version cut `train` at round(len(data) * 0.85), an index larger
# than `train` itself, so `train` was unchanged and `validation` was always empty.
# Deriving both parts from `dataset` also makes the cell safe to re-run.
n_trainval = round(len(dataset) * 0.7)
n_train = round(n_trainval * 0.85)
train = dataset.iloc[:n_train]
validation = dataset.iloc[n_train:n_trainval]

In [None]:
def nmae_10(y_pred, dataset, *, cap=None):
    """Custom LightGBM evaluation metric: capacity-normalized MAE in percent.

    Only samples whose true value is at least 10% of the capacity contribute
    to the mean.

    Args:
        y_pred: Predicted values, one per row of ``dataset``.
        dataset: Object exposing ``get_label()`` that returns the true values.
        cap: Capacity used for normalization. When omitted, the notebook-level
            ``capacity`` variable is read at call time.

    Returns:
        Tuple ``('score', nmae, False)``; the trailing ``False`` marks the
        metric as lower-is-better. ``nmae`` is NaN when no sample reaches the
        10% threshold.
    """
    cap = capacity if cap is None else cap

    # Work on float arrays: the original in-place `/=` raised on integer input.
    y_true = np.asarray(dataset.get_label(), dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)

    normalized_error = np.abs(y_true - y_hat) / cap
    target_mask = y_true >= cap * 0.1

    if not target_mask.any():
        # No sample reaches the threshold: return NaN explicitly instead of
        # triggering NumPy's "mean of empty slice" warning.
        return 'score', float('nan'), False

    nmae = 100 * normalized_error[target_mask].mean()

    return 'score', nmae, False

Validation Metric

In [None]:
def sola_nmae(answer, pred, *, cap=None):
    """Validation metric: capacity-normalized MAE in percent.

    Only samples whose true value is at least 10% of the capacity contribute
    to the mean.

    Args:
        answer: True values (array-like; a pandas Series with any index works).
        pred: Predicted values, aligned with ``answer`` by position.
        cap: Capacity used for normalization. When omitted, the notebook-level
            ``capacity`` variable is read at call time.

    Returns:
        The metric as a float, or NaN when no sample reaches the 10% threshold.
    """
    cap = capacity if cap is None else cap

    # Convert to plain float arrays so that (a) integer inputs do not fail on
    # division and (b) a Series index cannot interfere with positional selection.
    y_true = np.asarray(answer, dtype=float)
    y_hat = np.asarray(pred, dtype=float)

    normalized_error = np.abs(y_true - y_hat) / cap
    target_mask = y_true >= cap * 0.1

    if not target_mask.any():
        # No sample reaches the threshold: return NaN explicitly instead of
        # triggering NumPy's "mean of empty slice" warning.
        return float('nan')

    nmae = 100 * normalized_error[target_mask].mean()

    return nmae

LightGBM Hyperparameter

In [None]:
# LightGBM training parameters: L2 regression objective, MAE reported as the
# built-in metric, small learning rate, fixed seed.
params = dict(
    learning_rate=0.01,
    objective='regression',
    metric='mae',
    seed=42,
)

In [None]:
import lightgbm as lgb
from xgboost import plot_importance

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings

In [None]:
# Feature/target frames for LightGBM.
# Features are selected by label ('temp' and everything to its right), matching
# how `X` is built for the other models. The previous positional `iloc[:, 2:]`
# assumed a leading timestamp column; on frames that start at 'solar_result'
# it silently dropped the 'temp' feature.
train_x = train.loc[:, 'temp':]
train_y = train['solar_result']
val_x = validation.loc[:, 'temp':]
val_y = validation['solar_result']

In [None]:
# Display the training target series as a sanity check.
train_y

In [None]:
# Capacity read by the custom metric `nmae_10`; it must be defined before training starts.
# NOTE(review): presumably the plant's installed capacity in the target's unit — confirm.
capacity = 1000

train_dataset = lgb.Dataset(train_x, train_y)
# `reference` makes the validation set reuse the training set's feature binning.
val_dataset = lgb.Dataset(val_x, val_y, reference=train_dataset)

# Train for up to 10000 rounds, stopping once the validation metrics have not
# improved for 100 rounds and logging every 500 rounds. Early stopping and
# logging are passed as callbacks because the `early_stopping_rounds` and
# `verbose_eval` keyword arguments were removed from `lgb.train` in LightGBM 4.
dangjin_floating_model = lgb.train(
    params,
    train_dataset,
    num_boost_round=10000,
    valid_sets=[val_dataset],
    feval=nmae_10,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=500),
    ],
)