In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns  
import warnings
import numpy as np
import matplotlib.font_manager as fm

from matplotlib import font_manager, rc
from scipy.stats import skew, kurtosis
from factor_analyzer import FactorAnalyzer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn.metrics import mean_squared_error

font_path = 'C:/WINDOWS/Fonts/Hancom Gothic Bold.ttf'
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

warnings.filterwarnings('ignore')

In [2]:
path = "C:/Users/USER/Desktop/OIBC2023_data/OIBC2023_data/{}"

incentive = pd.read_csv(path.format('incentive.csv'))
pred = pd.read_csv(path.format('pred.csv'))
weather_actual = pd.read_csv(path.format('weather_actual.csv'))
weather_forecast = pd.read_csv(path.format('weather_forecast.csv'))
gens = pd.read_csv(path.format('gens.csv'))

In [3]:
incentive['time'] = pd.to_datetime(incentive['time'])
pred['time'] = pd.to_datetime(pred['time'])
weather_actual['time'] = pd.to_datetime(weather_actual['time'])
weather_forecast['time'] = pd.to_datetime(weather_forecast['time'])
gens['time'] = pd.to_datetime(gens['time'])

In [4]:
weather_r1 = weather_forecast[weather_forecast['round'] == 1]
weather_r2 = weather_forecast[weather_forecast['round'] == 2]

In [5]:
weather_forecast['hour'] = weather_forecast['time'].dt.hour
weather_forecast['day'] = weather_forecast['time'].dt.day
weather_forecast['month'] = weather_forecast['time'].dt.month
weather_forecast['date'] = weather_forecast['time'].dt.date
weather_forecast['weekday'] = weather_forecast['time'].dt.weekday

gens['hour'] = gens['time'].dt.hour
gens['day'] = gens['time'].dt.day
gens['month'] = gens['time'].dt.month
gens['date'] = gens['time'].dt.date
gens['weekday'] = gens['time'].dt.weekday

In [6]:
model1 = pred[pred['model_id'] == 0]
model2 = pred[pred['model_id'] == 1]
model3 = pred[pred['model_id'] == 2]
model4 = pred[pred['model_id'] == 3]
model5 = pred[pred['model_id'] == 4]

model1_r1 = model1[model1['round'] == 1]
model2_r1 = model2[model2['round'] == 1]
model3_r1 = model3[model3['round'] == 1]
model4_r1 = model4[model4['round'] == 1]
model5_r1 = model5[model5['round'] == 1]

model1_r2 = model1[model1['round'] == 2]
model2_r2 = model2[model2['round'] == 2]
model3_r2 = model3[model3['round'] == 2]
model4_r2 = model4[model4['round'] == 2]
model5_r2 = model5[model5['round'] == 2]

model1_r1 = pd.merge(model1_r1, weather_r1, on='time')
model2_r1 = pd.merge(model2_r1, weather_r1, on='time')
model3_r1 = pd.merge(model3_r1, weather_r1, on='time')
model4_r1 = pd.merge(model4_r1, weather_r1, on='time')
model5_r1 = pd.merge(model5_r1, weather_r1, on='time')

model1_r2 = pd.merge(model1_r2, weather_r2, on='time')
model2_r2 = pd.merge(model2_r2, weather_r2, on='time')
model3_r2 = pd.merge(model3_r2, weather_r2, on='time')
model4_r2 = pd.merge(model4_r2, weather_r2, on='time')
model5_r2 = pd.merge(model5_r2, weather_r2, on='time')

model1_r1.rename(columns = {'amount': 'pred'}, inplace=True)
model2_r1.rename(columns = {'amount': 'pred'}, inplace=True)
model3_r1.rename(columns = {'amount': 'pred'}, inplace=True)
model4_r1.rename(columns = {'amount': 'pred'}, inplace=True)
model5_r1.rename(columns = {'amount': 'pred'}, inplace=True)

model1_r2.rename(columns = {'amount': 'pred'}, inplace=True)
model2_r2.rename(columns = {'amount': 'pred'}, inplace=True)
model3_r2.rename(columns = {'amount': 'pred'}, inplace=True)
model4_r2.rename(columns = {'amount': 'pred'}, inplace=True)
model5_r2.rename(columns = {'amount': 'pred'}, inplace=True)

In [7]:
model1_r1

Unnamed: 0,round_x,time,model_id,pred,round_y,cloud,temp,humidity,ground_press,wind_speed,wind_dir,rain,snow,dew_point,vis,uv_idx,azimuth,elevation
0,1,2022-06-19 01:00:00+09:00,0,0.0,1,6.0,20.03,93.0,1009.0,3.01,162.0,0.0,0.0,18.3333,16.0934,0.0,6.70428,-31.5296
1,1,2022-06-19 02:00:00+09:00,0,0.0,1,7.0,19.88,95.0,1009.0,3.16,159.0,0.0,0.0,18.3333,16.0934,0.0,22.19640,-28.4404
2,1,2022-06-19 03:00:00+09:00,0,0.0,1,17.0,19.99,96.0,1008.0,2.92,161.0,0.0,0.0,18.3333,16.0934,0.0,35.91940,-22.4374
3,1,2022-06-19 04:00:00+09:00,0,0.0,1,100.0,20.19,96.0,1008.0,2.79,157.0,0.0,0.0,17.7778,16.0934,0.0,47.55770,-14.2214
4,1,2022-06-19 05:00:00+09:00,0,0.0,1,100.0,20.34,95.0,1008.0,2.74,156.0,0.0,0.0,18.3333,16.0934,0.0,57.37820,-4.4447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11611,1,2023-10-15 20:00:00+09:00,0,0.0,1,0.0,18.50,72.0,1015.0,5.49,325.0,0.0,0.0,12.7778,16.0934,0.0,277.46500,-25.3792
11612,1,2023-10-15 21:00:00+09:00,0,0.0,1,0.0,18.60,73.0,1015.0,5.36,313.0,0.0,0.0,12.7778,16.0934,0.0,287.67900,-37.4097
11613,1,2023-10-15 22:00:00+09:00,0,0.0,1,0.0,18.64,73.0,1015.0,5.77,306.0,0.0,0.0,12.7778,16.0934,0.0,301.00700,-48.6552
11614,1,2023-10-15 23:00:00+09:00,0,0.0,1,0.0,18.70,70.0,1015.0,5.91,302.0,0.0,0.0,12.2222,16.0934,0.0,320.43400,-58.0565


In [8]:
def error(df):
    df['error'] = abs(df['real'] - df['pred'])
    return df

In [9]:
error_round1 = pd.merge(gens, pred[pred['round'] == 1], on='time')
error_round2 = pd.merge(gens, pred[pred['round'] == 2], on='time')

error_round1.rename(columns = {'amount_x' : 'real', 'amount_y':'pred'}, inplace = True)
error_round2.rename(columns = {'amount_x' : 'real', 'amount_y':'pred'}, inplace = True)

error_round1 = error(error_round1)
error_round2 = error(error_round2)

In [10]:
# 모든 시간대별 가장 낮은 error값을 기록한 모델을 선별

# 중복을 제거하기 위해서 error 값이 0과 1인 행은 제거
error_round1 = error_round1[(error_round1['error'] != 0) & (error_round1['error'] != 1)]
error_round2 = error_round2[(error_round2['error'] != 0) & (error_round2['error'] != 1)]

# 각 round별로 error값이 최소인 행들을 추출하여 새로운 df를 생성
min_error1 = error_round1.loc[error_round1.groupby('time')['error'].idxmin()]
min_error2 = error_round2.loc[error_round2.groupby('time')['error'].idxmin()]

In [11]:
weather_forecast10 = weather_forecast[weather_forecast['round'] == 1]
weather_forecast17 = weather_forecast[weather_forecast['round'] == 2]

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
value = ['cloud', 'temp', 'humidity', 'ground_press',
       'wind_speed', 'wind_dir', 'rain', 'snow', 'dew_point', 'vis', 'uv_idx',
       'azimuth', 'elevation']

+ 군집에 따른 시간대별 모델 분포 확인

### 기상 조건, 시간, 달, 요일을 설명변수로 두고 model id를 예측하는 모델을 만들어보자

In [29]:
min_error1

Unnamed: 0,time,real,hour,day,month,date,weekday,round,model_id,pred,error
33,2022-06-19 07:00:00+09:00,5.0,7,19,6,2022-06-19,6,1,3,2.536960,2.463040
39,2022-06-19 08:00:00+09:00,14.0,8,19,6,2022-06-19,6,1,4,12.995900,1.004100
44,2022-06-19 09:00:00+09:00,22.0,9,19,6,2022-06-19,6,1,4,22.877900,0.877900
49,2022-06-19 10:00:00+09:00,18.0,10,19,6,2022-06-19,6,1,4,23.641100,5.641100
52,2022-06-19 11:00:00+09:00,59.0,11,19,6,2022-06-19,6,1,2,45.831400,13.168600
...,...,...,...,...,...,...,...,...,...,...,...
58031,2023-10-15 15:00:00+09:00,54.0,15,15,10,2023-10-15,6,1,1,49.597200,4.402800
58035,2023-10-15 16:00:00+09:00,47.0,16,15,10,2023-10-15,6,1,0,38.211800,8.788200
58041,2023-10-15 17:00:00+09:00,28.0,17,15,10,2023-10-15,6,1,1,27.272900,0.727100
58048,2023-10-15 18:00:00+09:00,5.0,18,15,10,2023-10-15,6,1,3,5.105960,0.105960


In [40]:
round1 = pd.merge(min_error1[['time', 'model_id', 'pred', 'error', 'real']], weather_forecast10, on='time')
round2 = pd.merge(min_error2[['time', 'model_id', 'pred', 'error', 'real']], weather_forecast17, on='time')

In [41]:
round1

Unnamed: 0,time,model_id,pred,error,real,round,cloud,temp,humidity,ground_press,...,dew_point,vis,uv_idx,azimuth,elevation,hour,day,month,date,weekday
0,2022-06-19 07:00:00+09:00,3,2.536960,2.463040,5.0,1,100.0,20.65,94.0,1009.0,...,19.4444,16.0934,0.0,73.5196,17.946100,7,19,6,2022-06-19,6
1,2022-06-19 08:00:00+09:00,4,12.995900,1.004100,14.0,1,100.0,20.84,93.0,1008.0,...,19.4444,16.0934,1.0,80.8871,29.953600,8,19,6,2022-06-19,6
2,2022-06-19 09:00:00+09:00,4,22.877900,0.877900,22.0,1,100.0,20.92,93.0,1008.0,...,18.8889,16.0934,1.0,88.6491,42.213400,9,19,6,2022-06-19,6
3,2022-06-19 10:00:00+09:00,4,23.641100,5.641100,18.0,1,100.0,21.09,92.0,1008.0,...,18.3333,16.0934,3.0,98.0436,54.503300,10,19,6,2022-06-19,6
4,2022-06-19 11:00:00+09:00,2,45.831400,13.168600,59.0,1,100.0,21.51,89.0,1008.0,...,18.3333,16.0934,5.0,112.3420,66.401400,11,19,6,2022-06-19,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6399,2023-10-15 15:00:00+09:00,1,49.597200,4.402800,54.0,1,10.0,19.53,69.0,1012.0,...,13.3333,16.0934,2.0,229.0640,32.576700,15,15,10,2023-10-15,6
6400,2023-10-15 16:00:00+09:00,0,38.211800,8.788200,47.0,1,0.0,19.44,69.0,1013.0,...,12.7778,16.0934,1.0,241.3140,22.437400,16,15,10,2023-10-15,6
6401,2023-10-15 17:00:00+09:00,1,27.272900,0.727100,28.0,1,0.0,19.20,69.0,1013.0,...,12.7778,16.0934,1.0,251.3850,11.140000,17,15,10,2023-10-15,6
6402,2023-10-15 18:00:00+09:00,3,5.105960,0.105960,5.0,1,0.0,18.82,70.0,1014.0,...,12.2222,16.0934,0.0,260.2330,-0.809771,18,15,10,2023-10-15,6


In [42]:
col = ['cloud', 'temp', 'humidity',
       'ground_press', 'wind_speed', 'wind_dir', 'rain', 'snow', 'dew_point',
       'vis', 'uv_idx', 'azimuth', 'elevation', 'pred']

In [67]:
def make_df(df):
    a = round2.copy()
    time_range = a[-13:]['time']
    use_model = df[df['time'].isin(time_range)]
#     print(use_model)
    a[-13:]['pred'] = use_model['pred'].values
    
    return a

In [68]:
# one = round1.copy()
one = make_df(model1_r2)

In [69]:
scaler_ = MinMaxScaler()
scaler_.fit(round2[col])
one[col] = scaler_.transform(one[col])

In [70]:
x_col = ['hour', 'month', 'cloud', 'temp', 'humidity',
       'ground_press', 'wind_speed', 'wind_dir', 'rain', 'snow', 'dew_point',
       'vis', 'uv_idx', 'azimuth', 'elevation', 'pred']

In [71]:
scaler_.fit(round1[['real']])
one[['real']] = scaler_.transform(one[['real']])

In [72]:
def min_max_scaling_reverse(scaled_value, min_val, max_val):
    original_value = scaled_value * (max_val - min_val) + min_val
    return original_value

# XGB 모델

In [73]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit

In [74]:
param_grid = {
    'n_estimators': [100, 200, 300],  # 트리의 개수
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'max_depth': [3, 4, 5],  # 트리의 최대 깊이
    'subsample': [0.8, 0.9, 1.0],  # 샘플링 비율
}


In [75]:
X = one[x_col]
y = one.real
# # 데이터를 학습용과 테스트용으로 나눕니다
X_train, X_test, y_train, y_test = temporal_train_test_split(X, y, test_size=13)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

tscv = TimeSeriesSplit(n_splits=5)  # 5개의 시계열 데이터 분할을 사용

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=1)
grid_search.fit(X_train, y_train)  # X와 y는 시계열 데이터 및 해당 타겟 값입니다.

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [76]:
pred = grid_search.predict(X_test)
pred = min_max_scaling_reverse(pred, round1['pred'].min(), round1['pred'].max())

In [77]:
mean_squared_error(round2[-13:]['pred'], pred)

24.957235785074086

In [78]:
mean_squared_error(round2[-13:]['real'], round2[-13:]['pred'])

28.99043771141131

# 선형회귀분석모델

In [None]:
# import statsmodels.api as sm
# X = X_train
# y = y_train

# # 상수 (절편)를 추가
# X = sm.add_constant(X)

# # OLS 모델 생성
# model = sm.OLS(y, X)

# # 모델 학습
# results = model.fit()

# # 회귀 결과 요약
# print(results.summary())

# # 회귀 계수 출력
# print("회귀 계수 (coefficients):")
# print(results.params)

# X_t = sm.add_constant(X_test)
# pred = results.predict(X_t)

In [79]:
import requests

date = '2023-11-04'
bid_round = 2
API_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJTZ3FicHhyZVVMaGRtaWVuU1JxWWl4IiwiaWF0IjoxNjk4ODk2MTYxLCJleHAiOjE3MDAyMzMyMDAsInR5cGUiOiJhcGlfa2V5In0.I9OvmWqhDhf3ePv8t-hFFWwGCokcSbK7e8-fJfIZ5lU"
gen_fcst = requests.get(f'https://research-api.solarkim.com/cmpt-2023/gen-forecasts/{date}/{bid_round}', headers={
                            'Authorization': f'Bearer {API_KEY}'
                        }).json()
weather_fcst = requests.get(f'https://research-api.solarkim.com/cmpt-2023/weathers-forecasts/{date}/{bid_round}', headers={
                            'Authorization': f'Bearer {API_KEY}'
                        }).json()

gen_fcst = pd.DataFrame(gen_fcst).set_index('time')
gen_fcst['time'] = gen_fcst.index
gen_fcst.index = range(0, 24)
weather_fcst = pd.DataFrame(weather_fcst)

In [91]:
gen_fcst

Unnamed: 0,model1,model2,model3,model4,model5,time
0,0.0,0.0,0.0,0.0,0.0,2023-11-03 16:00:00+00:00
1,0.0,0.0,0.0,0.0,0.0,2023-11-03 17:00:00+00:00
2,0.0,0.0,0.0,0.0,0.0,2023-11-03 18:00:00+00:00
3,0.0,0.0,0.0,0.0,0.0,2023-11-03 19:00:00+00:00
4,0.0,0.0,0.0,0.0,0.0,2023-11-03 20:00:00+00:00
5,0.0,0.0,0.0,0.0,0.0,2023-11-03 21:00:00+00:00
6,0.919664,0.428609,0.119851,0.477187,0.535121,2023-11-03 22:00:00+00:00
7,2.57798,2.72835,0.922678,1.30587,6.65086,2023-11-03 23:00:00+00:00
8,10.4845,7.46373,9.67406,6.45546,24.9172,2023-11-04 00:00:00+00:00
9,21.6479,14.1435,17.0286,13.1828,44.2914,2023-11-04 01:00:00+00:00


In [80]:
gen_fcst['time'] = pd.to_datetime(gen_fcst['time'])
weather_fcst['time'] = pd.to_datetime(weather_fcst['time'])

In [81]:
import pytz
# import datetime
api_data = pd.merge(gen_fcst, weather_fcst, on='time')

api_data['time'] = api_data['time'] + dt.timedelta(hours=9)

In [82]:
api_data['hour'] = api_data['time'].dt.hour
api_data['day'] = api_data['time'].dt.day
api_data['month'] = api_data['time'].dt.month
api_data['date'] = api_data['time'].dt.date
api_data['weekday'] = api_data['time'].dt.weekday

In [84]:
# api_data.rename(columns={'model1' : 'pred'}, inplace=True)

In [85]:
api_data['pred'] = api_data[['model1', 'model2', 'model3', 'model4', 'model5']].mean(axis=1)

In [86]:
api_copy = api_data.copy()

scaler_ = MinMaxScaler()
scaler_.fit(api_copy[col])
api_copy[col] = scaler_.transform(api_copy[col])

In [87]:
api_pred = grid_search.predict(api_copy[x_col])
api_pred = min_max_scaling_reverse(api_pred, api_data['pred'].min(), api_data['pred'].max())
api_pred[0:6] = 0
api_pred[19:] = 0

In [88]:
api_pred = api_pred.astype('float')

In [89]:
prediction = list(api_pred)

In [90]:
prediction

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.526559591293335,
 2.7838077545166016,
 9.783551216125488,
 20.618621826171875,
 23.315998077392578,
 23.355192184448242,
 23.30059242248535,
 22.269672393798828,
 21.520959854125977,
 17.781707763671875,
 5.866546630859375,
 1.4011352062225342,
 1.113796353340149,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [None]:
gen_fcst

In [None]:
# # import requests
# import json

# amounts = prediction
# # API_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJTZ3FicHhyZVVMaGRtaWVuU1JxWWl4IiwiaWF0IjoxNjk4ODk2MTYxLCJleHAiOjE3MDAyMzMyMDAsInR5cGUiOiJhcGlfa2V5In0.I9OvmWqhDhf3ePv8t-hFFWwGCokcSbK7e8-fJfIZ5lU"
# success = requests.post(f'https://research-api.solarkim.com/cmpt-2023/bids', data=json.dumps(amounts), headers={
#                             'Authorization': f'Bearer {API_KEY}'
#                         }).json()
# print(success)


# 대회 실제 발전량을 통한 예측