In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns  
import warnings
import numpy as np
import matplotlib.font_manager as fm

from matplotlib import font_manager, rc
from scipy.stats import skew, kurtosis
from factor_analyzer import FactorAnalyzer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn.metrics import mean_squared_error

# Register a Korean-capable font so Hangul labels render in matplotlib figures.
# The font file lives at a Windows-specific location, so the setup is skipped
# (with a visible warning) when the file is absent instead of crashing the
# whole import cell on other machines.
from pathlib import Path

font_path = 'C:/WINDOWS/Fonts/Hancom Gothic Bold.ttf'
if Path(font_path).is_file():
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    font = None
    # Emitted before the blanket filter below so that it is actually shown.
    warnings.warn(f"Font file not found: {font_path}; using matplotlib's default font.")

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the competition CSV/XLSX files.  The original location is
# kept as the default; set the OIBC_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
import os

DATA_DIR = os.environ.get(
    'OIBC_DATA_DIR',
    'C:/Users/USER/Desktop/OIBC2023_data/OIBC2023_data',
)
# `path` remains a format string because later cells call path.format(<file name>).
path = DATA_DIR.rstrip('/\\') + '/{}'

incentive = pd.read_csv(path.format('incentive.csv'))
pred = pd.read_csv(path.format('pred.csv'))
weather_actual = pd.read_csv(path.format('weather_actual.csv'))
weather_forecast = pd.read_csv(path.format('weather_forecast.csv'))
gens = pd.read_csv(path.format('gens.csv'))

In [3]:
# Parse the 'time' column of every loaded table into pandas datetimes (in place).
for frame in (incentive, pred, weather_actual, weather_forecast, gens):
    frame['time'] = pd.to_datetime(frame['time'])

In [4]:
# Calendar features derived from the generation timestamp.
gens_time = gens['time'].dt
gens['hour'] = gens_time.hour
gens['month'] = gens_time.month
# Features left disabled in this cell:
# gens['day'] = gens['time'].dt.day
# gens['date'] = gens['time'].dt.date
# gens['weekday'] = gens['time'].dt.weekday

In [5]:
def _prepare_round(prediction, wf, round_no):
    """Build the feature frame for one bidding round.

    Filters both inputs to ``round == round_no``, pivots the per-model
    predictions to one column per model, joins the weather forecast on
    ``time`` and returns the columns in a fixed order, indexed by ``time``.
    """
    # Keep only this round's model predictions.
    round_preds = prediction[prediction['round'] == round_no]

    # One row per timestamp, one column per model_id.
    wide = round_preds.pivot(index='time', columns='model_id', values='amount').reset_index()

    # Positional rename: requires exactly five model columns after the pivot
    # (pandas raises a length-mismatch ValueError otherwise).
    wide.columns = ['time', 'model0', 'model1', 'model2', 'model3', 'model4']

    # Weather forecast issued for the same round.
    round_weather = wf[wf['round'] == round_no]

    # Inner join on the timestamp, then index by it.
    merged = wide.merge(round_weather, on='time').set_index('time')

    # Weather features first, model outputs last.
    ordered_columns = [
        'cloud', 'temp', 'humidity', 'ground_press', 'wind_speed',
        'wind_dir', 'rain', 'snow', 'dew_point', 'vis', 'uv_idx', 'azimuth',
        'elevation', 'model0', 'model1', 'model2', 'model3', 'model4',
    ]
    return merged[ordered_columns]


def preprocessing(prediction, wf):
    """Split predictions and weather forecasts by round and merge them.

    Parameters
    ----------
    prediction : DataFrame
        Long-format model predictions; the columns ``round``, ``time``,
        ``model_id`` and ``amount`` are read.
    wf : DataFrame
        Weather forecasts; ``round``, ``time`` and the weather feature
        columns are read.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The round-1 and round-2 frames, each indexed by ``time`` with the
        weather features followed by ``model0`` .. ``model4``.
    """
    return _prepare_round(prediction, wf, 1), _prepare_round(prediction, wf, 2)

In [6]:
round1, round2 = preprocessing(pred, weather_forecast)

# Ensemble feature: row-wise mean of the five model predictions.
model_cols = ['model1', 'model2', 'model3', 'model4', 'model0']

# Append the ensemble mean, then attach the generation table (amount, hour, month) by timestamp.
round1 = round1.assign(pred_mean=round1[model_cols].mean(axis=1)).merge(gens, on='time')
round2 = round2.assign(pred_mean=round2[model_cols].mean(axis=1)).merge(gens, on='time')

In [7]:
# Inspect the column layout of the merged round-1 frame.
round1.columns

Index(['time', 'cloud', 'temp', 'humidity', 'ground_press', 'wind_speed',
       'wind_dir', 'rain', 'snow', 'dew_point', 'vis', 'uv_idx', 'azimuth',
       'elevation', 'model0', 'model1', 'model2', 'model3', 'model4',
       'pred_mean', 'amount', 'hour', 'month'],
      dtype='object')

In [8]:
def min_max_scaling_reverse(scaled_value, min_val, max_val):
    """Invert min-max scaling.

    Maps ``scaled_value`` back to the original range via
    ``scaled_value * (max_val - min_val) + min_val`` and returns the result.
    """
    value_range = max_val - min_val
    return scaled_value * value_range + min_val

# XGBoost

In [9]:
# Weather features plus the ensemble mean (the column set referenced by the
# commented-out scaling cells).
col = [
    'cloud', 'temp', 'humidity', 'ground_press',
    'wind_speed', 'wind_dir', 'rain', 'snow',
    'dew_point', 'vis', 'uv_idx', 'azimuth',
    'elevation', 'pred_mean',
]

In [10]:
# Work on copies so the merged round frames stay untouched.
one, two = round1.copy(), round2.copy()

In [11]:
# scaler_ = MinMaxScaler()
# scaler_.fit(one[col])
# one[col] = scaler_.transform(one[col])

# scaler_ = MinMaxScaler()
# scaler_.fit(two[col])
# two[col] = scaler_.transform(two[col])

In [12]:
# Model inputs: the two calendar features followed by every entry of `col`.
x_col = ['hour', 'month', *col]

In [13]:
# scaler_.fit(one[['amount']])
# one[['amount']] = scaler_.transform(one[['amount']])

# scaler_.fit(two[['amount']])
# two[['amount']] = scaler_.transform(two[['amount']])

In [14]:
# def min_max_scaling_reverse(scaled_value, min_val, max_val):
#     original_value = scaled_value * (max_val - min_val) + min_val
#     return original_value

In [15]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit

In [16]:
# Hyperparameter grid explored by the grid search (3 * 3 * 3 * 3 = 81 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],     # number of trees
    learning_rate=[0.01, 0.1, 0.2],   # learning rate
    max_depth=[3, 4, 5],              # maximum tree depth
    subsample=[0.8, 0.9, 1.0],        # row sampling ratio
)


In [17]:
# Feature matrix and target for round 1.
X = one[x_col]
y = one['amount']

# Hold out the last 24 rows as the test window.
# NOTE(review): sktime documents this helper as temporal_train_test_split(y, X=None, ...);
# the positional arguments here are (X, y) and the unpacking order below mirrors
# that swap — confirm against the installed sktime version.
X_train, X_test, y_train, y_test = temporal_train_test_split(X, y, test_size=24)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Time-ordered cross-validation with 5 splits.
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=1,
)
# Fit on the training window only; the held-out rows are not seen here.
grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    i...
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=100,
                                    n_jobs=None, num_parallel_tree=None,
                                 

In [18]:
# pred = grid_search.predict(X_test)
# pred = min_max_scaling_reverse(pred, round1['amount'].min(), round1['amount'].max())
# mean_squared_error(y_test, pred)

# LR

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
# Disabled experiment: fit a plain LinearRegression on the round-2 frame.
# (The original header said "create random forest model", but the code below
# builds a LinearRegression.)
# lr = LinearRegression()
# 42
# X = two[x_col]
# y = two.amount

# # # Define the hyperparameter grid for grid search
# # param_grid = {
# #     'n_estimators': [100, 200, 300],
# #     'max_depth': [None, 10, 20, 30],
# #     'min_samples_split': [2, 5, 10],
# #     'min_samples_leaf': [1, 2, 4],
# #     'max_features': ['auto', 'sqrt'],
# #     'bootstrap': [True, False]
# # }

# # # Create the grid search object
# # rf_grid_model = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=1, verbose=2)

# # Fit the model (no grid search is actually performed here)
# lr.fit(X, y)

# # Print the best hyperparameters and score
# # print("최적 하이퍼파라미터:", grid_search.best_params_)
# # print("최적 모델의 정확도:", grid_search.best_score_)

# API 호출

In [33]:
import os

import requests

# Target delivery date and bidding round for the forecast request.
date = '2023-11-05'
bid_round = 1

# The bearer token is read from the environment instead of being embedded in
# the notebook.  The token that was previously committed in this cell should be
# treated as leaked and revoked.
API_KEY = os.environ.get('SOLARKIM_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the SOLARKIM_API_KEY environment variable to your competition API key."
    )

auth_headers = {'Authorization': f'Bearer {API_KEY}'}

# Generation forecasts from the competition API for the given date and round.
gen_response = requests.get(
    f'https://research-api.solarkim.com/cmpt-2023/gen-forecasts/{date}/{bid_round}',
    headers=auth_headers,
    timeout=30,
)
# Fail loudly on HTTP errors rather than building a frame from an error payload.
gen_response.raise_for_status()
gen_fcst = gen_response.json()

# Weather forecasts for the same date and round.
weather_response = requests.get(
    f'https://research-api.solarkim.com/cmpt-2023/weathers-forecasts/{date}/{bid_round}',
    headers=auth_headers,
    timeout=30,
)
weather_response.raise_for_status()
weather_fcst = weather_response.json()

gen_fcst = pd.DataFrame(gen_fcst)
# Move 'time' to the last column (same layout as before) and use a plain
# 0..n-1 index without assuming that exactly 24 rows came back.
gen_fcst['time'] = gen_fcst.pop('time')
gen_fcst = gen_fcst.reset_index(drop=True)
weather_fcst = pd.DataFrame(weather_fcst)

In [34]:
# Parse the API timestamps into pandas datetimes (in place on both frames).
for fcst in (gen_fcst, weather_fcst):
    fcst['time'] = pd.to_datetime(fcst['time'])

In [35]:
import pytz
# import datetime

# Join generation and weather forecasts on their shared timestamp.
api_data = gen_fcst.merge(weather_fcst, on='time')

# Shift every timestamp forward by nine hours.
# NOTE(review): presumably converts UTC to Korea Standard Time — confirm the API's time zone.
api_data['time'] += dt.timedelta(hours=9)

In [36]:
# Calendar features derived from the shifted timestamps.
api_time = api_data['time'].dt
api_data['hour'] = api_time.hour
api_data['day'] = api_time.day
api_data['month'] = api_time.month
api_data['date'] = api_time.date
api_data['weekday'] = api_time.weekday

# Ensemble feature: row-wise mean of the five forecast columns in the API frame.
api_model_cols = ['model1', 'model2', 'model3', 'model4', 'model5']
api_data['pred_mean'] = api_data[api_model_cols].mean(axis=1)

# scaler_ = MinMaxScaler()
# scaler_.fit(api_data[col])
# api_data[col] = scaler_.transform(api_data[col])

api_pred = grid_search.predict(api_data[x_col])
# api_pred = min_max_scaling_reverse(api_pred, api_data['pred'].min(), api_data['pred'].max())

# Force the first six and the last five predictions to zero
# (presumably night-time rows with no solar output — confirm).
api_pred[:6] = 0
api_pred[19:] = 0

# Pair each prediction with its hour label: rows 0..22 are hours 1..23 and the
# final row is hour 0.
api_pred = pd.DataFrame({
    'pred': api_pred.astype('float'),
    'hour': [*range(1, 24), 0],
})

In [39]:
# Display the hourly predictions produced by the fitted grid search.
api_pred

Unnamed: 0,pred,hour
0,0.0,1
1,0.0,2
2,0.0,3
3,0.0,4
4,0.0,5
5,0.0,6
6,0.096278,7
7,2.234872,8
8,11.354073,9
9,17.360378,10


# 후처리

In [37]:
# Historical mean generation per hour of day.  Selecting 'amount' before
# aggregating avoids averaging unrelated columns (e.g. the datetime 'time').
mean_gens = gens.groupby('hour')['amount'].mean().reset_index()

# Attach the hourly historical mean to each prediction row.
mean_gens = pd.merge(api_pred, mean_gens, on='hour')

submit = mean_gens.copy()

# Rows where the model predicts less than the historical hourly mean.
condition = submit['pred'] < submit['amount']
selected_rows = submit[condition]

# For those rows, take the average of the prediction and the historical mean.
mean_values = selected_rows[['pred', 'amount']].mean(axis=1)
# mean_values = selected_rows[['amount']]

# Write the blended value into both columns, one column at a time, so the
# result does not depend on how pandas broadcasts a Series across two columns.
submit.loc[condition, 'pred'] = mean_values
submit.loc[condition, 'amount'] = mean_values

In [38]:
# Display the post-processed bid values.
submit

Unnamed: 0,pred,hour,amount
0,0.0,1,0.0
1,0.0,2,0.0
2,0.0,3,0.0
3,0.0,4,0.0
4,0.0,5,0.0
5,0.066171,6,0.066171
6,1.044402,7,1.044402
7,5.649329,8,5.649329
8,17.160381,9,17.160381
9,27.750857,10,27.750857


In [27]:
# Disabled: submit the values in submit['pred'] to the competition bids endpoint.
# The API key that used to be pasted in this comment has been redacted; reuse
# the API_KEY variable from the API call cell and never commit a token to the notebook.
# # import requests
# import json

# amounts = list(submit['pred'])
# # API_KEY = "<redacted>"
# success = requests.post(f'https://research-api.solarkim.com/cmpt-2023/bids', data=json.dumps(amounts), headers={
#                             'Authorization': f'Bearer {API_KEY}'
#                         }).json()
# print(success)

# test data로 검증

In [28]:
# Load the workbook of generation values used to check the predictions.
api_gens = pd.read_excel(path.format('api_gens.xlsx'))

In [29]:
# Rows of the actuals table whose 'time' equals the requested date.
is_target_date = api_gens['time'] == date
date_gens = api_gens.loc[is_target_date].reset_index(drop=True)

# Column assignment aligns on the 0..n-1 row index, i.e. by position rather than
# by hour: row i receives the i-th row of submit, whose hour labels start at 1.
date_gens['pred'] = submit['pred']

In [30]:
# Error = actual amount minus the prediction taken from the previous row.
# shift(1) moves every prediction down one row, so row 0 has no prediction and
# its error is NaN.
# NOTE(review): the shift looks like a positional fix for 'pred' being ordered by
# hour 1..23,0 while these rows start at hour 0 — a join on 'hour' would be less
# fragile; confirm which prediction hour 0 should use.
date_gens['error'] = date_gens['amount'] - date_gens['pred'].shift(1)

In [31]:
# Display actuals, attached predictions and the resulting error for the selected date.
date_gens

Unnamed: 0,time,hour,amount,pred,error
0,2023-11-02,0,0,0.0,
1,2023-11-02,1,0,0.0,0.0
2,2023-11-02,2,0,0.0,0.0
3,2023-11-02,3,0,0.0,0.0
4,2023-11-02,4,0,0.0,0.0
5,2023-11-02,5,0,0.066171,0.0
6,2023-11-02,6,0,1.212263,-0.066171
7,2023-11-02,7,0,6.90866,-1.212263
8,2023-11-02,8,6,19.336714,-0.90866
9,2023-11-02,9,34,39.981407,14.663286
