## Import

In [2]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings(action='ignore') 

## Fix the Random Seed

In [3]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Exports ``PYTHONHASHSEED`` into the process environment and seeds both
    Python's ``random`` module and NumPy's global generator with ``seed``.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE: the environment variable is read at interpreter start-up, so it
    # only influences Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)  # fix the seed for the whole notebook

## Load Data

In [4]:
# Directory holding the competition CSV files. The original machine-specific
# location stays the default, but it can be overridden without editing the
# notebook by setting the DACON_DATA_DIR environment variable.
DATA_DIR = os.environ.get('DACON_DATA_DIR', r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량')

# Load the raw training and test tables.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

## Train Data Pre-Processing

In [5]:
# Replace every missing value in the training frame with 0.
train_df = train_df.fillna(value=0)

In [6]:
# To expose the time-series structure to the model, split the '일시' string
# into integer month, day and time-of-day columns. The same character
# positions are used for both the training and the test frame.
_DATETIME_SLICES = {'month': slice(4, 6), 'day': slice(6, 8), 'time': slice(9, 11)}

for frame in (train_df, test_df):
    for column, span in _DATETIME_SLICES.items():
        # Bind `span` as a default argument so each lambda keeps its own slice.
        frame[column] = frame['일시'].apply(lambda stamp, span=span: int(stamp[span]))

In [7]:
# Separate the target from the predictors. Besides the target itself, the row
# identifier, the raw timestamp string and the two sunshine / solar-radiation
# columns are excluded from the training features
# (presumably because the test set does not provide them - TODO confirm).
target_column = '전력소비량(kWh)'
non_feature_columns = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', target_column]

y_train = train_df[target_column]
X_train = train_df.drop(columns=non_feature_columns)

# The test frame only needs the identifier and timestamp removed (done in place).
test_df.drop(columns=['num_date_time', '일시'], inplace=True)

In [9]:
# Standardize the features: the scaler's statistics are learned from the
# training features only and then reused unchanged for the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, test_df))

In [10]:
# Hold out 20% of the scaled training rows as a validation set
# (fixed random_state so the split is repeatable).
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Hyper-parameter grid explored by the grid search
# (3 * 4 * 4 * 4 * 3 * 2 = 1152 candidate combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [15, 20, 25, 30],
    'min_samples_split': [2, 5, 7, 10],
    'min_samples_leaf': [1, 2, 4, 7],
    # 1.0 ("consider all features at every split") replaces the former 'auto'.
    # For RandomForestRegressor 'auto' meant exactly that, but the string was
    # deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every
    # fit using it fail. The float works on old and new versions alike.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False],
}

In [19]:
# Base estimator for the grid search. The seed is passed explicitly: the global
# NumPy seed set earlier lives in this process only and is not carried over to
# the worker processes a parallel (n_jobs=-1) search fits in, so without
# random_state the fitted forests would differ from run to run.
rf_model = RandomForestRegressor(random_state=42)

In [22]:
# Exhaustive search over `param_grid` with 3-fold cross-validation on the
# training split, using all available cores.
search_options = dict(
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_root_mean_squared_error',
)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, **search_options)
grid_search.fit(X_train_split, y_train_split)

# The scorer is the *negated* RMSE, so the reported best score is negative
# and values closer to zero are better.
for label, value in (('Best HP:', grid_search.best_params_),
                     ('Best Score:', grid_search.best_score_)):
    print(label, value)

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits
Best HP: {'bootstrap': False, 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: -527.6035698250012


In [23]:
# Score the best model from the search on the held-out validation split.
best_model = grid_search.best_estimator_
y_val_pred_best = best_model.predict(X_val_split)

# RMSE = square root of the mean squared error.
val_mse_best = mean_squared_error(y_val_split, y_val_pred_best)
val_rmse_best = np.sqrt(val_mse_best)
print('validation RMSE with Best Model :', val_rmse_best)

validation RMSE with Best Model : 479.72034955722813


In [25]:
# Predict the scaled test features with the best estimator found by the search.
# NOTE: that estimator was fitted on the training split passed to
# grid_search.fit, i.e. without the validation rows.
test_model = grid_search.best_estimator_
y_test_pred_best = test_model.predict(X_test_scaled)

## Submission

In [29]:
# Directory holding the competition CSV files. The original machine-specific
# location stays the default, but it can be overridden without editing the
# notebook by setting the DACON_DATA_DIR environment variable.
DATA_DIR = os.environ.get('DACON_DATA_DIR', r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량')

# Load the submission template; its 'answer' column is filled in further below.
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [30]:
# Write the test-set predictions into the 'answer' column of the template.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as the rows of test.csv, so prediction i belongs to row i - verify.
submission['answer'] = y_test_pred_best
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2090.14170
1,1_20220825 01,1937.37190
2,1_20220825 02,1964.78195
3,1_20220825 03,1750.57395
4,1_20220825 04,1576.93415
...,...,...
16795,100_20220831 19,965.68695
16796,100_20220831 20,943.96350
16797,100_20220831 21,844.57410
16798,100_20220831 22,768.02550


In [31]:
# Directory the submission file is written to. The original machine-specific
# location stays the default, but it can be overridden without editing the
# notebook by setting the DACON_DATA_DIR environment variable.
DATA_DIR = os.environ.get('DACON_DATA_DIR', r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량')

# Save the filled-in submission without the DataFrame index column.
submission.to_csv(os.path.join(DATA_DIR, 'rf0811-3.csv'), index=False)