In [None]:
# Install the third-party packages this notebook depends on.
# NOTE(review): category_encoders is not version-pinned, unlike the two packages
# below — consider pinning it as well so the environment is reproducible.
!pip install category_encoders
!pip install xgboost==1.7.2
!pip install scikit-learn==1.1.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

In [None]:
# Upload the CSV through the Colab file picker and load it into a DataFrame
import io

from google.colab import files

# files.upload() returns a mapping {uploaded file name: file content as bytes}.
# The frame is built from those bytes instead of re-opening 'final_data_2.csv'
# from disk: when a file with that name already exists, Colab stores the new
# upload under a different name (e.g. 'final_data_2 (1).csv'), so the on-disk
# 'final_data_2.csv' may be an older copy than the one just uploaded.
uploaded = files.upload()
if 'final_data_2.csv' in uploaded:
    data = pd.read_csv(io.BytesIO(uploaded['final_data_2.csv']))
else:
    # No file with the expected name was uploaded in this run: fall back to the
    # copy that is already on disk (the previous behavior).
    data = pd.read_csv('final_data_2.csv')

Saving final_data_2.csv to final_data_2 (1).csv


In [None]:
# Work on a copy so the loaded `data` frame itself is left untouched
df = data.copy()

**전처리까지 모두 마친 데이터이기 때문에 바로 데이터셋 분리 진행**




In [None]:
# Display the column labels of the working frame
df.columns

Index(['season', 'month', 'day', 'installed_capacity', 'full_power',
       'average_temperature', 'highest_temperature', 'lowest_temperature',
       'precipitation', 'average_wind_speed', 'max_wind_speed',
       'average_humidity', 'lowest_humidity'],
      dtype='object')

In [None]:
# Separate the target column from the feature columns
target = 'full_power'
features = df.columns.drop(target)  # every column label except the target

X, y = df[features], df[target]

In [None]:
# Split into train / val / test.
# A fixed random_state makes the split -- and therefore every score computed
# below -- reproducible across Restart & Run All; without it each run draws a
# different random split.
# test = 20% of all rows; val = 20% of the remaining rows (about 16% of all rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Sanity check: print the feature/target shapes of every split
for label, feats, tgt in (
    ('train set:', X_train, y_train),
    ('val set:', X_val, y_val),
    ('test set:', X_test, y_test),
):
    print(label, feats.shape, tgt.shape)

train set: (4361, 12) (4361,)
val set: (1091, 12) (1091,)
test set: (1364, 12) (1364,)


In [None]:
# Scaling: standardize every column whose dtype is not "object".
# The scaler is fitted on the training split only, then applied to all three splits.
col_dtypes = X_train.dtypes
numeric_feats = col_dtypes[col_dtypes != "object"].index

scaler = StandardScaler().fit(X_train[numeric_feats])
for split in (X_train, X_val, X_test):
    # Column assignment overwrites the split's own columns with the scaled values
    split[numeric_feats] = scaler.transform(split[numeric_feats])

In [None]:
# 인코딩
ohe = OneHotEncoder()

X_train_ohe = ohe.fit_transform(X_train)
X_val_ohe = ohe.transform(X_val)
X_test_ohe = ohe.transform(X_test)

In [None]:
# Baseline model: always predict the mean of the training target.
# It is scored on the validation split so that base_r2 / base_mae are directly
# comparable with the validation scores of the other models in the comparison
# table; scoring it on the training split would mix train-based and val-based
# numbers in the same table.
baseline = [y_train.mean()] * len(y_val)

# Baseline r2, mae (on the validation split)
base_r2 = r2_score(y_val, baseline)
base_mae = mean_absolute_error(y_val, baseline)

# Show baseline r2, mae
print('baseline r2 score : ', base_r2)
print('baseline mae score : ', base_mae)

baseline r2 score :  0.0
baseline mae score :  9164.591139354521


In [None]:
# Helper that prints train/validation scores for a fitted model
def print_score(model, X_train, y_train, X_val, y_val) :
    """Print R^2 and MAE of ``model`` on the training and validation data.

    The output layout matches the per-model cells of this notebook: the two
    training metrics first, a blank line, then the two validation metrics.

    Parameters
    ----------
    model : object exposing ``predict(X)``; it must already be fitted.
    X_train, y_train : training features and targets.
    X_val, y_val : validation features and targets.

    Returns
    -------
    None
        The scores are only printed, not returned.
    """
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    # Training metrics grouped together ...
    print('train r2 score : ', r2_score(y_train,pred_train))
    print('train MAE score : ', mean_absolute_error(y_train,pred_train))
    print('')
    # ... followed by the validation metrics.
    print('val r2 score : ', r2_score(y_val,pred_val))
    print('val MAE score : ', mean_absolute_error(y_val,pred_val))

    return None

### Multiple Linear Regression

In [None]:
# Fit a multiple linear regression on the encoded training data
multiple_ols = LinearRegression().fit(X_train_ohe, y_train)

# Predictions on the train and validation splits
pred_train, pred_val = (multiple_ols.predict(part) for part in (X_train_ohe, X_val_ohe))

# Validation r2 / mae are kept in named variables for the model comparison table
multiple_r2, multiple_mae = r2_score(y_val, pred_val), mean_absolute_error(y_val, pred_val)

# Report: train metrics, blank line, validation metrics
train_r2 = r2_score(y_train, pred_train)
train_mae = mean_absolute_error(y_train, pred_train)
print('train r2 score : ', train_r2)
print('train MAE score : ', train_mae)
print('')
print('val r2 score : ', multiple_r2)
print('val MAE score : ', multiple_mae)

train r2 score :  0.6164334214204479
train MAE score :  5665.965818385466

val r2 score :  0.6442611636714077
val MAE score :  5499.649969457047


### RidgeCV

In [None]:
# Fit RidgeCV: 5-fold cross-validation over the integer alphas 1..99
alphas = np.arange(1, 100)
ridge = RidgeCV(alphas=alphas, cv=5).fit(X_train_ohe, y_train)

# Alpha selected by the cross-validation
print("alpha: ", ridge.alpha_)
print()

# Predictions on the train and validation splits
pred_train, pred_val = (ridge.predict(part) for part in (X_train_ohe, X_val_ohe))

# Validation r2 / mae are kept in named variables for the model comparison table
ridge_r2, ridge_mae = r2_score(y_val, pred_val), mean_absolute_error(y_val, pred_val)

# Report: train metrics, blank line, validation metrics
train_r2 = r2_score(y_train, pred_train)
train_mae = mean_absolute_error(y_train, pred_train)
print('train r2 score : ', train_r2)
print('train MAE score : ', train_mae)
print('')
print('val r2 score : ', ridge_r2)
print('val MAE score : ', ridge_mae)

alpha:  13

train r2 score :  0.6163905872641631
train MAE score :  5668.15367957241

val r2 score :  0.6438456174602831
val MAE score :  5505.418256666469


### LassoCV

In [None]:
# Fit LassoCV: 5-fold cross-validation over the integer alphas 1..99.
# max_iter is raised above the library default because the recorded output of
# this cell shows repeated warnings raised from the coordinate-descent solver
# (presumably ConvergenceWarning: the solver hit its iteration limit before
# converging for some alphas/folds).
alphas = np.arange(1, 100, 1)
lasso = LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso.fit(X_train_ohe, y_train)

# Alpha selected by the cross-validation
print("alpha: ", lasso.alpha_)
print()

# lassocv r2, mae (validation values are reused in the model comparison table)
pred_train = lasso.predict(X_train_ohe)
pred_val = lasso.predict(X_val_ohe)
lasso_r2 = r2_score(y_val,pred_val)
lasso_mae = mean_absolute_error(y_val,pred_val)

# Report: train metrics, blank line, validation metrics
print('train r2 score : ', r2_score(y_train,pred_train))
print('train MAE score : ', mean_absolute_error(y_train,pred_train))
print('')
print('val r2 score : ', lasso_r2)
print('val MAE score : ', lasso_mae)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


alpha:  5

train r2 score :  0.6163673209201892
train MAE score :  5667.441700779584

val r2 score :  0.644050442960068
val MAE score :  5501.917280397582


### DecisionTreeRegressor

In [None]:
# Build the pipeline (only a minimal set of hyperparameters is adjusted, to avoid overfitting).
# random_state is fixed so that refitting the tree gives the same model and the
# same scores on every run.
pipe_dt = make_pipeline(
    OneHotEncoder(),
    DecisionTreeRegressor(max_depth=4, random_state=42)
)

# Fit (the pipeline applies its own encoder, so it takes the un-encoded X_train)
pipe_dt.fit(X_train, y_train)

# pipe_dt r2, mae (validation values are reused in the model comparison table)
pred_train = pipe_dt.predict(X_train)
pred_val = pipe_dt.predict(X_val)
pipe_dt_r2 = r2_score(y_val,pred_val)
pipe_dt_mae = mean_absolute_error(y_val,pred_val)

# Report: train metrics, blank line, validation metrics
print('train r2 score : ', r2_score(y_train,pred_train))
print('train MAE score : ', mean_absolute_error(y_train,pred_train))
print('')
print('val r2 score : ', pipe_dt_r2)
print('val MAE score : ', pipe_dt_mae)

train r2 score :  0.7347686048133429
train MAE score :  4560.720022011655

val r2 score :  0.7344038202150652
val MAE score :  4685.29153831597


### RandomForestRegressor

In [None]:
# Build the pipeline (only a minimal set of hyperparameters is adjusted, to avoid overfitting).
# random_state is fixed because a random forest is fitted with randomness
# (bootstrap samples / feature sampling); without a seed every run produces a
# different forest and different scores.
pipe_rf = make_pipeline(
    OneHotEncoder(),
    RandomForestRegressor(max_depth=3, random_state=42)
)

# Fit (the pipeline applies its own encoder, so it takes the un-encoded X_train)
pipe_rf.fit(X_train, y_train)

# pipe_rf r2, mae (validation values are reused in the model comparison table)
pred_train = pipe_rf.predict(X_train)
pred_val = pipe_rf.predict(X_val)
pipe_rf_r2 = r2_score(y_val,pred_val)
pipe_rf_mae = mean_absolute_error(y_val,pred_val)

# Report: train metrics, blank line, validation metrics
print('train r2 score : ', r2_score(y_train,pred_train))
print('train MAE score : ', mean_absolute_error(y_train,pred_train))
print('')
print('val r2 score : ', pipe_rf_r2)
print('val MAE score : ', pipe_rf_mae)

train r2 score :  0.7021992253463998
train MAE score :  4809.450902756084

val r2 score :  0.709619743582212
val MAE score :  4873.315293420683


### XGBRegressor

In [None]:
# Build the pipeline (only a minimal set of hyperparameters is adjusted, to avoid overfitting)
# and fit it; the pipeline applies its own encoder, so it takes the un-encoded X_train.
pipe_xgb = make_pipeline(OneHotEncoder(), XGBRegressor(eval_metric="mae", max_depth=2))
pipe_xgb.fit(X_train, y_train)

# Predictions on the train and validation splits
pred_train, pred_val = (pipe_xgb.predict(part) for part in (X_train, X_val))

# Validation r2 / mae are kept in named variables for the model comparison table
pipe_xgb_r2, pipe_xgb_mae = r2_score(y_val, pred_val), mean_absolute_error(y_val, pred_val)

# Report: train metrics, blank line, validation metrics
train_r2 = r2_score(y_train, pred_train)
train_mae = mean_absolute_error(y_train, pred_train)
print('train r2 score : ', train_r2)
print('train MAE score : ', train_mae)
print('')
print('val r2 score : ', pipe_xgb_r2)
print('val MAE score : ', pipe_xgb_mae)

train r2 score :  0.8113755206104004
train MAE score :  3953.891783170431

val r2 score :  0.7985397894123332
val MAE score :  4184.988044941568


**val score로 모델 비교**

In [None]:
# Collect the r2 / mae of every model into one table (one row per model)
model_names = ['base','Multiple','Ridge','lasso', 'DecisionTree', 'RandomForest', 'XGB']
comprison = pd.DataFrame(
    {
        'r2': [base_r2, multiple_r2, ridge_r2, lasso_r2, pipe_dt_r2, pipe_rf_r2, pipe_xgb_r2],
        'mae': [base_mae, multiple_mae, ridge_mae, lasso_mae, pipe_dt_mae, pipe_rf_mae, pipe_xgb_mae],
    },
    index=model_names,
)
comprison

Unnamed: 0,r2,mae
base,0.0,9164.591139
Multiple,0.644261,5499.649969
Ridge,0.643846,5505.418257
lasso,0.64405,5501.91728
DecisionTree,0.663827,5129.884111
RandomForest,0.70962,4873.315293
XGB,0.79854,4184.988045


### 최종 모델 ➡️ XGBRegressor