In [85]:
import os
import gc
import time

import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from cachetools import cached, LRUCache

import warnings
# Notebook-wide settings: silence all Python warnings, show every DataFrame
# column, and render floats with two decimals.
warnings.filterwarnings(action='ignore')
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)


In [86]:

# Load the training table from the CSV file in the working directory.
train = pd.read_csv(filepath_or_buffer="./train_all.csv")

In [87]:
# Inspect the column names of the loaded table.
train.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion',
       'transactions', 'is_holiday', 'city', 'state', 'type', 'cluster',
       'dcoilwtico_interpolated', 'weekday', 'year', 'month', 'day',
       'Rolling_Std_Sales_3d', 'Rolling_Mean_Sales_7d', 'Rolling_Std_Sales_7d',
       'Rolling_Mean_Sales_14d', 'Rolling_Std_Sales_14d'],
      dtype='object')

In [88]:
# Summarize the dtype and non-null count of every column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280373 entries, 0 to 1280372
Data columns (total 22 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id                       1280373 non-null  int64  
 1   date                     1280373 non-null  object 
 2   store_nbr                1280373 non-null  int64  
 3   family                   1280373 non-null  object 
 4   sales                    1280373 non-null  float64
 5   onpromotion              1280373 non-null  int64  
 6   transactions             1280373 non-null  float64
 7   is_holiday               1280373 non-null  int64  
 8   city                     1280373 non-null  object 
 9   state                    1280373 non-null  object 
 10  type                     1280373 non-null  object 
 11  cluster                  1280373 non-null  int64  
 12  dcoilwtico_interpolated  1280373 non-null  float64
 13  weekday                  1280373 non-null 

In [89]:
# ▶ Remove the columns that are not needed for training
X = train.drop(columns=['id', 'date', 'city', 'state'])
X.head()

Unnamed: 0,store_nbr,family,sales,onpromotion,transactions,is_holiday,type,cluster,dcoilwtico_interpolated,weekday,year,month,day,Rolling_Std_Sales_3d,Rolling_Mean_Sales_7d,Rolling_Std_Sales_7d,Rolling_Mean_Sales_14d,Rolling_Std_Sales_14d
0,25,OTHERS,24.0,0,770.0,1,D,1,93.14,2,2013,1,1,555.79,24.0,555.79,24.0,555.79
1,25,BEVERAGES,810.0,0,770.0,1,D,1,93.14,2,2013,1,1,555.79,417.0,555.79,417.0,555.79
2,25,BREAD/BAKERY,180.59,0,770.0,1,D,1,93.14,2,2013,1,1,416.03,338.2,416.03,338.2,416.03
3,25,CLEANING,186.0,0,770.0,1,D,1,93.14,2,2013,1,1,361.84,300.15,348.1,300.15,348.1
4,25,DAIRY,143.0,0,770.0,1,D,1,93.14,2,2013,1,1,23.42,268.72,309.55,268.72,309.55


In [90]:
# ▶ Split the feature columns into numerical and categorical (object-dtype) lists.
# The dtype is read from X itself rather than from `train`, so the cell keeps
# working when X holds columns that do not exist in `train` (for example when
# it is re-run after one-hot encoding has replaced X).
categorical_list = [name for name, dtype in X.dtypes.items() if dtype == 'O']  # 'O' : object dtype
numerical_list = [name for name, dtype in X.dtypes.items() if dtype != 'O']

print("categorical_list :", categorical_list)
print("numerical_list :", numerical_list)

categorical_list : ['family', 'type']
numerical_list : ['store_nbr', 'sales', 'onpromotion', 'transactions', 'is_holiday', 'cluster', 'dcoilwtico_interpolated', 'weekday', 'year', 'month', 'day', 'Rolling_Std_Sales_3d', 'Rolling_Mean_Sales_7d', 'Rolling_Std_Sales_7d', 'Rolling_Mean_Sales_14d', 'Rolling_Std_Sales_14d']


In [91]:
# Pre-processing for feature selection: one-hot encode the listed columns,
# dropping the first level of each.
X = pd.get_dummies(
    X,
    columns=['store_nbr', 'family', 'type', 'cluster', 'weekday', 'month'],
    drop_first=True,
)

# Dtype optimization: downcast 64-bit floats and integers to 32 bits in one pass.
float_cols = X.select_dtypes(include=['float64']).columns
int_cols = X.select_dtypes(include=['int64']).columns
X = X.astype({**dict.fromkeys(float_cols, 'float32'),
              **dict.fromkeys(int_cols, 'int32')})

In [92]:
# ▶ Separate the features (X) from the target (Y) and build the train/test split
from sklearn.model_selection import train_test_split

# errors='ignore' keeps this cell re-runnable: once 'sales' has been removed
# from X, running the cell a second time would otherwise raise KeyError.
X = X.drop(columns=['sales'], errors='ignore')
Y = train['sales']

# NOTE(review): the table has a 'date' column and rolling sales statistics, yet
# the split below is random; consider a time-ordered split so the model is not
# evaluated on days that precede its training data.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(1024298, 116)
(1024298,)
(256075, 116)
(256075,)


In [93]:
# Show the first row of the training and test feature splits.
x_train.iloc[0,:], x_test.iloc[0,:]

(onpromotion                     3
 transactions              5035.00
 is_holiday                      0
 dcoilwtico_interpolated     58.79
 year                         2015
                             ...  
 month_8                     False
 month_9                     False
 month_10                    False
 month_11                    False
 month_12                    False
 Name: 612143, Length: 116, dtype: object,
 onpromotion                     0
 transactions              1148.00
 is_holiday                      0
 dcoilwtico_interpolated    106.61
 year                         2013
                             ...  
 month_8                      True
 month_9                     False
 month_10                    False
 month_11                    False
 month_12                    False
 Name: 141340, Length: 116, dtype: object)

In [94]:
# ▶ Standardization (zero mean, unit variance per feature) — as opposed to 0~1 normalization
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train_sc = scaler.transform(x_train)
x_test_sc = scaler.transform(x_test)

In [69]:
# NOTE(review): this repeats the earlier `train.columns` inspection. The cell's
# execution count (69) is out of sequence and its saved output lacks 'city' and
# 'state', so that output comes from a different kernel state — re-run or
# delete this cell.
train.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion',
       'transactions', 'is_holiday', 'type', 'cluster',
       'dcoilwtico_interpolated', 'weekday', 'year', 'month', 'day',
       'Rolling_Std_Sales_3d', 'Rolling_Mean_Sales_7d', 'Rolling_Std_Sales_7d',
       'Rolling_Mean_Sales_14d', 'Rolling_Std_Sales_14d'],
      dtype='object')

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error


# Build the XGBoost DMatrix objects from the standardized splits
dtrain = xgb.DMatrix(x_train_sc, label=y_train)
dtest = xgb.DMatrix(x_test_sc, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 6,
    'colsample_bytree': 0.8,
    'subsample': 0.8
}

# Train the XGBoost model, logging RMSE on both sets every round.
# NOTE(review): early stopping watches the last entry of `evals`, which is the
# same hold-out set that is scored afterwards, so the reported error is
# slightly optimistic; a separate validation split would be cleaner.
evals = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=100, early_stopping_rounds=50, evals=evals)

# Predict with the trees up to and including the best early-stopping round.
# The range is passed explicitly because older xgboost releases otherwise
# predict with every tree, including those grown after the best round.
y_pred_xgb = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))



# RMSLE helper
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of y_pred against y_true.

    Negative entries in either input are replaced by 0 before scoring.
    """
    floored_true, floored_pred = (np.maximum(values, 0) for values in (y_true, y_pred))
    msle = mean_squared_log_error(floored_true, floored_pred)
    return np.sqrt(msle)

# Score the XGBoost predictions on the hold-out split
rmsle_xgb = rmsle(y_true=y_test, y_pred=y_pred_xgb)
print(f'XGBoost RMSLE: {rmsle_xgb}')


[0]	train-rmse:1440.94577	eval-rmse:1440.59457
[1]	train-rmse:1331.82075	eval-rmse:1332.54196
[2]	train-rmse:1237.92639	eval-rmse:1239.39896
[3]	train-rmse:1156.32749	eval-rmse:1158.77589
[4]	train-rmse:1086.85869	eval-rmse:1090.22901
[5]	train-rmse:1022.25291	eval-rmse:1026.96557
[6]	train-rmse:971.06883	eval-rmse:976.17853
[7]	train-rmse:915.21479	eval-rmse:921.35134
[8]	train-rmse:871.22925	eval-rmse:878.52342
[9]	train-rmse:832.81587	eval-rmse:840.84171
[10]	train-rmse:797.51756	eval-rmse:805.71764
[11]	train-rmse:762.36387	eval-rmse:771.19503
[12]	train-rmse:732.26635	eval-rmse:741.64027
[13]	train-rmse:705.04019	eval-rmse:715.05024
[14]	train-rmse:687.12420	eval-rmse:697.06005
[15]	train-rmse:667.31719	eval-rmse:677.41297
[16]	train-rmse:646.70473	eval-rmse:657.15079
[17]	train-rmse:632.55023	eval-rmse:643.25393
[18]	train-rmse:622.64959	eval-rmse:633.89887
[19]	train-rmse:607.63769	eval-rmse:619.30608
[20]	train-rmse:592.91586	eval-rmse:605.06016
[21]	train-rmse:584.19063	eval-r

In [98]:
from sklearn.ensemble import RandomForestRegressor 

# Fit the forest on all available CPU cores. n_jobs only parallelizes the work;
# with the fixed random_state the same trees are grown as in a serial fit.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(x_train_sc, y_train)

# Predict on the hold-out split
y_pred_rf = rf.predict(x_test_sc)

# RMSLE helper
# NOTE(review): this duplicates the rmsle function defined in the XGBoost cell;
# define it once, or keep the two copies in sync.
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of y_pred against y_true.

    Negative entries in either input are replaced by 0 before scoring.
    """
    floored_true, floored_pred = (np.maximum(values, 0) for values in (y_true, y_pred))
    msle = mean_squared_log_error(floored_true, floored_pred)
    return np.sqrt(msle)

# Score the Random Forest predictions on the hold-out split
rmsle_rf = rmsle(y_true=y_test, y_pred=y_pred_rf)
print(f'Random Forest RMSLE: {rmsle_rf}')

In [11]:
# ▶ BayesianOptimization 설치
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl.metadata (543 bytes)
Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.4.3


In [None]:
# ▶ Bayesian optimization of the Random Forest hyper-parameters
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


def model_evaluate(n_estimators, maxDepth, X=None, y=None):
    """Return the mean 5-fold cross-validated score of a Random Forest regressor.

    The score is the negative RMSE, so larger is better, which is what
    BayesianOptimization.maximize expects.

    Parameters
    ----------
    n_estimators, maxDepth : float
        Values proposed by the optimizer; truncated to int before use.
    X, y : array-like, optional
        Training features and target. When omitted, the notebook-level
        x_train / y_train are used.
    """
    if X is None:
        X = x_train
    if y is None:
        y = y_train

    # The target (sales) is continuous, so a regressor with a regression
    # metric is required.
    reg = RandomForestRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(maxDepth),
        n_jobs=-1,
        random_state=42)
    scores = cross_val_score(reg, X, y, cv=5, scoring='neg_root_mean_squared_error')
    return np.mean(scores)


def bayesOpt(x_train, y_train):
    """Search n_estimators and maxDepth with Bayesian optimization and print every trial.

    The data passed in is the data that gets cross-validated.
    """
    def objective(n_estimators, maxDepth):
        # Bind the supplied data so the optimizer only sees the tunable parameters.
        return model_evaluate(n_estimators, maxDepth, X=x_train, y=y_train)

    clfBO = BayesianOptimization(
        f=objective,
        pbounds={'n_estimators': (100, 200),
                 'maxDepth': (2, 4)},
        random_state=42)
    # 5 random starting points followed by 10 guided iterations; every
    # evaluation runs a full 5-fold cross-validation.
    clfBO.maximize(init_points=5, n_iter=10)
    print(clfBO.res)

# x_train_sc is the standardized training matrix produced by the scaling cell.
bayesOpt(x_train_sc, y_train)