In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### pip install

In [None]:
!pip uninstall sklearn -y
!pip install --upgrade sklearn
!pip install scikit-learn==0.23.2 --user
# 설치 후 런타임 다시시작

In [None]:
import sklearn
sklearn.__version__ # '0.23.2'

!pip install pycaret
from pycaret.utils import enable_colab
enable_colab()

!pip install markupsafe==2.0.1
# 설치 후 런타임 다시시작

In [None]:
import jinja2
from pycaret.regression import *

#!pip install optuna
#import optuna 
#from optuna import Trial, visualization
#from optuna.samplers import TPESampler

!pip install catboost

#### import

In [4]:
# 함수
import numpy as np
import pandas as pd

# 전처리 
import datetime as dt

# 결측치
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer 

# 모델
import joblib
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

# 경고
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning) #경고메세지 끄기
warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning 제거
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# 성능 평가
from sklearn.metrics import mean_squared_error
def RMSE(y_pred, y):
    return mean_squared_error(y_pred, y)**0.5

# 데이터
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv("data/sample_submission.csv")

#### 데이터 전처리

In [5]:
# 날짜 전처리
def date_conv(df):
    df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%Y") # object -> datetime
    df['Year'] = df['Date'].dt.year # 연
    df['Month'] =df['Date'].dt.month # 월
    df['Day'] =df['Date'].dt.day # 일
    df['Week'] = df['Date'].dt.isocalendar().week # 요일
    df = df.drop(columns=['Date']) # Date 삭제
    return df
    
train = date_conv(train)
test = date_conv(test)
train.shape

(6255, 16)

In [6]:
# IsHoliday변수의 타입변환(Bool -> int)
def num_holiday(bool_holiday):
    if bool_holiday == True:
        number = 1
    else:
        number = 0
    return number

def change_holiday(df):
    df['IsHoliday'] = df['IsHoliday'].apply(num_holiday)
    return df

train = change_holiday(train)
test = change_holiday(test)
train.shape

(6255, 16)

#### 데이터 결측치 

In [7]:
# 결측치 처리(안해도 된다. 왜냐하면 모델에선 사용안함)
train_copy = train.copy()
test_copy = test.copy()

def null(df):
    df = IterativeImputer(random_state=2021).fit_transform(df)
    df = pd.DataFrame(df)
    if df.shape[1] == train_copy.shape[1]:
        df.columns = train_copy.columns
    else:
        df.columns = test_copy.columns 
    return df

train = null(train)
test = null(test)
train.shape

(6255, 16)

In [8]:
# int형 전환(float -> int)
def change_int(df):
    null(df)
    df['Store'] = df['Store'].apply(lambda x: int(x))
    df['Year'] = df['Year'].apply(lambda x: int(x))
    df['Month'] =df['Month'].apply(lambda x: int(x))
    df['Day'] =df['Day'].apply(lambda x: int(x))
    df['Week'] =df['Week'].apply(lambda x: int(x))
    return df
    
# 최종 데이터셋   
train = change_int(train)
test = change_int(test)
train

Unnamed: 0,id,Store,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales,Year,Month,Day,Week
0,1.0,1,42.31,2.572,10187.176756,4080.964301,2998.807907,4638.248813,6199.244732,8.106,0.0,1643690.90,2010,2,5,5
1,2.0,1,38.51,2.548,10179.649970,4078.265127,2993.189216,4633.968812,6193.943659,8.106,1.0,1641957.44,2010,2,12,6
2,3.0,1,39.93,2.514,10077.798248,4031.822685,2888.161209,4576.087318,6102.557866,8.106,0.0,1611968.17,2010,2,19,7
3,4.0,1,46.63,2.561,9400.329944,3718.698348,2177.621841,4192.943359,5486.450234,8.106,0.0,1409727.59,2010,2,26,8
4,5.0,1,46.50,2.625,9888.124397,3943.320154,2686.956948,4468.823642,5928.534937,8.106,0.0,1554806.68,2010,3,5,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,6251.0,45,75.09,3.867,23641.300000,6.000000,92.930000,6988.310000,3992.130000,8.684,0.0,734297.87,2012,8,31,35
6251,6252.0,45,75.70,3.911,11024.450000,12.800000,52.630000,1854.770000,2055.700000,8.684,1.0,766512.66,2012,9,7,36
6252,6253.0,45,67.87,3.948,11407.950000,2568.572836,4.300000,3421.720000,5268.920000,8.684,0.0,702238.27,2012,9,14,37
6253,6254.0,45,65.32,4.038,8452.200000,92.280000,63.240000,2376.380000,8670.400000,8.684,0.0,723086.20,2012,9,21,38


#### 모델 및 학습

In [None]:
# 런타임 40분
# 모델링 
cat = CatBoostRegressor()

# 매개변수
params = {'n_estimators': [200, 500, 1000, 2000], 
          'learning_rate': [0.1, 0.05, 0.01], 
          'max_depth': [5, 6, 7, 8], 
          'subsample': [0.6, 0.8, 0.9, 1.0]}

# 예측시 사용할 변수들
features = ['Year', 'Month', 'Day', 'IsHoliday']

# 모델, 성능(error) 저장
models = []
models.append(0) # train['Store']==0은 없으므로 미리 0을 추가
error = []
for store in range(1, 46):
    train_store = train[train['Store']==store]
    
    # 연도별 데이터 분해
    train2010 = train_store[(train_store['Year']==2010) & (train_store['Month']<=9)]
    train2011 = train_store[(train_store['Year']==2011) & (train_store['Month']<=9)]
    train2012 = train_store[(train_store['Year']==2012) & (train_store['Month']<9)]
    
    # 3가지(2010+2012, 2011+2012, 2010+2011+2012) 데이터프레임 생성
    train0 = pd.concat([train2010, train2012])
    train1 = pd.concat([train2011, train2012])
    train2 = pd.concat([train2010, train2011, train2012])
    
    # 학습
    model0 = cat
    model0.fit(train0[features], train0['Weekly_Sales'])
    
    model1 = cat
    model1.fit(train1[features], train1['Weekly_Sales'])
    
    model2 = cat
    model2.fit(train2[features], train2['Weekly_Sales'])
    
    # 예측
    y_val = train_store[(train_store['Year']==2012) & (train_store['Month']==9)]['Weekly_Sales']
    X_val = train_store[(train_store['Year']==2012) & (train_store['Month']==9)][features]

    pred0 = model0.predict(X_val)
    pred1 = model1.predict(X_val)
    pred2 = model2.predict(X_val)
    
    # RMSE로 예측 평가
    rmse0 = RMSE(pred0, y_val)
    rmse1 = RMSE(pred1, y_val)
    rmse2 = RMSE(pred2, y_val)
    
    # 3가지 RMSE 중 최소값을 이용하여 데이터프레임 추출
    if rmse0 < rmse1 and rmse0 < rmse2:
        error.append(rmse0)
        train_part = train_store[(train_store['Year']==2010) & (train_store['Month']<=10)]
    elif rmse1 < rmse2: 
        error.append(rmse1)
        train_part = train_store[(train_store['Year']==2011) & (train_store['Month']<=10)]
    else:
        error.append(rmse2)
        train_part = pd.concat([train_store[(train_store['Year']==2010) & (train_store['Month']<=10)],
                             train_store[(train_store['Year']==2011) & (train_store['Month']<=10)]])
          
    # 최종 학습 데이터
    train_final = pd.concat([train_part, train_store[train_store['Year']==2012]])
    X_train = train_final[features]
    y_train = train_final['Weekly_Sales']
    
    # 학습 및 저장
    model = RandomizedSearchCV(cat, param_distributions = params, n_iter = 30, cv = 5, n_jobs = -1)
    model.fit(X_train, y_train)
    models.append(model)
    joblib.dump(models[store], open('store{0}.model'.format(store), 'wb'))

print('평균 rmse :', np.mean(error)) # 37028.95670182583

#### 예측

In [11]:
# 예측
pred = []
for store in range(1, 46):
    test_store = test[test['Store']==store]
    y = models[store].predict(test_store[features])
    pred += list(y)

sample_submission["Weekly_Sales"] = pred
sample_submission.to_csv('cat.csv',index = False)
sample_submission.head(10)

Unnamed: 0,id,Weekly_Sales
0,1,1666681.0
1,2,1564931.0
2,3,1527834.0
3,4,1471042.0
4,5,1924328.0
5,6,1837175.0
6,7,1873774.0
7,8,1818205.0
8,9,419402.1
9,10,410048.2
