In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# Standard library
import pickle

# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Modelling and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Input,Dense,Dropout,BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.multioutput import ClassifierChain
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be read through the normal file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project directory on the mounted Drive; file names are appended to this
# prefix when loading data, so the trailing slash is required.
path='/content/drive/MyDrive/KT-BigProject/'

In [None]:
# Read the dataset from the project directory and discard every row that
# contains at least one missing value.
df = pd.read_csv(path + 'data_hee.csv').dropna()

In [None]:
# Preprocessing
# Add 'gu' and 'ro' columns from the first two space-separated tokens of Address.
df[['gu', 'ro']] = df['Address'].str.split(' ', expand=True).iloc[:, :2]
# Convert Supply_type to an integer by extracting its first numeric token.
# The pattern is a raw string: '\d' inside a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df['Supply_type'] = df['Supply_type'].str.extract(r'(\d+\.?\d*)').astype(float).astype(int)

# Keep these two columns aside so they can be written back unchanged after the
# one-hot encoding step below.
cutline_rate = df['Cutline_rate']
supply_type = df['Supply_type']

# Drop the columns that are not used as model features.
df = df.drop(
    columns=[
        'Name', 'Address', 'Latitude', 'Longitude', 'ro', 'Counts_daiso',
        'Counts_supermarket', 'Counts_laundry', 'Counts_pharmacy',
        'Counts_cafe'
    ]
)

# One-hot encode the remaining non-numeric columns.
df = pd.get_dummies(df)

df['Cutline_rate'] = cutline_rate
df['Supply_type'] = supply_type


# Split the full frame into (train + valid) = 70% and test = 30%,
# stratified on Cutline_rate.
df_train_valid, df_test = train_test_split(df, test_size=0.3,
                                           stratify=df['Cutline_rate'],
                                           random_state=42)

# Split that 70% again into train and valid at 7:3
# (roughly 49% / 21% of the full data).
df_train, df_valid = train_test_split(df_train_valid,
                                      test_size=0.3,
                                      stratify=df_train_valid['Cutline_rate'],
                                      random_state=42)

# Take explicit copies so that adding the Qty column below writes to
# independent frames instead of slices of df (avoids SettingWithCopyWarning).
df_valid = df_valid.copy()
df_test = df_test.copy()

# Apply SMOTE to the training split only, resampling on the Cutline_rate classes.
# NOTE(review): SMOTE interpolates every feature column, including the one-hot
# columns and Cutline_score, so synthetic rows may hold fractional values there
# - confirm this is acceptable for the model.
x = df_train.drop(columns='Cutline_rate')
y = df_train['Cutline_rate']

smote = SMOTE(sampling_strategy='auto', random_state=42)
x, y = smote.fit_resample(x, y)

df_train = pd.DataFrame(x, columns=x.columns).copy()
df_train['Cutline_rate'] = y

# Qty = (3 - Cutline_rate) * 11 + Cutline_score
# The rank contributes an offset of 0 (rank 3), 11 (rank 2) or 22 (rank 1) and
# the bonus score is added on top. Assuming Cutline_score lies in 0..10, the
# bands are 0-10, 11-21 and 22-32 - TODO confirm the score range.

qty1 = (3 - df_train['Cutline_rate']) * 11 + df_train['Cutline_score']
qty2 = (3 - df_valid['Cutline_rate']) * 11 + df_valid['Cutline_score']
qty3 = (3 - df_test['Cutline_rate']) * 11 + df_test['Cutline_score']
df_train['Qty'] = qty1
df_valid['Qty'] = qty2
df_test['Qty'] = qty3

# Qty is the target. Cutline_rate and Cutline_score were only kept to compute
# it, so they are removed from the features together with Qty itself.
cols = ['Cutline_rate', 'Cutline_score', 'Qty']

x_train = df_train.drop(columns=cols)
y_train = df_train['Qty']

x_valid = df_valid.drop(columns=cols)
y_valid = df_valid['Qty']

x_test = df_test.drop(columns=cols)
y_test = df_test['Qty']

In [None]:
# Parameter grid explored by the grid search (2 * 3 * 3 * 3 * 3 * 3 = 486 combinations).
param_grid = {
    'iterations': [500, 1000],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.03, 0.1],
    'l2_leaf_reg': [3, 10, 20],
    'bagging_temperature': [1, 2, 3],
    'random_strength': [1, 3, 5]
}

# Keep the base model as simple as possible.
# Per-iteration training logs are switched off: with this many combinations
# they flood the cell output, and the grid search still prints its own
# progress line for each combination.
regressor = CatBoostRegressor(
    loss_function='RMSE',
    verbose=False,
    task_type='CPU'
)

# Run CatBoost's built-in grid search on the training split only.
# With the library defaults (refit=True) the regressor is re-fitted with the
# best parameters afterwards, which is what the later cells rely on.
# NOTE(review): with the default search_by_train_test_split=True the
# combinations are compared on a single internal train/test split, and cv=5 is
# only used for the cross-validation statistics of the best parameters -
# confirm this is the intended selection scheme.
grid_search_result = regressor.grid_search(
    param_grid,
    X=x_train,
    y=y_train,
    cv=5,
    partition_random_seed=42,
    shuffle=True
)


# grid_search_result is a dict holding the best parameters ('params') and the
# cross-validation curves ('cv_results').
# Best parameters from a previous run: {'bagging_temperature': 1, 'random_strength': 1, 'depth': 4, 'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500}
print("최적의 파라미터:", grid_search_result['params'])
# Report one number - the lowest mean test RMSE along the CV curve - instead
# of dumping the whole cv_results dict.
print("최적 점수:", min(grid_search_result['cv_results']['test-RMSE-mean']))

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m

120:	loss: 3.2210730	best: 2.9280192 (2)	total: 5m 56s	remaining: 17m 55s
0:	learn: 18.7180097	test: 18.5502166	best: 18.5502166 (0)	total: 10.8ms	remaining: 10.8s
100:	learn: 4.4341785	test: 4.5542429	best: 4.5542429 (100)	total: 1.12s	remaining: 10s
200:	learn: 2.8255637	test: 3.4273974	best: 3.4273974 (200)	total: 2.25s	remaining: 8.96s
300:	learn: 2.2311127	test: 3.2142943	best: 3.2142943 (300)	total: 4.08s	remaining: 9.47s
400:	learn: 1.9130438	test: 3.1822051	best: 3.1821044 (394)	total: 6.06s	remaining: 9.05s
500:	learn: 1.6680090	test: 3.1485737	best: 3.1485737 (500)	total: 7.89s	remaining: 7.86s
600:	learn: 1.4849803	test: 3.1369642	best: 3.1369642 (600)	total: 9.02s	remaining: 5.99s
700:	learn: 1.3576253	test: 3.1341995	best: 3.1334263 (683)	total: 10.1s	remaining: 4.33s
800:	learn: 1.2388329	test: 3.1246915	best: 3.1246888 (761)	total: 11.2s	remaining: 2.78s
900:	learn: 1.1394140	test: 3.1238650	best: 3.1210939 (843)	total: 

In [None]:
# Save the tuned model. '.cbm' is CatBoost's own binary format.
regressor.save_model('best2.cbm')
# save_model() writes a CatBoost format whatever the file extension is, so a
# real pickle file has to be produced with pickle.dump instead.
# Only unpickle this file from a trusted location.
with open('best2.pkl', 'wb') as pkl_file:
    pickle.dump(regressor, pkl_file)

# Load the model back from the native file to check that the saved artefact works.
model=CatBoostRegressor()
model.load_model('best2.cbm')

# Predict on the held-out test split; predictions are rounded to integers
# before the metrics are computed.
y_pred=model.predict(x_test)
y_score_pred=y_pred.round().astype(int)
print('Score MAE:',mean_absolute_error(y_test,y_score_pred))
print('Score R2:',r2_score(y_test,y_score_pred))

Score MAE: 3.209125475285171
Score R2: 0.5350079846537652


In [None]:
# Validation: show the rounded predictions next to the true Qty values.
df_pred=pd.DataFrame(y_score_pred)
# Align both columns on a fresh 0..n-1 index. reset_index returns a new
# Series here, so y_test from the earlier cell is left untouched and this
# cell can be re-run safely.
df_compare=pd.concat([df_pred,y_test.reset_index(drop=True)],axis=1)
df_compare=df_compare.rename(columns={0:'Qty_pred'})
df_compare

Unnamed: 0,Qty_pred,Qty
0,16,23.0
1,20,16.0
2,26,30.0
3,30,30.0
4,18,18.0
...,...,...
258,29,28.0
259,29,25.0
260,24,24.0
261,21,19.0
