# Train Model code
Using CatBoost(Boosting) Algorithm

In [1]:
# For colab
from google.colab import drive
drive.mount('/content/Mydrive')
%cd "/content/Mydrive/MyDrive/Github/KT_Devchall"

Mounted at /content/Mydrive
/content/Mydrive/MyDrive/Github/KT_Devchall


In [2]:
%%capture
!pip install catboost

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [4]:
df_train = pd.read_pickle('data/train_no_ID_treat.pkl') 
# df_train = pd.read_pickle('data/train.pkl') # 7s

In [5]:
# Splitting Dataset

X = df_train.drop('Class', axis=1)
y = df_train['Class']

num_features = ['Ex_Rate','P1','P2','P3','P4']
cat_features = X.columns.drop(num_features).tolist()
cat_idx = [idx for idx, val in enumerate(X.columns) if val in cat_features]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.4, random_state = 123, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size = 0.5, random_state =123, stratify=y_val)

## Arbitrarily Hyperparameter setting 

In [None]:
# Model Setting

clf = CatBoostClassifier(
    iterations=5000, 
    learning_rate=0.05, 
    random_seed=123, 
    task_type='GPU',
    one_hot_max_size=3,
    loss_function = 'Logloss',
    od_type='IncToDec',
    od_pval=0.001
)

In [None]:
# Model fitting

clf.fit(
    X_train, y_train,
    cat_features=cat_idx,
    verbose=50,
    eval_set = (X_val, y_val)
)

In [None]:
clf.save_model("ID_nottreated_ohm_3_earlystopped.cbm")

In [None]:
clf.get_params()

{'iterations': 5000,
 'learning_rate': 0.05,
 'loss_function': 'Logloss',
 'od_pval': 0.0001,
 'od_type': 'IncToDec',
 'random_seed': 123,
 'one_hot_max_size': 3,
 'task_type': 'GPU'}

In [None]:
# Test data accuaracy
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

y_pred = clf.predict(X_test, prediction_type='Class')

In [None]:
# Best Model Loss and Metrics
best_score = clf.best_score_['validation']['Logloss']
print(f'Best Validation Loss : {round(best_score, 4)}')

Best Validation Loss : 0.2913


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names = ['Lose', 'Win']))

              precision    recall  f1-score   support

        Lose       0.87      0.94      0.90   1202206
         Win       0.81      0.68      0.74    502897

    accuracy                           0.86   1705103
   macro avg       0.84      0.81      0.82   1705103
weighted avg       0.86      0.86      0.86   1705103



# Best till Now
- params : {'iterations': 5000,
 'learning_rate': 0.05,
 'loss_function': 'Logloss',
 'od_pval': 0.0001,
 'od_type': 'IncToDec',
 'random_seed': 123,
 'one_hot_max_size': 3,
 'task_type': 'GPU'}

> val_score = 0.2913 \\
> acc_score = 0.86

: class=1인 데이터 f1-score low

## GridSearch

In [None]:
# Model Setting

clf_grid = CatBoostClassifier(
    iterations=1000, 
    random_seed=123, 
    task_type='GPU',
    loss_function = 'Logloss',
    verbose=100,
    one_hot_max_size=3
)

In [None]:
# grid
grid = {
      'learning_rate' : [0.01,0.1],
      'depth' : [4, 6, 8],
      'l2_leaf_reg' : [1,3,5]
}

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
# Tuning
grid_search_result = clf_grid.grid_search(grid, X=X_train, y=y_train)

In [14]:
grid_search_result['params']

{'depth': 8, 'l2_leaf_reg': 5, 'learning_rate': 0.1}

### gridsearch result
{'depth': 8, 'l2_leaf_reg': 5, 'learning_rate': 0.1}

In [18]:
clf_grid.fit(X_train, y_train,
    cat_features=cat_idx,
    verbose=100,
    eval_set = (X_val, y_val))

0:	learn: 0.5685006	test: 0.5686051	best: 0.5686051 (0)	total: 420ms	remaining: 6m 59s
100:	learn: 0.3026780	test: 0.3028291	best: 0.3028291 (100)	total: 48.8s	remaining: 7m 14s
200:	learn: 0.2965747	test: 0.2968283	best: 0.2968283 (200)	total: 1m 36s	remaining: 6m 22s
300:	learn: 0.2942479	test: 0.2946854	best: 0.2946854 (300)	total: 2m 24s	remaining: 5m 34s
400:	learn: 0.2930851	test: 0.2937212	best: 0.2937212 (400)	total: 3m 10s	remaining: 4m 44s
500:	learn: 0.2921347	test: 0.2929411	best: 0.2929411 (500)	total: 3m 57s	remaining: 3m 56s
600:	learn: 0.2914986	test: 0.2924822	best: 0.2924822 (600)	total: 4m 44s	remaining: 3m 8s
700:	learn: 0.2908974	test: 0.2920527	best: 0.2920527 (700)	total: 5m 31s	remaining: 2m 21s
800:	learn: 0.2903882	test: 0.2917370	best: 0.2917370 (800)	total: 6m 18s	remaining: 1m 34s
900:	learn: 0.2899584	test: 0.2914877	best: 0.2914877 (900)	total: 7m 5s	remaining: 46.8s
999:	learn: 0.2895731	test: 0.2912807	best: 0.2912807 (999)	total: 7m 51s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7ff8a7ebe590>

In [20]:
# Prediction Score
y_pred = clf_grid.predict(X_test, prediction_type='Class')

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names = ['Lose', 'Win']))

              precision    recall  f1-score   support

        Lose       0.87      0.94      0.90   1202206
         Win       0.82      0.68      0.74    502897

    accuracy                           0.86   1705103
   macro avg       0.84      0.81      0.82   1705103
weighted avg       0.86      0.86      0.86   1705103



In [21]:
clf_grid.feature_importances_

array([3.12768611e+01, 6.62209523e+00, 1.55948023e+01, 1.05818328e+00,
       5.35072461e-02, 0.00000000e+00, 3.21271373e+00, 2.57594949e-02,
       1.17493059e+01, 9.91394069e-02, 9.48666051e+00, 9.53121732e-01,
       5.07253124e+00, 8.31159974e+00, 5.98996443e-02, 3.63612996e+00,
       2.78768950e+00])

In [10]:
pd.Series([3.12768611e+01, 6.62209523e+00, 1.55948023e+01, 1.05818328e+00,
       5.35072461e-02, 0.00000000e+00, 3.21271373e+00, 2.57594949e-02,
       1.17493059e+01, 9.91394069e-02, 9.48666051e+00, 9.53121732e-01,
       5.07253124e+00, 8.31159974e+00, 5.98996443e-02, 3.63612996e+00,
       2.78768950e+00], index = X_train.columns)

ADID_type     31.276861
DSP_ID         6.622095
Media_ID      15.594802
Adunit_ID      1.058183
Platform       0.053507
OS_type        0.000000
Size_ID        3.212714
Ex_Rate        0.025759
Category      11.749306
Country_ID     0.099139
P1             9.486661
P2             0.953122
P3             5.072531
P4             8.311600
weekend        0.059900
hour           3.636130
dayofweek      2.787689
dtype: float64

## Variable Selection
- Feature Importance가 낮은 변수 제거
> Platform, OS_type, Ex_Rate, Country_ID, weekend
- MLP model에 이용