## 임포트

In [16]:
!pip install -U teddynote

from teddynote import models
# Data Wrangling
import pandas as pd
import numpy as np

#Utility
import random
import os

# Preprocessing & Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

# Optuna
import optuna
from optuna.samplers import TPESampler
from optuna import Trial

# Modeling
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier, Pool, cv

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Turn off pandas' chained-assignment warning.
pd.set_option('mode.chained_assignment',  None)

Collecting teddynote
  Downloading teddynote-0.2.1-py3-none-any.whl (10 kB)
Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
     ------------------------------------- 125.4/125.4 MB 16.4 MB/s eta 0:00:00
Installing collected packages: xgboost, teddynote
Successfully installed teddynote-0.2.1 xgboost-1.6.2


In [17]:
class CFG:
    """Notebook-wide configuration constants."""
    # Seed value handed to seed_everything().
    SEED = 42

In [18]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
seed_everything(CFG.SEED) # Fix the seed

## 데이터 불러오기

In [19]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [20]:
def get_x_y(df):
    """Split a raw frame into model features and, when available, the target.

    The same non-feature columns are removed whether or not the frame carries
    a ``class`` column, so the training and test feature frames end up with
    identical columns.

    Args:
        df: DataFrame with an ``id`` column and, optionally, ``father``,
            ``mother``, ``gender`` and ``class`` columns.

    Returns:
        ``(features, target)`` when ``class`` is present in ``df``,
        otherwise just ``features``.

    Raises:
        KeyError: If ``df`` has no ``id`` column.
    """
    # 'id' is mandatory (as in both original branches); the pedigree/gender
    # columns are removed whenever they exist so train and test stay aligned.
    optional_cols = [c for c in ('father', 'mother', 'gender') if c in df.columns]
    drop_cols = ['id'] + optional_cols
    if 'class' in df.columns:
        return df.drop(columns=drop_cols + ['class']), df['class']
    return df.drop(columns=drop_cols)

In [21]:
# Split the training frame into features and target.
# The test frame is expected to have no 'class' column, so get_x_y returns only its features.
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

## Data Pre-processing
### Label-Encoding

In [22]:
# Two separate encoders: one for the target labels, one shared by the SNP columns.
class_le, snp_le = preprocessing.LabelEncoder(), preprocessing.LabelEncoder()
# Column names SNP_01 ... SNP_15.
snp_col = ['SNP_{:02d}'.format(idx) for idx in range(1, 16)]

In [23]:
# Pool the values of every SNP column of the training features into one flat list.
snp_data = [value for col in snp_col for value in train_x[col].values]

In [24]:
# Encode the target labels as integers; class_le is used again later for inverse_transform.
# NOTE(review): train_y is overwritten in place, so re-running this cell would refit class_le
# on the already-encoded integers — confirm the cell is run exactly once per kernel.
train_y = class_le.fit_transform(train_y)
# Fit the SNP encoder on the values pooled from the training features only.
# NOTE(review): values occurring only in test_x would be unknown to snp_le — verify.
snp_le.fit(snp_data)

LabelEncoder()

In [25]:
# Replace each SNP column in both feature frames with its integer codes from snp_le.
encoded_cols = [name for name in train_x.columns if name in snp_col]
for col in encoded_cols:
    train_x[col] = snp_le.transform(train_x[col])
    test_x[col] = snp_le.transform(test_x[col])

## Model Fit

In [None]:
# Run the Optuna-based search of teddynote's CatBoost classifier wrapper on the training data.
# NOTE(review): test_x is never passed here, yet `preds` is later decoded into the submission —
# confirm what optimize() returns and whether it needs the test features.
# NOTE(review): the recorded output reports "metric type: f1" and ten trials, which does not
# match eval_metric='accuracy', n_trials=3 — the saved output looks stale.
model = models.CatBoostClassifierOptuna()
preds = model.optimize(train_x, train_y, eval_metric='accuracy', n_trials=3)

[32m[I 2022-12-21 16:10:02,341][0m A new study created in memory with name: no-name-fb309279-c8dd-4207-b9f1-a25e308eba81[0m


metric type: f1, score: 0.98103
metric type: f1, score: 0.96226
metric type: f1, score: 0.94184
metric type: f1, score: 0.92308


[32m[I 2022-12-21 16:10:04,650][0m Trial 0 finished with value: 0.9577797108587018 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Ordered', 'od_type': 'Iter', 'colsample_bylevel': 0.06758243173036939, 'l2_leaf_reg': 0.3632364572746973, 'learning_rate': 0.01468882024773116, 'iterations': 768, 'min_child_samples': 6, 'depth': 6, 'subsample': 0.738716294642164}. Best is trial 0 with value: 0.9577797108587018.[0m


metric type: f1, score: 0.98069
metric type: f1, score: 0.92453
metric type: f1, score: 0.96187
metric type: f1, score: 0.98075
metric type: f1, score: 1.00000


[32m[I 2022-12-21 16:10:06,056][0m Trial 1 finished with value: 0.9619385488725112 and parameters: {'bootstrap_type': 'MVS', 'boosting_type': 'Ordered', 'od_type': 'Iter', 'colsample_bylevel': 0.0584721792282906, 'l2_leaf_reg': 0.06736259339127611, 'learning_rate': 0.05167303615575541, 'iterations': 644, 'min_child_samples': 26, 'depth': 6}. Best is trial 1 with value: 0.9619385488725112.[0m


metric type: f1, score: 0.94254
metric type: f1, score: 0.90537
metric type: f1, score: 0.94348
metric type: f1, score: 0.96074
metric type: f1, score: 0.98037


[32m[I 2022-12-21 16:10:08,087][0m Trial 2 finished with value: 0.9502983198849536 and parameters: {'bootstrap_type': 'MVS', 'boosting_type': 'Ordered', 'od_type': 'IncToDec', 'colsample_bylevel': 0.024172911047752838, 'l2_leaf_reg': 0.020202303053480482, 'learning_rate': 0.017326259783241413, 'iterations': 810, 'min_child_samples': 1, 'depth': 8}. Best is trial 1 with value: 0.9619385488725112.[0m


metric type: f1, score: 0.96154
metric type: f1, score: 0.96226
metric type: f1, score: 0.94360
metric type: f1, score: 0.98066
metric type: f1, score: 0.92009


[32m[I 2022-12-21 16:10:08,895][0m Trial 3 finished with value: 0.9574485450653849 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Ordered', 'od_type': 'IncToDec', 'colsample_bylevel': 0.040160461941300477, 'l2_leaf_reg': 2.2323172374305028e-07, 'learning_rate': 0.20189879904457053, 'iterations': 1057, 'min_child_samples': 32, 'depth': 12, 'subsample': 0.7766237053808711}. Best is trial 1 with value: 0.9619385488725112.[0m


metric type: f1, score: 0.98063
metric type: f1, score: 0.98110
metric type: f1, score: 0.96226
metric type: f1, score: 0.96154
metric type: f1, score: 0.94276


[32m[I 2022-12-21 16:10:10,240][0m Trial 4 finished with value: 0.9617131822057748 and parameters: {'bootstrap_type': 'MVS', 'boosting_type': 'Plain', 'od_type': 'IncToDec', 'colsample_bylevel': 0.01994950815412165, 'l2_leaf_reg': 0.16748238972266352, 'learning_rate': 0.060474472251705186, 'iterations': 1650, 'min_child_samples': 27, 'depth': 5}. Best is trial 1 with value: 0.9619385488725112.[0m


metric type: f1, score: 0.96090
metric type: f1, score: 1.00000
metric type: f1, score: 0.94283
metric type: f1, score: 1.00000


[32m[I 2022-12-21 16:10:10,905][0m Trial 5 finished with value: 0.9688990128426042 and parameters: {'bootstrap_type': 'Bayesian', 'boosting_type': 'Plain', 'od_type': 'Iter', 'colsample_bylevel': 0.0170401066397725, 'l2_leaf_reg': 0.05460350356120117, 'learning_rate': 0.07894382088056237, 'iterations': 446, 'min_child_samples': 12, 'depth': 2, 'bagging_temperature': 34.01883843740292}. Best is trial 5 with value: 0.9688990128426042.[0m


metric type: f1, score: 0.96028
metric type: f1, score: 0.94138
metric type: f1, score: 0.96331
metric type: f1, score: 0.94271
metric type: f1, score: 0.98062
metric type: f1, score: 0.96135


[32m[I 2022-12-21 16:10:12,495][0m Trial 6 finished with value: 0.9539724793172283 and parameters: {'bootstrap_type': 'Bayesian', 'boosting_type': 'Plain', 'od_type': 'Iter', 'colsample_bylevel': 0.03365979882345982, 'l2_leaf_reg': 0.019745238622118615, 'learning_rate': 0.015092922390744213, 'iterations': 1179, 'min_child_samples': 26, 'depth': 3, 'bagging_temperature': 29.431235679678412}. Best is trial 5 with value: 0.9688990128426042.[0m


metric type: f1, score: 0.92188
metric type: f1, score: 0.98111
metric type: f1, score: 0.90628
metric type: f1, score: 0.94291


[32m[I 2022-12-21 16:10:13,581][0m Trial 7 finished with value: 0.9388231688689253 and parameters: {'bootstrap_type': 'MVS', 'boosting_type': 'Ordered', 'od_type': 'IncToDec', 'colsample_bylevel': 0.04236830752993776, 'l2_leaf_reg': 2.33627261323977, 'learning_rate': 0.20565031019552982, 'iterations': 675, 'min_child_samples': 32, 'depth': 4}. Best is trial 5 with value: 0.9688990128426042.[0m


metric type: f1, score: 0.98091
metric type: f1, score: 0.88291
metric type: f1, score: 0.96226
metric type: f1, score: 0.94301
metric type: f1, score: 0.96154
metric type: f1, score: 0.96154


[32m[I 2022-12-21 16:10:13,938][0m Trial 8 finished with value: 0.9617950027699933 and parameters: {'bootstrap_type': 'MVS', 'boosting_type': 'Plain', 'od_type': 'Iter', 'colsample_bylevel': 0.03532457313948974, 'l2_leaf_reg': 0.0021625561732788383, 'learning_rate': 0.3671473930173935, 'iterations': 1206, 'min_child_samples': 16, 'depth': 2}. Best is trial 5 with value: 0.9688990128426042.[0m


metric type: f1, score: 0.98063
metric type: f1, score: 0.96239
metric type: f1, score: 0.90379
metric type: f1, score: 0.96154
metric type: f1, score: 1.00000


[32m[I 2022-12-21 16:10:14,511][0m Trial 9 finished with value: 0.9578511412854237 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Plain', 'od_type': 'Iter', 'colsample_bylevel': 0.03972693170715833, 'l2_leaf_reg': 1.0391453380836638e-07, 'learning_rate': 0.14398265004448105, 'iterations': 559, 'min_child_samples': 31, 'depth': 12, 'subsample': 0.5595775852873337}. Best is trial 5 with value: 0.9688990128426042.[0m


metric type: f1, score: 0.96154
metric type: f1, score: 0.94317
metric type: f1, score: 0.96226


In [14]:
# Fill the sample submission with the predicted class labels and save it.
submit = pd.read_csv('./sample_submission.csv')
# Map the integer predictions back to the original class names.
submit['class'] = class_le.inverse_transform(preds)
# to_csv does not create missing parent directories, so make sure ./answer exists first.
os.makedirs('./answer', exist_ok=True)
submit.to_csv('./answer/submit_catboost.csv', index=False)