In [None]:
!python3 -m pip install pandas
!python3 -m pip install catboost
!python3 -m pip install numpy


In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss, auc, accuracy_score, balanced_accuracy_score
from sklearn.metrics import make_scorer, RocCurveDisplay, confusion_matrix

from sklearn.base import clone

#https://dacon.io/en/competitions/official/236055/codeshare/7849
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:

# --- Load and preprocess -----------------------------------------------------

# String-valued columns and the integer code each value is replaced with.
_VALUE_MAPS = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}

# Narrow integer dtypes used to reduce memory.
# Policy_Sales_Channel is int16 rather than int8: int8 tops out at 127 and a
# larger channel code would be silently corrupted by the cast.
# NOTE(review): channel codes are believed to exceed 127 in this dataset --
# confirm with train_data['Policy_Sales_Channel'].max().
_COMPACT_DTYPES = {
    'Region_Code': 'int8',
    'Policy_Sales_Channel': 'int16',
    'Driving_License': 'int8',
    'Previously_Insured': 'int8',
    'Vintage': 'int16',
}


def _preprocess(frame):
    """Return a preprocessed copy of a raw train/test frame.

    Steps: drop the `id` column (not used as a feature), replace string
    categories with integer codes, downcast integer-like columns, and
    log-transform `Annual_Premium`. `Response` is downcast only when the
    column is present, so the same function serves train and test data.
    """
    out = frame.drop(['id'], axis=1)

    for column, mapping in _VALUE_MAPS.items():
        out[column] = out[column].map(mapping).astype('int8')

    out = out.astype(_COMPACT_DTYPES)

    if 'Response' in out.columns:
        out['Response'] = out['Response'].astype('int8')

    # log scaling
    out['Annual_Premium'] = np.log(out['Annual_Premium'])
    return out


train_data = _preprocess(pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv'))
test_data = _preprocess(pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv'))

# Keep 20% of the labelled data aside as a hold-out split.
train, test = train_test_split(train_data, test_size=0.2, random_state=42)



In [None]:
# Display the hold-out split produced by train_test_split as a quick sanity check.
test

In [None]:
# Names of the columns passed to CatBoost as categorical: every column of
# test_data except 'Annual_Premium'. Boolean-mask indexing returns a new array.
categorical_features = test_data.columns.values[test_data.columns.values != 'Annual_Premium']

In [None]:
def objective(trial):
    """Optuna objective: hold-out ROC AUC of a 5-fold CatBoost ensemble.

    For the hyper-parameters suggested by `trial`, a CatBoostClassifier is
    fitted on each of 5 stratified folds of the global `train` frame (with
    early stopping on the fold's validation part). Each fold's model scores
    the global hold-out frame `test`; the fold probabilities are averaged and
    the ROC AUC of that average is returned, so every fold contributes to the
    value Optuna maximises.

    Args:
        trial: the optuna trial used to sample hyper-parameters.

    Returns:
        ROC AUC of the fold-averaged probabilities on the hold-out split.
    """
    # Every feature column except Annual_Premium is passed as categorical.
    feature_names = test_data.columns.values
    cat_feature_names = feature_names[feature_names != 'Annual_Premium']

    catb_params = {
        'iterations': trial.suggest_int('iterations', 10000, 20000),
        'eval_metric': 'AUC',
        #'task_type': 'GPU',
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.075),
        'depth': trial.suggest_int('depth', 7, 9),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 55.37964307854247, 56.37964307854247),
        'max_bin': trial.suggest_int('max_bin', 404, 512),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.017138393608280057, 0.02),
        'random_strength': trial.suggest_float('random_strength', 9.256288011643901, 10.256288011643901),
    }

    # Build the CatBoostClassifier; the same instance is refitted on every fold.
    model = CatBoostClassifier(
        **catb_params,
        random_state=42,
        logging_level='Silent',
        cat_features=cat_feature_names,
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    X = train.copy()
    Y = X.pop('Response')

    # The label column must not be part of the matrix handed to predict_proba.
    holdout_features = test.drop('Response', axis=1)
    holdout_target = test['Response']

    holdout_predictions = np.zeros(len(holdout_features))
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, Y)):
        X_train = X.iloc[train_idx].reset_index(drop=True)
        y_train = Y.iloc[train_idx].reset_index(drop=True)
        X_val = X.iloc[val_idx].reset_index(drop=True)
        y_val = Y.iloc[val_idx].reset_index(drop=True)

        model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=True)

        val_preds_proba = model.predict_proba(X_val)[:, 1]
        val_scores.append(roc_auc_score(y_val, val_preds_proba))
        print(f'Fold {fold}: {val_scores[-1]:.5f}')

        fold_holdout_proba = model.predict_proba(holdout_features)[:, 1]
        holdout_predictions += fold_holdout_proba / cv.get_n_splits()
        print(f' AUC: {roc_auc_score(holdout_target, fold_holdout_proba)}')

    # NOTE(review): the hold-out split is also used for the final report, so
    # that report is optimistic; np.mean(val_scores) is a leak-free alternative.
    return roc_auc_score(holdout_target, holdout_predictions)

# Seeded TPE sampler so the sequence of suggested hyper-parameters is repeatable.
sampler = TPESampler(seed=42)

# Maximise the value returned by `objective` over 10 trials.
optuna_cbt = optuna.create_study(sampler=sampler, direction='maximize')
optuna_cbt.optimize(objective, n_trials=10)

In [None]:
# Hyper-parameters of the best trial found by the study (only the values
# sampled through trial.suggest_*).
cbt_trial_params = optuna_cbt.best_params

In [None]:
# Refit CatBoost with the best Optuna hyper-parameters using 5-fold stratified CV.
# best_trial.params only holds the sampled values, so the fixed eval_metric used
# during tuning is re-added here; otherwise early stopping would monitor
# CatBoost's default metric instead of AUC.
tuned_params = {'eval_metric': 'AUC', **cbt_trial_params}

tuned_cbt_model = CatBoostClassifier(
    **tuned_params,
    random_state=42,
    logging_level='Silent',
    # The global list is read but never reassigned in this cell, so re-running
    # the cell builds the same model.
    cat_features=categorical_features,
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train.copy()
Y = X.pop('Response')

# The label column must not be part of the matrix handed to predict_proba.
holdout_features = test.drop('Response', axis=1)

val_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test))
train_scores, val_scores = [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, Y)):
    X_train = X.iloc[train_idx].reset_index(drop=True)
    y_train = Y.iloc[train_idx].reset_index(drop=True)
    X_val = X.iloc[val_idx].reset_index(drop=True)
    y_val = Y.iloc[val_idx].reset_index(drop=True)

    tuned_cbt_model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
        verbose=True,
    )

    train_preds_proba = tuned_cbt_model.predict_proba(X_train)[:, 1]
    val_preds_proba = tuned_cbt_model.predict_proba(X_val)[:, 1]
    test_preds_proba = tuned_cbt_model.predict_proba(holdout_features)[:, 1]

    # Out-of-fold predictions and per-fold scores.
    val_predictions[val_idx] = val_preds_proba
    train_scores.append(roc_auc_score(y_train, train_preds_proba))
    val_scores.append(roc_auc_score(y_val, val_preds_proba))

    print(f'Fold {fold}: {val_scores[-1]:.5f}')

    # Fold-averaged probabilities on the hold-out split.
    test_predictions += test_preds_proba / cv.get_n_splits()

print(f'Val Score: {np.mean(val_scores):.7f} ± {np.std(val_scores):.7f} | Train Score: {np.mean(train_scores):.7f} ± {np.std(train_scores):.7f} | Response')

# After the loop, tuned_cbt_model holds the fit from the last fold.



In [None]:

# Evaluate the tuned model on the hold-out split (features only, label removed).
holdout_features = test.drop('Response', axis=1)

# accuracy
prediction = tuned_cbt_model.predict(holdout_features)
accuracy = accuracy_score(test['Response'], prediction)
print(f' accuracy: {accuracy}')

# AUC on the hold-out split. It is scored against probabilities for the same
# rows; the competition test set has no labels and cannot be scored here.
holdout_auc = roc_auc_score(test['Response'], tuned_cbt_model.predict_proba(holdout_features)[:, 1])
print(f' AUC: {holdout_auc}')

# Positive-class probabilities for the competition test set (used for the submission).
probabilities = tuned_cbt_model.predict_proba(test_data)[:, 1]

In [None]:
# Build the submission from the ids of the raw test file. Only the `id` column
# is read, into its own frame, so the preprocessed `test_data` stays intact and
# the prediction cell can be re-run afterwards.
submission = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv', usecols=['id'])
submission['Response'] = probabilities
submission[['id', 'Response']].to_csv("submission.csv", index=False)