In [36]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv


# 필요한 라이브러리 가져오기

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd
from scipy.stats import uniform, randint

# 데이터 가져오기

In [38]:
train = pd.read_csv("/kaggle/input/playground-series-s4e10/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e10/test.csv")
submission = pd.read_csv("/kaggle/input/playground-series-s4e10/sample_submission.csv")

train.shape, test.shape, submission.shape

((58645, 13), (39098, 12), (39098, 2))

# 데이터 확인

In [39]:
train.head(1)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0


In [40]:
test.head(1)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2


In [41]:
submission.head(1)

Unnamed: 0,id,loan_status
0,58645,0.5


# 데이터 가공

## ID 추출

In [42]:
train_ID = train.pop('id')
y = train.pop('loan_status')
test_ID = test.pop('id')

## 컬럼 분리

In [43]:
cat_cols = train.select_dtypes(exclude = np.number).columns.tolist()
num_cols = train.select_dtypes(include = np.number).columns.tolist()
print(cat_cols, num_cols)

['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'] ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']


# 모델 만들기

## 훈련데이터와 테스트 데이터셋 분리

In [44]:
X_tr, X_val, y_tr, y_val = train_test_split(
    train, y,
    test_size=0.3,
    random_state=42
)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((41051, 11), (17594, 11), (41051,), (17594,))

## 파이프라인 모델

In [45]:
# Cat Boost로 변경 
from catboost import CatBoostClassifier

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

"""
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_distributions = {
    'classifier__min_impurity_decrease': uniform(0.0001, 0.001),
    'classifier__max_depth': randint(20, 50),
    'classifier__min_samples_split': randint(2, 25),
    'classifier__min_samples_leaf': randint(1, 25),
}
"""

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(random_state=42, silent=True))
])

# Parameter distribution for CatBoostClassifier
param_distributions = {
    'classifier__depth': randint(4, 12),
    'classifier__learning_rate': uniform(0.01, 0.2),
    'classifier__iterations': randint(100, 1000),
    'classifier__l2_leaf_reg': uniform(1, 10),
}



split_number = 5
# stratified_kfold, 분류모형 만들 때
stratified_kfold = StratifiedKFold(n_splits=split_number, shuffle=True, random_state=42)

kfold = KFold(n_splits=split_number, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=50,
    cv=stratified_kfold, # stratified_kfold, 분류모형 만들 때
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_tr, y_tr)

## 모형 평가

In [46]:
def get_score(model, X_tr, X_val, y_tr, y_val):
    tr_pred = model.predict_proba(X_tr)[:, 1] # 만약 확률로 구할시, predict_proba()[:, 1]
    val_pred = model.predict_proba(X_val)[:, 1]
    tr_score = roc_auc_score(y_tr, tr_pred)
    val_score = roc_auc_score(y_val, val_pred)
    return f"train: {tr_score}, validation: {val_score}"

get_score(random_search, X_tr, X_val, y_tr, y_val)

'train: 0.9256171582015471, validation: 0.9243782802524703'

# 모델 제출

In [47]:
final_preds = random_search.predict_proba(test)[:, 1]
submission['loan_status'] = final_preds
submission.to_csv("submission.csv", index=False)