In [1]:
# S5E8_00_Baseline
# 単純モデルの作成(LGBM,XGB,CatBoost)

In [2]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import os

# データフレーム読み込み
train_df = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")

# データ結合
all_df = pd.concat([train_df,test_df],axis=0,ignore_index=True)
max_row = len(all_df)

In [3]:
all_df.info() # 特徴量、欠損、型確認

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 18 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   id         1000000 non-null  int64  
 1   age        1000000 non-null  int64  
 2   job        1000000 non-null  object 
 3   marital    1000000 non-null  object 
 4   education  1000000 non-null  object 
 5   default    1000000 non-null  object 
 6   balance    1000000 non-null  int64  
 7   housing    1000000 non-null  object 
 8   loan       1000000 non-null  object 
 9   contact    1000000 non-null  object 
 10  day        1000000 non-null  int64  
 11  month      1000000 non-null  object 
 12  duration   1000000 non-null  int64  
 13  campaign   1000000 non-null  int64  
 14  pdays      1000000 non-null  int64  
 15  previous   1000000 non-null  int64  
 16  poutcome   1000000 non-null  object 
 17  y          750000 non-null   float64
dtypes: float64(1), int64(8), object(9)
memory u

In [4]:
test_df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,750000,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,may,224,1,-1,0,unknown
1,750001,44,management,married,tertiary,no,23,yes,no,cellular,3,apr,586,2,-1,0,unknown
2,750002,36,self-employed,married,primary,no,46,yes,yes,cellular,13,may,111,2,-1,0,unknown
3,750003,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,may,125,1,-1,0,unknown
4,750004,28,technician,single,secondary,no,1950,yes,no,cellular,22,jul,181,1,-1,0,unknown


In [5]:
# 数値列とカテゴリ列を取得
num_col = []
cat_col = []

for col in train_df.columns:
    if train_df[col].dtypes!="object":
        num_col.append(col)
    else:
        cat_col.append(col)

In [6]:
# ラベルエンコード
from sklearn.preprocessing import LabelEncoder

for col in cat_col:
    le = LabelEncoder()
    all_df[col] = le.fit_transform(all_df[col].values)

In [7]:
# 訓練データとテストデータに分離
train = all_df[:len(train_df)]
test = all_df[len(train_df):]

# 訓練データをx,yに分割
X = train.drop(["id","y"],axis=1)
y = train["y"]

In [8]:
from sklearn.model_selection import train_test_split

# Hold-out
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,stratify=y, 
                                                    random_state=42)

In [9]:
###################################################
############ Light GBM ############################
###################################################
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# 学習、バリデーションデータ
X_train_lgb = X_train.copy()
X_valid_lgb = X_valid.copy()

# object型をカテゴリ型に変換
for col in cat_col:
    X_train_lgb[col] = X_train[col].astype("category")
    X_valid_lgb[col] = X_valid[col].astype("category")

# データセット作成
lgb_train = lgb.Dataset(X_train_lgb,y_train,reference = lgb_train)
lgb_valid = lgb.Dataset(X_valid_lgb,y_valid,reference = lgb_train)

# パラメータ
lgbm_params = {
    'objective': 'binary',
    "device": "cpu",
    'metric': 'auc',
    'verbose': -1,              # ログ出力の制御
    'boosting_type': 'gbdt',
    # 'learning_rate': 0.01,
    # 'feature_fraction': 0.6956717916553479,
    # 'num_leaves':       153,
    # 'bagging_fraction': 0.5279852787927486,
    # 'bagging_freq':     4,
    # 'lambda_l1':        0.004603414256652151,
    # 'lambda_l2':        0.013858762846118894,
    # 'min_data_in_leaf': 87,
    }

# 履歴
evaluations_result = {}

# 学習
model_lgb = lgb.train(
    lgbm_params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train,lgb_valid],
    valid_names=["train","valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100,verbose=False),
        lgb.record_evaluation(evaluations_result),
        lgb.log_evaluation(100),])

NameError: name 'lgb_train' is not defined

In [None]:
# 予測
lgb_pred = model_lgb.predict(X_valid_lgb,num_iteration=model_lgb.best_iteration)

from sklearn.metrics import f1_score
print(roc_auc_score(y_valid,lgb_pred))
print(f1_score(y_valid,np.round(lgb_pred,0)))

In [None]:
#################################################
############ XGBoost ############################
#################################################
import xgboost as xgb

# DMatrixに変換
dtrain = xgb.DMatrix(X_train,label=y_train)
dvalid = xgb.DMatrix(X_valid,label=y_valid)

# パラメータ
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
}

# 学習
model_xgb = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=2000,
    evals=[(dtrain,"train"),(dvalid,"valid")],
    early_stopping_rounds=100,
    verbose_eval=100,
)

In [None]:
# 予測
xgb_pred = model_xgb.predict(dvalid,
                             iteration_range=(0,model_xgb.best_iteration+1))

from sklearn.metrics import f1_score
print(roc_auc_score(y_valid,xgb_pred))
print(f1_score(y_valid,np.round(xgb_pred,0)))

In [None]:
#################################################
############ CatBoost ############################
#################################################
from catboost import Pool, train

# 学習データ
X_train_cat = X_train
X_valid_cat = X_valid

# object型をカテゴリ型に変換
for col in cat_col:
    X_train_cat[col] = X_train[col].astype("category")
    X_valid_cat[col] = X_valid[col].astype("category")

# データセット設定
train_pool = Pool(X_train_cat,y_train,cat_features=cat_col)
valid_pool = Pool(X_valid_cat,y_valid,cat_features=cat_col)

cat_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 2000,
    "verbose": 100,
}

model_cb = train(
    params=cat_params,
    dtrain=train_pool,
    eval_set=valid_pool,
    early_stopping_rounds=100)

In [None]:
model_cb.best_iteration_

In [None]:
# 予測
cat_pred = model_cb.predict(valid_pool,
                            prediction_type="Probability",
                           # iteration_range=(0,model_cb.best_iteration_+1)
                           )

from sklearn.metrics import f1_score
print(roc_auc_score(y_valid,cat_pred[:,1]))
print(f1_score(y_valid,np.round(cat_pred[:,1],0)))