In [161]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import xgboost as xgb, lightgbm as lgbm, catboost as catb

from sklearn.model_selection import train_test_split

In [162]:
# Load the assignment's train and test tables (comma is read_csv's default separator).
train_data = pd.read_csv('assignment_2_train.csv')
test_data = pd.read_csv('assignment_2_test.csv')

In [163]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
# (rows, columns) of the training and test frames.
train_data.shape, test_data.shape

((180000, 394), (100001, 394))

In [165]:
# Number of missing values in every column of the training frame.
train_data.isnull().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
V335              132004
V336              132004
V337              132004
V338              132004
V339              132004
Length: 394, dtype: int64

In [166]:
# Label column.
target = 'isFraud'
# Split the columns into groups by kind:
#   base_features - every column except the label,
#   cat_features  - object-dtype columns,
#   num_features  - everything that is neither the label nor object-dtype.
base_features = train_data.columns.drop(target).tolist()
cat_features = list(train_data.select_dtypes(include='object').columns)
num_features = [name for name in base_features if name not in cat_features]

In [167]:
def missing_num_features(X, features=None):
    """Replace missing and extreme values in numeric columns with the column median.

    For every processed column, cells that are NaN, above the column's 97.5th
    percentile or below its 2.5th percentile are overwritten with the column
    median. The percentiles and the median are taken from ``X`` itself and are
    computed before any cell of that column is changed.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean. It is modified in place.
    features : iterable of str, optional
        Columns to process. When omitted, the notebook-level ``num_features``
        list is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``X``.

    Notes
    -----
    The function is not idempotent: a second call recomputes the percentiles on
    the already-cleaned column and may replace further values.
    """
    if features is None:
        # Backward-compatible default: fall back to the notebook-level list.
        features = num_features
    for feature in features:
        column = X[feature]
        # Compute every statistic once, on the untouched column.
        lower = column.quantile(.025)
        upper = column.quantile(.975)
        fill_value = column.median()
        to_replace = column.isna() | (column > upper) | (column < lower)
        X.loc[to_replace, feature] = fill_value
    return X

In [168]:
# Clean the numeric columns of the training frame; the helper modifies the
# frame in place and returns it, so the rebinding keeps the same object.
# NOTE(review): test_data is not passed through this step in the visible cells.
train_data = missing_num_features(train_data)

In [175]:
# Numeric-only feature matrix and label vector, followed by a shuffled
# hold-out split that keeps 20% of the rows for validation.
X, y = train_data[num_features], train_data[target]
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

## Обучение без категориальных переменных

### XGBoost

In [None]:
# Hyper-parameters handed to xgb.XGBClassifier for the numeric-only model.
# NOTE(review): the estimator repr saved with the fit cell shows max_depth=10
# and n_estimators=1000, so that output was produced with other settings.
params_xgb = dict(
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    learning_rate=0.1,
    n_estimators=3000,
    reg_lambda=50,
    max_depth=20,
    gamma=5,
    nthread=4,
    seed=29,
)

In [179]:
# Fit XGBoost on the current split. AUC is tracked on both the training and
# the validation part, progress is printed every 10 rounds, and the
# early-stopping patience is 20 rounds.
xgb_model_without_cat = xgb.XGBClassifier(**params_xgb)
xgb_model_without_cat.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric="auc",
    early_stopping_rounds=20,
    verbose=10,
)



[0]	validation_0-auc:0.68706	validation_1-auc:0.68248
[10]	validation_0-auc:0.82771	validation_1-auc:0.81992
[20]	validation_0-auc:0.84399	validation_1-auc:0.83529
[30]	validation_0-auc:0.87790	validation_1-auc:0.87151
[40]	validation_0-auc:0.89490	validation_1-auc:0.88611
[50]	validation_0-auc:0.90561	validation_1-auc:0.89630
[60]	validation_0-auc:0.91465	validation_1-auc:0.90256
[70]	validation_0-auc:0.92179	validation_1-auc:0.90787
[80]	validation_0-auc:0.92654	validation_1-auc:0.91121
[90]	validation_0-auc:0.93149	validation_1-auc:0.91465
[100]	validation_0-auc:0.93463	validation_1-auc:0.91662
[110]	validation_0-auc:0.93709	validation_1-auc:0.91815
[120]	validation_0-auc:0.93875	validation_1-auc:0.91965
[130]	validation_0-auc:0.94072	validation_1-auc:0.92080
[140]	validation_0-auc:0.94148	validation_1-auc:0.92122
[150]	validation_0-auc:0.94148	validation_1-auc:0.92122
[154]	validation_0-auc:0.94148	validation_1-auc:0.92122


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=5, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=4, nthread=4,
              num_parallel_tree=1, random_state=29, reg_alpha=0, reg_lambda=50,
              scale_pos_weight=1, seed=29, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

### LightGBM

In [199]:
# Hyper-parameters handed to lgbm.LGBMClassifier for the numeric-only model.
params_lgbm = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    learning_rate=0.1,
    n_estimators=3000,
    n_jobs=10,
    max_depth=30,
    seed=29,
)

In [200]:
# Fit LightGBM on the current split. AUC is tracked on both the training and
# the validation part, progress is printed every 10 rounds, and the
# early-stopping patience is 25 rounds.
lgbm_model_without_cat = lgbm.LGBMClassifier(**params_lgbm)
lgbm_model_without_cat.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric="auc",
    early_stopping_rounds=25,
    verbose=10,
)

[10]	training's auc: 0.882063	valid_1's auc: 0.87838
[20]	training's auc: 0.902398	valid_1's auc: 0.892691
[30]	training's auc: 0.916838	valid_1's auc: 0.901541
[40]	training's auc: 0.929146	valid_1's auc: 0.908444
[50]	training's auc: 0.939271	valid_1's auc: 0.915171
[60]	training's auc: 0.945553	valid_1's auc: 0.917893
[70]	training's auc: 0.950606	valid_1's auc: 0.919442
[80]	training's auc: 0.95599	valid_1's auc: 0.922586
[90]	training's auc: 0.959541	valid_1's auc: 0.924521
[100]	training's auc: 0.961686	valid_1's auc: 0.925701
[110]	training's auc: 0.964265	valid_1's auc: 0.927433
[120]	training's auc: 0.967162	valid_1's auc: 0.928697
[130]	training's auc: 0.969394	valid_1's auc: 0.929928
[140]	training's auc: 0.972042	valid_1's auc: 0.93097
[150]	training's auc: 0.974008	valid_1's auc: 0.931746
[160]	training's auc: 0.975613	valid_1's auc: 0.93352
[170]	training's auc: 0.976959	valid_1's auc: 0.934537
[180]	training's auc: 0.978842	valid_1's auc: 0.934791
[190]	training's auc: 0

LGBMClassifier(max_depth=30, metric='auc', n_estimators=3000, n_jobs=10,
               objective='binary', seed=29)

## Обучение на всех признаках

In [None]:
# Helper that fills the gaps in the categorical features.
def missing_cat_features(X, features=None):
    """Fill missing values in categorical columns with the column's mode.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean. It is modified in place.
    features : iterable of str, optional
        Columns to process. When omitted, the notebook-level ``cat_features``
        list is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``X``.

    Notes
    -----
    When several values tie for most frequent, the first one returned by
    ``Series.mode`` is used. A column that has no non-missing value at all is
    left unchanged (previously this case raised on ``mode()[0]``).
    """
    if features is None:
        # Backward-compatible default: fall back to the notebook-level list.
        features = cat_features
    for feature in features:
        modes = X[feature].mode()
        if modes.empty:
            # Entirely missing column: there is no mode to fill with.
            continue
        X.loc[X[feature].isna(), feature] = modes.iloc[0]
    return X

In [None]:
# Fill the gaps in the categorical columns of the training frame; the helper
# modifies the frame in place and returns it.
# NOTE(review): test_data is not passed through this step in the visible cells.
train_data = missing_cat_features(train_data)

In [203]:
# One-hot encode all categorical features with get_dummies.
# Train and test must be encoded against the SAME category sets: encoding each
# frame independently yields different dummy columns (and a different dropped
# baseline under drop_first) whenever a category is absent from one frame.
# The category sets are therefore learned from the training frame and applied
# to both frames. Test values unseen in train, and missing test values, become
# NaN and end up as all-zero dummies.
# NOTE: re-running this cell fails, because the original categorical columns
# are consumed by the encoding.
cat_dtypes = {
    feature: train_data[feature].astype("category").dtype
    for feature in cat_features
}
train_data = pd.get_dummies(
    train_data.astype(cat_dtypes), columns=cat_features, prefix_sep="_", drop_first=True
)
test_data = pd.get_dummies(
    test_data.astype(cat_dtypes), columns=cat_features, prefix_sep="_", drop_first=True
)
train_data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,M1_T,M2_T,M3_T,M4_M1,M4_M2,M5_T,M6_T,M7_T,M8_T,M9_T
0,3076999.5,0,1884075,68.5,13926,375.0,150.0,142.0,315.0,87.0,...,1,1,1,0,1,0,1,0,0,1
1,3076999.5,0,1884075,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,1,1,1,0,0,1,1,0,0,1
2,3076999.5,0,1884075,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,1,1,1,0,0,0,0,0,0,0
3,3076999.5,0,1884075,50.0,9633,567.0,150.0,117.0,476.0,87.0,...,1,1,1,0,0,1,0,0,0,1
4,3076999.5,0,1884075,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,1,1,1,0,0,0,0,0,0,1


In [204]:
# (rows, columns) of the training frame after one-hot encoding.
train_data.shape

(180000, 517)

In [206]:
# Every column of the encoded training frame except the label, then its count.
num_feature_after_preproc = list(train_data.columns.drop(target))
len(num_feature_after_preproc)

516

In [207]:
# Feature matrix with the encoded categoricals included, plus the label
# vector, followed by a shuffled hold-out split (20% of rows for validation).
X, y = train_data[num_feature_after_preproc], train_data[target]
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Hyper-parameters handed to xgb.XGBClassifier for the all-features model
# (same values as the dict used for the numeric-only model).
params_xgb = dict(
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    learning_rate=0.1,
    n_estimators=3000,
    reg_lambda=50,
    max_depth=20,
    gamma=5,
    nthread=4,
    seed=29,
)

In [208]:
# Fit XGBoost on the split that includes the encoded categorical features.
# AUC is tracked on both parts of the split, progress is printed every 10
# rounds, and the early-stopping patience is 20 rounds.
# NOTE(review): the saved run of this cell ended with KeyboardInterrupt.
xgb_model_with_cat = xgb.XGBClassifier(**params_xgb)
xgb_model_with_cat.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric="auc",
    early_stopping_rounds=20,
    verbose=10,
)



[0]	validation_0-auc:0.68706	validation_1-auc:0.68248
[10]	validation_0-auc:0.82957	validation_1-auc:0.82026
[20]	validation_0-auc:0.86766	validation_1-auc:0.85698
[30]	validation_0-auc:0.89096	validation_1-auc:0.88062
[40]	validation_0-auc:0.90583	validation_1-auc:0.89489
[50]	validation_0-auc:0.91738	validation_1-auc:0.90213
[60]	validation_0-auc:0.92669	validation_1-auc:0.90915
[70]	validation_0-auc:0.93244	validation_1-auc:0.91325
[80]	validation_0-auc:0.93728	validation_1-auc:0.91639
[90]	validation_0-auc:0.94020	validation_1-auc:0.91913
[100]	validation_0-auc:0.94298	validation_1-auc:0.92166
[110]	validation_0-auc:0.94561	validation_1-auc:0.92375


KeyboardInterrupt: 

In [None]:
# Hyper-parameters handed to lgbm.LGBMClassifier for the all-features model
# (same values as the dict used for the numeric-only model).
params_lgbm = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    learning_rate=0.1,
    n_estimators=3000,
    n_jobs=10,
    max_depth=30,
    seed=29,
)

In [None]:
# Fit LightGBM on the split that includes the encoded categorical features.
# AUC is tracked on both parts of the split, progress is printed every 10
# rounds, and the early-stopping patience is 25 rounds.
lgbm_model_with_cat = lgbm.LGBMClassifier(**params_lgbm)
lgbm_model_with_cat.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric="auc",
    early_stopping_rounds=25,
    verbose=10,
)

## CatBoost

In [None]:
# Hyper-parameters handed to catb.CatBoostClassifier; logging interval and
# early-stopping patience are configured here rather than in fit().
params_catb = dict(
    n_estimators=3000,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    max_bin=20,
    verbose=10,
    max_depth=6,
    l2_leaf_reg=0.1,
    early_stopping_rounds=50,
    thread_count=6,
    random_seed=42,
)

In [195]:
# Fit CatBoost on the current split, evaluating on both the training and the
# validation part and drawing the interactive training plot.
catb_model_with_cat = catb.CatBoostClassifier(**params_catb)
catb_model_with_cat.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.6078127	test1: 0.6017316	best: 0.6017316 (0)	total: 103ms	remaining: 5m 8s
10:	test: 0.7999502	test1: 0.7894058	best: 0.7894058 (10)	total: 1.21s	remaining: 5m 27s
20:	test: 0.8053692	test1: 0.7967832	best: 0.7967832 (20)	total: 2.35s	remaining: 5m 33s
30:	test: 0.8282990	test1: 0.8249925	best: 0.8253179 (28)	total: 3.59s	remaining: 5m 43s
40:	test: 0.8317307	test1: 0.8276816	best: 0.8281166 (35)	total: 4.77s	remaining: 5m 44s
50:	test: 0.8399939	test1: 0.8362429	best: 0.8362429 (50)	total: 6.06s	remaining: 5m 50s
60:	test: 0.8484316	test1: 0.8446597	best: 0.8446597 (60)	total: 7.3s	remaining: 5m 51s
70:	test: 0.8523578	test1: 0.8484551	best: 0.8484551 (70)	total: 8.6s	remaining: 5m 54s
80:	test: 0.8551114	test1: 0.8511726	best: 0.8511726 (80)	total: 9.86s	remaining: 5m 55s
90:	test: 0.8591286	test1: 0.8556459	best: 0.8556459 (90)	total: 11.2s	remaining: 5m 56s
100:	test: 0.8613628	test1: 0.8572508	best: 0.8572508 (100)	total: 12.4s	remaining: 5m 57s
110:	test: 0.8650531	tes

<catboost.core.CatBoostClassifier at 0x1ee8fe59790>

In [None]:
# A lighter XGBoost configuration: fewer trees and shallower depth than
# params_xgb, all other settings equal.
params = dict(
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    learning_rate=0.1,
    n_estimators=1000,
    reg_lambda=50,
    max_depth=10,
    gamma=5,
    nthread=4,
    seed=29,
)

In [None]:
# Refit XGBoost with the lighter `params` configuration on the current split.
# `num_boost_round` is an argument of the native `xgb.train` API and is not
# accepted by `XGBClassifier.fit`, so passing it made this cell raise
# TypeError. In the scikit-learn wrapper the number of boosting rounds comes
# from `n_estimators` in `params`, bounded further by early stopping.
# NOTE(review): this rebinds `xgb_model_without_cat`, replacing the model
# fitted earlier under the same name, and trains on whatever X_train the most
# recently executed split cell produced - confirm the intended name and split.
xgb_model_without_cat = xgb.XGBClassifier(**params)
xgb_model_without_cat.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    early_stopping_rounds=20,
    eval_metric="auc",
    verbose=10
)

## LightGBM

Итак, для последующего обучения моделей используем следующие группы признаков:

    1. num_features - для обучения моделей без категориальных признаков
    2. num_feature_after_preproc - для обучения моделей с закодированными категориальными признаками
    3. num_features + cat_features - для моделей, в которых есть встроенная обработка категориальных признаков

In [None]:
# Rebuild the hold-out split on the numeric-only feature group.
# The original cell indexed with `num_feature`, a name no visible cell
# defines (NameError); the visible feature lists are `num_features` and
# `num_feature_after_preproc`, and the numeric-only group is `num_features`.
X = train_data[num_features]
y = train_data[target]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)