In [156]:
import pandas as pd
import numpy as np
import hashlib
from catboost import CatBoostClassifier, Pool, metrics, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, auc, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder 
import optuna

In [157]:
features = [
 'transaction_time',
 'merch',
 'cat_id',
 'amount',
 'name_1',
 'name_2',
 'gender',
 'street',
 'one_city',
 'us_state',
 'post_code',
 'lat',
 'lon',
 'population_city',
 'jobs',
 'merchant_lat',
 'merchant_lon',
 'target',
 'amount_log',
 'population_log',
 'amount_per_person',
 'hour',
 'weekday',
 'is_night',
 'user_id',
 'user_cat_mean_amount',
 'user_cat_median_amount',
 'user_cat_std_amount',
 'user_cat_p95_amount',
 'txn_z_score_vs_user_cat',
 'txn_delta_vs_user_cat_median',
 'txn_above_user_cat_p95',
 'user_profile_mean_lat',
 'user_profile_mean_lon',
 'txn_distance_from_user_home_base',
 'user_profile_mean_cat_entertainment',
 'user_profile_mean_cat_food_dining',
 'user_profile_mean_cat_gas_transport',
 'user_profile_mean_cat_grocery_net',
 'user_profile_mean_cat_grocery_pos',
 'user_profile_mean_cat_health_fitness',
 'user_profile_mean_cat_home',
 'user_profile_mean_cat_kids_pets',
 'user_profile_mean_cat_misc_net',
 'user_profile_mean_cat_misc_pos',
 'user_profile_mean_cat_personal_care',
 'user_profile_mean_cat_shopping_net',
 'user_profile_mean_cat_shopping_pos',
 'user_profile_mean_cat_travel',
 'user_profile_median_cat_entertainment',
 'user_profile_median_cat_food_dining',
 'user_profile_median_cat_gas_transport',
 'user_profile_median_cat_grocery_net',
 'user_profile_median_cat_grocery_pos',
 'user_profile_median_cat_health_fitness',
 'user_profile_median_cat_home',
 'user_profile_median_cat_kids_pets',
 'user_profile_median_cat_misc_net',
 'user_profile_median_cat_misc_pos',
 'user_profile_median_cat_personal_care',
 'user_profile_median_cat_shopping_net',
 'user_profile_median_cat_shopping_pos',
 'user_profile_median_cat_travel',
 'user_profile_std_cat_entertainment',
 'user_profile_std_cat_food_dining',
 'user_profile_std_cat_gas_transport',
 'user_profile_std_cat_grocery_net',
 'user_profile_std_cat_grocery_pos',
 'user_profile_std_cat_health_fitness',
 'user_profile_std_cat_home',
 'user_profile_std_cat_kids_pets',
 'user_profile_std_cat_misc_net',
 'user_profile_std_cat_misc_pos',
 'user_profile_std_cat_personal_care',
 'user_profile_std_cat_shopping_net',
 'user_profile_std_cat_shopping_pos',
 'user_profile_std_cat_travel',
 'user_profile_p95_cat_entertainment',
 'user_profile_p95_cat_food_dining',
 'user_profile_p95_cat_gas_transport',
 'user_profile_p95_cat_grocery_net',
 'user_profile_p95_cat_grocery_pos',
 'user_profile_p95_cat_health_fitness',
 'user_profile_p95_cat_home',
 'user_profile_p95_cat_kids_pets',
 'user_profile_p95_cat_misc_net',
 'user_profile_p95_cat_misc_pos',
 'user_profile_p95_cat_personal_care',
 'user_profile_p95_cat_shopping_net',
 'user_profile_p95_cat_shopping_pos',
 'user_profile_p95_cat_travel',
 'user_profile_count_cat_entertainment',
 'user_profile_count_cat_food_dining',
 'user_profile_count_cat_gas_transport',
 'user_profile_count_cat_grocery_net',
 'user_profile_count_cat_grocery_pos',
 'user_profile_count_cat_health_fitness',
 'user_profile_count_cat_home',
 'user_profile_count_cat_kids_pets',
 'user_profile_count_cat_misc_net',
 'user_profile_count_cat_misc_pos',
 'user_profile_count_cat_personal_care',
 'user_profile_count_cat_shopping_net',
 'user_profile_count_cat_shopping_pos',
 'user_profile_count_cat_travel',
 'user_profile_mean_z_score',
 'user_profile_std_z_score',
 'user_profile_p95_z_score',
 'user_profile_frac_above_p95',
 'user_profile_mean_delta_from_median',
 'user_profile_std_delta_from_median',
 'user_profile_weekday_count_0',
 'user_profile_weekday_count_1',
 'user_profile_weekday_count_2',
 'user_profile_weekday_count_3',
 'user_profile_weekday_count_4',
 'user_profile_weekday_count_5',
 'user_profile_weekday_count_6',
 'user_profile_hour_count_0',
 'user_profile_hour_count_1',
 'user_profile_hour_count_2',
 'user_profile_hour_count_3',
 'user_profile_hour_count_4',
 'user_profile_hour_count_5',
 'user_profile_hour_count_6',
 'user_profile_hour_count_7',
 'user_profile_hour_count_8',
 'user_profile_hour_count_9',
 'user_profile_hour_count_10',
 'user_profile_hour_count_11',
 'user_profile_hour_count_12',
 'user_profile_hour_count_13',
 'user_profile_hour_count_14',
 'user_profile_hour_count_15',
 'user_profile_hour_count_16',
 'user_profile_hour_count_17',
 'user_profile_hour_count_18',
 'user_profile_hour_count_19',
 'user_profile_hour_count_20',
 'user_profile_hour_count_21',
 'user_profile_hour_count_22',
 'user_profile_hour_count_23',
 'user_profile_txn_count',
 'user_profile_mean_geo_distance',
 'user_profile_max_geo_distance',
 'user_profile_std_geo_distance',
 'user_profile_geo_unique_locations']

In [158]:
base_features = [
    'target',
    'amount_per_person',
    'population_log',
    'hour', 'weekday', 'is_night',
    # 'lat', 'lon', 'merchant_lat', 'merchant_lon',
    'txn_z_score_vs_user_cat', 'txn_delta_vs_user_cat_median',
    'txn_above_user_cat_p95', 'txn_distance_from_user_home_base'
]

mean_profile_features = [
    'user_profile_mean_cat_entertainment', 'user_profile_mean_cat_food_dining',
    'user_profile_mean_cat_gas_transport', 'user_profile_mean_cat_grocery_net',
    'user_profile_mean_cat_grocery_pos', 'user_profile_mean_cat_health_fitness',
    'user_profile_mean_cat_home', 'user_profile_mean_cat_kids_pets',
    'user_profile_mean_cat_misc_net', 'user_profile_mean_cat_misc_pos',
    'user_profile_mean_cat_personal_care', 'user_profile_mean_cat_shopping_net',
    'user_profile_mean_cat_shopping_pos', 'user_profile_mean_cat_travel'
]

geo_profile_features = [
    'user_profile_mean_geo_distance', 'user_profile_std_geo_distance',
    'user_profile_geo_unique_locations'
]

time_pattern_features = (
    [f'user_profile_weekday_count_{i}' for i in range(7)] +
    [f'user_profile_hour_count_{i}' for i in range(24)]
)

other_profile_features = [
    'user_profile_mean_z_score', 'user_profile_frac_above_p95',
    'user_profile_std_z_score', 'user_profile_mean_delta_from_median'
]

model_features = (
    base_features +
    mean_profile_features +
    geo_profile_features +
    time_pattern_features +
    other_profile_features
)


In [159]:
train = pd.read_csv('../data/clean_data/train_clean.csv')

In [160]:
test = pd.read_csv('../data/clean_data/test_clean.csv')

In [161]:
train.target.value_counts()

target
0    625479
1      3665
Name: count, dtype: int64

In [162]:
X = train[[col for col in model_features if col != 'target']]
y = train['target']

X_test_catboost = test[[col for col in model_features if col != 'target']]
y_test_catboost = test['target'].copy()

In [163]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 200, 2000), 
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 1, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255), 
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True), 
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 100.0), 
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0), 
        'od_type': 'Iter', 
        'od_wait': trial.suggest_int('od_wait', 10, 50) 
    }
 
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=trial.number)
    f1_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostClassifier(
            **params,
            loss_function='Logloss',
            eval_metric='F1', 
            verbose=0, 
            random_seed=42,
        )

        model.fit(X_train_fold, y_train_fold,
                    eval_set=[(X_val_fold, y_val_fold)],
                    early_stopping_rounds=params['od_wait'], 
                    verbose_eval=False) 

        preds_proba_positive_class = model.predict_proba(X_val_fold)[:, 1]
        preds_labels_fold = (preds_proba_positive_class > 0.6).astype(int)
        f1_scores.append(f1_score(y_val_fold, preds_labels_fold))

    return np.mean(f1_scores)


In [164]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=200) 

best_params = study.best_params
print("\nЛучшие гиперпараметры:")
print(best_params)
# best_params = {'iterations': 770, 'learning_rate': 0.0742551431535985, 'depth': 6, 'l2_leaf_reg': 0.47163693819169705, 'border_count': 251, 'random_strength': 0.2576921365353809, 'scale_pos_weight': 2.624072463251195, 'bagging_temperature': 0.08379704346003464, 'od_wait': 46}

[I 2025-05-10 22:44:19,749] A new study created in memory with name: no-name-da0312b3-1c18-43be-bfa9-86b51dd64238


[I 2025-05-10 22:44:45,542] Trial 0 finished with value: 0.8006937177564316 and parameters: {'iterations': 874, 'learning_rate': 0.08927180304353628, 'depth': 8, 'l2_leaf_reg': 0.24810409748678114, 'border_count': 66, 'random_strength': 0.004207053950287938, 'scale_pos_weight': 1.2323344486727978, 'bagging_temperature': 0.8661761457749352, 'od_wait': 34}. Best is trial 0 with value: 0.8006937177564316.
[I 2025-05-10 22:45:06,444] Trial 1 finished with value: 0.39851658115486166 and parameters: {'iterations': 1475, 'learning_rate': 0.010485387725194618, 'depth': 10, 'l2_leaf_reg': 2.136832907235876, 'border_count': 79, 'random_strength': 0.005337032762603957, 'scale_pos_weight': 1.7336180394137353, 'bagging_temperature': 0.3042422429595377, 'od_wait': 31}. Best is trial 0 with value: 0.8006937177564316.
[I 2025-05-10 22:45:22,111] Trial 2 finished with value: 0.7188041289600479 and parameters: {'iterations': 977, 'learning_rate': 0.019553708662745254, 'depth': 7, 'l2_leaf_reg': 0.003613


Лучшие гиперпараметры:
{'iterations': 770, 'learning_rate': 0.0742551431535985, 'depth': 6, 'l2_leaf_reg': 0.47163693819169705, 'border_count': 251, 'random_strength': 0.2576921365353809, 'scale_pos_weight': 2.624072463251195, 'bagging_temperature': 0.08379704346003464, 'od_wait': 46}


In [165]:
categorical_features = train[model_features].select_dtypes(include=['object', 'category']).columns.tolist()
print(categorical_features)

[]


In [171]:
final_model = CatBoostClassifier(
    **best_params,
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=42,
    verbose=100 
)

In [172]:
final_model.fit(X, y,
                eval_set=(X_test_catboost, y_test_catboost),
                early_stopping_rounds=best_params['od_wait'])

0:	learn: 0.7752755	test: 0.7068773	best: 0.7068773 (0)	total: 118ms	remaining: 1m 30s
100:	learn: 0.8748519	test: 0.7898199	best: 0.7905458 (99)	total: 5.51s	remaining: 36.5s
Stopped by overfitting detector  (46 iterations wait)

bestTest = 0.7918134277
bestIteration = 113

Shrink model to first 114 iterations.


<catboost.core.CatBoostClassifier at 0x135eba8d0>

In [173]:
print("\n--- Оценка финальной модели на тестовых данных ---")
test_preds_proba = final_model.predict_proba(X_test_catboost)[:, 1]
test_preds_labels = final_model.predict(X_test_catboost)

roc_auc_test = roc_auc_score(y_test_catboost, test_preds_proba)
f1_test = f1_score(y_test_catboost, test_preds_labels)
precision_test, recall_test, _ = precision_recall_curve(y_test_catboost, test_preds_proba)
pr_auc_test = auc(recall_test, precision_test)

print(classification_report(y_test_catboost, test_preds_labels, target_names=['Not Fraud (0)', 'Fraud (1)']))
print(f"ROC AUC на тесте: {roc_auc_test:.4f}")
print(f"F1-score на тесте: {f1_test:.4f}")
print(f"Precision-Recall AUC на тесте: {pr_auc_test:.4f}")

print("\nМатрица ошибок на тесте:")
print(confusion_matrix(y_test_catboost, test_preds_labels))


--- Оценка финальной модели на тестовых данных ---
               precision    recall  f1-score   support

Not Fraud (0)       1.00      1.00      1.00    156448
    Fraud (1)       0.60      0.79      0.68       839

     accuracy                           1.00    157287
    macro avg       0.80      0.89      0.84    157287
 weighted avg       1.00      1.00      1.00    157287

ROC AUC на тесте: 0.9864
F1-score на тесте: 0.6804
Precision-Recall AUC на тесте: 0.7111

Матрица ошибок на тесте:
[[156007    441]
 [   179    660]]
