#Несбалансированная классификация: выявление случаев мошенничества с кредитными картами

## Introduction

This example looks at the
[Kaggle Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud/)
dataset to demonstrate how
to train a classification model on data with highly imbalanced classes.

In [1]:
# импортируем основные библиотеки
import numpy as np
import pandas as pd

#Загрузим наши данные

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:01<00:00, 64.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3


In [3]:
!wget 'https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3'

--2024-12-06 16:54:58--  https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/310/23498/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241206%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241206T165458Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=5363ec456fdefb2cb926e8e831bdb09a041f6c6f5c10e2842dd86edacdd2fe5fed21a34bb72c1b6546623be1afbc59525a553a04cd5e1f1fc86d1c39d7cb56d1e76b530cbefc2867dcc558a38b72be5cf3690aead2c63f7052f17df80d8f3d072a7a1dd565226be5fc86b330a7834a44f3ec929e575421ff9776ac7ecd0e175f857025f0fe226403a6aae5e93d463044664a6ab38710e26eb2f0323efec25cca3ffcb5eb7c1823be375842abb0143c6733

In [4]:
import zipfile

# Путь к zip-архиву
archive_path = '/content/creditcardfraud?dataset_version_number=3'

# Распаковываем zip-архив
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

In [5]:
# загружаю датасет
path_file = '/content/creditcard.csv'
# загружаю данные
df = pd.read_csv(path_file)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
# проверим на пропуски
df[df.isna().any(axis=1)]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class


In [7]:
# проверим на пропуски
df.isna().sum().sum()

0

#Подготовим набор для обучения

In [8]:
# подготовим данные для обучения
from sklearn.model_selection import train_test_split

X = df.drop(columns='Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42
                                                    )

In [9]:
print(f'Number of training samples: {X_train.shape[0]+1}')
print(f'Number of validation samples: {X_test.shape[0]+1}')

Number of training samples: 227846
Number of validation samples: 56963


#Проведем анализ дисбаланса в целевых группах

In [10]:
counts = np.bincount(y_train)
counts[0]

227451

In [11]:
y_train.shape[0]

227845

In [12]:
# посмотрим на соотношение проложительных и отрицательных транзакций
counts = np.bincount(y_train)
print(f'Всего транзакций: {y_train.shape[0]}. Из них:')
print(f'Реальных: {counts[0]}. Мошеннических: {counts[1]}, ({(counts[1]/counts[0])*100 :0.2f}%)')

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

Всего транзакций: 227845. Из них:
Реальных: 227451. Мошеннических: 394, (0.17%)


In [13]:
# настроим веса для балансировки модели
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]
print(f'weight_for_0={weight_for_0}, weight_for_1 ={weight_for_1}')
print(f'справочно: weight_for_1/weight_for_0= {weight_for_1/weight_for_0 :0.2f}')

weight_for_0=4.396551345124884e-06, weight_for_1 =0.0025380710659898475
справочно: weight_for_1/weight_for_0= 577.29


#Нормализуем данные и проверим нормализацию

In [14]:
# данные ло нормализации
X_train.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
223361,143352.0,1.955041,-0.380783,-0.315013,0.330155,-0.509374,-0.086197,-0.627978,0.035994,1.05456,...,-0.12539,0.238197,0.968305,0.053208,-0.278602,-0.044999,-0.21678,0.045168,-0.047145,9.99
165061,117173.0,-0.400975,-0.626943,1.555339,-2.017772,-0.107769,0.16831,0.017959,-0.401619,0.040378,...,-0.470372,-0.153485,0.421703,0.113442,-1.004095,-1.176695,0.361924,-0.370469,-0.144792,45.9
238186,149565.0,0.072509,0.820566,-0.561351,-0.709897,1.080399,-0.359429,0.787858,0.117276,-0.131275,...,0.012227,-0.314638,-0.872959,0.083391,0.148178,-0.431459,0.11969,0.206395,0.070288,11.99
150562,93670.0,-0.535045,1.014587,1.750679,2.76939,0.500089,1.00227,0.847902,-0.081323,0.371579,...,-0.253757,0.063525,0.443431,-0.072754,0.448192,-0.655203,-0.181038,-0.093013,-0.064931,117.44
138452,82655.0,-4.026938,1.897371,-0.429786,-0.029571,-0.855751,-0.480406,-0.435632,1.31376,0.536044,...,-0.01232,-0.480691,-0.230369,0.250717,0.066399,0.470787,0.245335,0.286904,-0.322672,25.76


In [None]:
mean = np.mean(train_features, axis=0)
train_features -= mean
val_features -= mean

std = np.std(train_features, axis=0)
train_features /= std
val_features /= std

In [39]:
# нормализуем
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_features = scaler.fit_transform(X_train)
val_features = scaler.fit_transform(X_test)
#X_train = pd.DataFrame(StandardScaler().fit_transform(X_train), columns=X_train.columns)
#X_test = pd.DataFrame(StandardScaler().fit_transform(X_test), columns=X_test.columns)
train_features

array([[ 1.02255459,  0.99785119, -0.22962626, ...,  0.11248883,
        -0.14374055, -0.30788875],
       [ 0.47128275, -0.205221  , -0.37821992, ..., -0.92189789,
        -0.43984143, -0.1670264 ],
       [ 1.15338663,  0.03655821,  0.49556347, ...,  0.51372993,
         0.21235767, -0.30004345],
       ...,
       [-0.31581527, -0.07533181,  0.59962034, ..., -0.3013968 ,
        -0.59571596, -0.33162078],
       [-0.1444891 , -1.50615534,  1.42172842, ...,  1.23673372,
         1.01755287, -0.34315338],
       [-0.38770656,  0.62923844, -0.47354037, ...,  0.00310728,
         0.11623211,  0.0961834 ]])

In [25]:
# проверим
train_features.mean(axis=0)

array([-9.48034812e-18,  1.25988837e-17, -2.93142343e-18,  9.91694310e-18,
        2.12060418e-18,  1.19128059e-17,  1.40334100e-17,  1.18504351e-18,
       -4.67780335e-18, -9.01256778e-18, -1.49689707e-18,  1.39086686e-17,
        7.48448536e-19,  1.19751766e-17,  4.50628389e-18, -1.22558448e-17,
       -4.98965690e-19, -5.50421527e-18, -5.36388117e-18,  8.79427029e-18,
       -5.34828849e-18, -9.99490649e-18,  6.23707113e-18,  3.46157448e-18,
        5.86284686e-18, -2.74431130e-18,  1.33473322e-17,  3.43038912e-18,
        3.43038912e-18, -9.35560669e-18])

In [27]:
train_features.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [32]:
train_features.shape[1]

30

#Построим модель бинарной классификации

In [19]:
from tensorflow import keras

In [40]:
# построим модель
hid_size = 128
model = keras.Sequential(
    [
        keras.layers.Dense(hid_size, activation='relu', input_shape=(train_features.shape[1],)),
        keras.layers.Dense(hid_size*2, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(hid_size, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation='sigmoid')
    ]
)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


#Обучим модель с `class_weight` аргументом

In [41]:
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss='binary_crossentropy', metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint('fraud_model_at_epoch_{epoch}.keras')]
class_weight = {0: weight_for_0, 1: weight_for_1}

model.fit(
    train_features,
    y_train.values,
    batch_size=2048,
    epochs=10,
    callbacks=callbacks,
    validation_data=(val_features, y_test.values),
    class_weight=class_weight,
)


Epoch 1/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - fn: 26.3717 - fp: 16636.7070 - loss: 3.0651e-06 - precision: 0.0099 - recall: 0.8726 - tn: 99845.4531 - tp: 182.2478 - val_fn: 11.0000 - val_fp: 1103.0000 - val_loss: 0.0783 - val_precision: 0.0731 - val_recall: 0.8878 - val_tn: 55761.0000 - val_tp: 87.0000
Epoch 2/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 49ms/step - fn: 17.9381 - fp: 3521.7700 - loss: 1.4420e-06 - precision: 0.0674 - recall: 0.9204 - tn: 112957.2656 - tp: 193.8053 - val_fn: 3.0000 - val_fp: 5126.0000 - val_loss: 0.2331 - val_precision: 0.0182 - val_recall: 0.9694 - val_tn: 51738.0000 - val_tp: 95.0000
Epoch 3/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - fn: 10.0000 - fp: 4403.3276 - loss: 1.1767e-06 - precision: 0.0386 - recall: 0.9571 - tn: 112089.4609 - tp: 187.9911 - val_fn: 7.0000 - val_fp: 2712.0000 - val_loss: 0.1276 - val_precision: 0.0325 - val_recall: 0.9286 

<keras.src.callbacks.history.History at 0x7dbdeff02740>

In [None]:
#    P(1)   N(0)
# T
# F

In [None]:
# n_samples ~ 300000
# batch_size ~ 3000
# n_steps ~ 100
# n_epochs ~ 1

## Conclusions

At the end of training, out of 56,961 validation transactions, we are:

- Correctly identifying 66 of them as fraudulent
- Missing 9 fraudulent transactions
- At the cost of incorrectly flagging 441 legitimate transactions

In the real world, one would put an even higher weight on class 1,
so as to reflect that False Negatives are more costly than False Positives.

Next time your credit card gets  declined in an online purchase -- this is why.