#Несбалансированная классификация: выявление случаев мошенничества с кредитными картами

## Introduction

This example looks at the
[Kaggle Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud/)
dataset to demonstrate how
to train a classification model on data with highly imbalanced classes.

In [6]:
# импортируем основные библиотеки
import numpy as np
import pandas as pd

#Загрузим наши данные

In [11]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:02<00:00, 23.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3


In [12]:
!wget 'https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3'

--2024-12-06 07:35:50--  https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/310/23498/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241206%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241206T073550Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=674d9ef0ec5f105cb38851a2baf62e9bf925526a1fa256d7d368b4c0df9ed6b012ff9387d0b155de742c99624964d9e48abad785f1d9d4cb5b81fe92eec46549ab8bd92922a69729a87f4a6a8275451785b010e647349f4c8610d844885d5ef07cb6f1a01359d8fba1d1fc9eff0ebef764590bf7f7791da02466c20461389476ec8bfca5037fd7a52debf1f8c5ff5ae7eaf66a59ae3cf9dd29350e9266c0d69fc0b8c7d97320064f2beaae59846f052889

In [14]:
import zipfile

# Путь к zip-архиву
archive_path = '/content/creditcardfraud?dataset_version_number=3'

# Распаковываем zip-архив
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

In [15]:
# загружаю датасет
path_file = '/content/creditcard.csv'
# загружаю данные
df = pd.read_csv(path_file)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [20]:
# проверим на пропуски
df[df.isna().any(axis=1)]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class


In [23]:
# проверим на пропуски
df.isna().sum().sum()

0

#Подготовьте набор для проверки

In [16]:
# подготовим данные для обучения
from sklearn.model_selection import train_test_split

X = df.drop(columns='Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42
                                                    )

In [19]:
print(f'Number of training samples: {X_train.shape[0]+1}')
print(f'Number of validation samples: {X_test.shape[0]+1}')

Number of training samples: 227846
Number of validation samples: 56963


#Проведем анализ дисбаланса в целевых группах

In [29]:
counts = np.bincount(y_train)
counts[0]

227451

In [28]:
y_train.shape[0]

227845

In [35]:
# посмотрим на соотношение проложительных и отрицательных транзакций
counts = np.bincount(y_train)
print(f'Всего транзакций: {y_train.shape[0]}. Из них:')
print(f'Реальных: {counts[0]}. Мошеннических: {counts[1]}, ({(counts[1]/counts[0])*100 :0.2f}%)')

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

Всего транзакций: 227845. Из них:
Реальных: 227451. Мошеннических: 394, (0.17%)


In [39]:
# настроим веса для балансировки модели
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]
print(f'weight_for_0={weight_for_0}, weight_for_1 ={weight_for_1}')
print(f'справочно: weight_for_1/weight_for_0= {weight_for_1/weight_for_0 :0.2f}')

weight_for_0=4.396551345124884e-06, weight_for_1 =0.0025380710659898475
справочно: weight_for_1/weight_for_0= 577.29


#Нормализуем данные и проверим нормализацию

In [40]:
# данные ло нормализации
X_train.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
223361,143352.0,1.955041,-0.380783,-0.315013,0.330155,-0.509374,-0.086197,-0.627978,0.035994,1.05456,...,-0.12539,0.238197,0.968305,0.053208,-0.278602,-0.044999,-0.21678,0.045168,-0.047145,9.99
165061,117173.0,-0.400975,-0.626943,1.555339,-2.017772,-0.107769,0.16831,0.017959,-0.401619,0.040378,...,-0.470372,-0.153485,0.421703,0.113442,-1.004095,-1.176695,0.361924,-0.370469,-0.144792,45.9
238186,149565.0,0.072509,0.820566,-0.561351,-0.709897,1.080399,-0.359429,0.787858,0.117276,-0.131275,...,0.012227,-0.314638,-0.872959,0.083391,0.148178,-0.431459,0.11969,0.206395,0.070288,11.99
150562,93670.0,-0.535045,1.014587,1.750679,2.76939,0.500089,1.00227,0.847902,-0.081323,0.371579,...,-0.253757,0.063525,0.443431,-0.072754,0.448192,-0.655203,-0.181038,-0.093013,-0.064931,117.44
138452,82655.0,-4.026938,1.897371,-0.429786,-0.029571,-0.855751,-0.480406,-0.435632,1.31376,0.536044,...,-0.01232,-0.480691,-0.230369,0.250717,0.066399,0.470787,0.245335,0.286904,-0.322672,25.76


In [None]:
# нормализуем

In [None]:
print('Before normalization: ', train_features[2])
mean = np.mean(train_features, axis=0)
train_features -= mean
val_features -= mean

std = np.std(train_features, axis=0)
train_features /= std
val_features /= std
print('After normalization: ', train_features[2])

Before normalization:  [ 1.0000000e+00 -1.3583541e+00 -1.3401631e+00  1.7732093e+00
  3.7977961e-01 -5.0319815e-01  1.8004994e+00  7.9146093e-01
  2.4767579e-01 -1.5146543e+00  2.0764287e-01  6.2450147e-01
  6.6083685e-02  7.1729273e-01 -1.6594592e-01  2.3458650e+00
 -2.8900833e+00  1.1099694e+00 -1.2135931e-01 -2.2618570e+00
  5.2497971e-01  2.4799815e-01  7.7167940e-01  9.0941226e-01
 -6.8928093e-01 -3.2764184e-01 -1.3909657e-01 -5.5352796e-02
 -5.9751842e-02  3.7866000e+02]
After normalization:  [-2.000831   -0.6643839  -0.800215    1.0673089   0.23807637 -0.32006603
  1.3394924   0.6662744   0.20148349 -1.3502778   0.19138344  0.5304687
  0.10520667  0.68714315 -0.20587935  2.4593623  -3.2572083   1.252478
 -0.11686509 -2.7495122   0.6611577   0.34202614  1.1212966   1.4581116
 -1.1393514  -0.72019994 -0.29163548 -0.13880983 -0.18471171  1.1489743 ]


In [None]:
train_features.mean(axis=0)

array([ 1.19692659e-05,  4.37138226e-07, -3.57809427e-08,  3.00485590e-06,
       -8.66645451e-07, -6.25757195e-07,  4.03243689e-07, -4.61278717e-07,
        1.01057346e-07,  5.14432301e-08,  1.14444694e-08,  1.28217948e-06,
       -5.83659698e-07,  2.37660203e-07, -2.49136065e-07,  8.00532291e-07,
        1.66812217e-08,  1.94149749e-07,  1.51190221e-07,  9.69690532e-08,
        3.50928673e-07,  1.09110331e-07, -2.78042961e-07, -2.10823060e-07,
        7.19980386e-09,  2.20549555e-06,  4.22972818e-07,  3.48672025e-08,
        7.06674719e-09,  2.68111944e-05], dtype=float32)

In [None]:
train_features.std(axis=0)

array([1.0000184 , 1.0000031 , 0.99999654, 0.9999961 , 1.0000012 ,
       1.0000044 , 1.0000013 , 1.0000077 , 1.0000154 , 0.9999921 ,
       1.0000087 , 0.999995  , 1.0000061 , 1.000002  , 1.0000014 ,
       1.0000093 , 1.0000136 , 1.0000196 , 1.0000007 , 1.0000111 ,
       0.9999998 , 1.0000029 , 0.9999994 , 1.0000079 , 1.0000101 ,
       1.0000007 , 1.0000094 , 1.0000167 , 0.9999909 , 0.99979484],
      dtype=float32)

## Build a binary classification model

In [None]:
from tensorflow import keras

hid_size = 256
model = keras.Sequential(
    [
        keras.layers.Dense(
            hid_size, activation="relu", input_shape=(train_features.shape[-1],)
        ), # fully-connected y^1
        keras.layers.Dense(hid_size*2, activation="relu"), # y^2
        keras.layers.Dropout(0.3),
        keras.layers.Dense(hid_size, activation="relu"), # y^3
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"), # y^4
    ]
)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Train the model with `class_weight` argument

In [None]:
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.keras")]
class_weight = {0: weight_for_0, 1: weight_for_1}

model.fit(
    train_features,
    train_targets,
    batch_size=2048,
    epochs=30,
    callbacks=callbacks,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
)


Epoch 1/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 57ms/step - fn: 28.5310 - fp: 20892.7793 - loss: 3.4427e-06 - precision: 0.0087 - recall: 0.8617 - tn: 95570.1797 - tp: 199.3097 - val_fn: 10.0000 - val_fp: 874.0000 - val_loss: 0.0639 - val_precision: 0.0692 - val_recall: 0.8667 - val_tn: 56012.0000 - val_tp: 65.0000
Epoch 2/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - fn: 14.4248 - fp: 3151.8672 - loss: 1.2898e-06 - precision: 0.0689 - recall: 0.9358 - tn: 113326.2656 - tp: 198.2389 - val_fn: 8.0000 - val_fp: 1219.0000 - val_loss: 0.1124 - val_precision: 0.0521 - val_recall: 0.8933 - val_tn: 55667.0000 - val_tp: 67.0000
Epoch 3/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - fn: 13.8142 - fp: 3407.5752 - loss: 1.0343e-06 - precision: 0.0705 - recall: 0.9483 - tn: 113059.2891 - tp: 210.1151 - val_fn: 12.0000 - val_fp: 216.0000 - val_loss: 0.0190 - val_precision: 0.2258 - val_recall: 0.8400 - 

<keras.src.callbacks.history.History at 0x7d85517f36d0>

In [None]:
#    P(1)   N(0)
# T
# F

In [None]:
# n_samples ~ 300000
# batch_size ~ 3000
# n_steps ~ 100
# n_epochs ~ 1

## Conclusions

At the end of training, out of 56,961 validation transactions, we are:

- Correctly identifying 66 of them as fraudulent
- Missing 9 fraudulent transactions
- At the cost of incorrectly flagging 441 legitimate transactions

In the real world, one would put an even higher weight on class 1,
so as to reflect that False Negatives are more costly than False Positives.

Next time your credit card gets  declined in an online purchase -- this is why.