In [None]:
# !pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-win_amd64.whl.metadata (4.6 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf>=5.28.0 (from tensorflow)
  Using cached proto

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.37.1 requires protobuf<6,>=3.20, but you have protobuf 6.33.0 which is incompatible.


In [117]:
"""
Preprocessing utilities for fraud detection
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def load_data(csv_path='dataset/transactions.csv'):
    """Load and perform initial data cleaning"""
    df = pd.read_csv(csv_path)
    print('Loaded dataset with shape:', df.shape)
    
    if csv_path == '../dataset/resampled_data.csv':
        df = df.dropna()
        df['target'] = df['target'].map({False: 0, True: 1})
        df['isFraud'] = df['target']
        df.drop(["enteredCVV", "creditLimit", "noacqCountry", 
                 "acqCountry_CAN", "acqCountry_MEX", "acqCountry_PR",
                 "acqCountry_US", "target"], 
                 axis=1, inplace=True)
        return df

    # Drop all null columns
    columns_to_drop = [
        "Unnamed: 0", "enteredCVV", "creditLimit", 
        "acqCountry","customerId", "echoBuffer", 
        "merchantCity", "merchantState", "merchantZip", 
        "posOnPremises", "recurringAuthInd"
    ]
    df = df.drop(columns_to_drop, axis=1)
    
    return df


def one_hot_encode_categorical(df):
    """Apply one-hot encoding to categorical columns"""
    print('Starting one-hot encoding...\n')
    
    columns_with_nulls = ['acqCountry', 'merchantCountryCode', 'transactionType']
    columns_without_nulls = ['merchantCategoryCode']
    all_encode_columns = columns_with_nulls + columns_without_nulls
    
    # Handle columns with nulls - create indicator columns
    for col in columns_with_nulls:
        if col in df.columns:
            null_indicator_col = f'no{col}'
            df[null_indicator_col] = df[col].isnull().astype(int)
            df[col] = df[col].fillna('MISSING')
    
    # Perform one-hot encoding
    encoded_dfs = []
    for col in all_encode_columns:
        if col in df.columns:
            one_hot = pd.get_dummies(df[col], prefix=col, drop_first=False)
            
            if col in columns_with_nulls:
                missing_col_name = f'{col}_MISSING'
                if missing_col_name in one_hot.columns:
                    one_hot = one_hot.drop(columns=[missing_col_name])
            
            encoded_dfs.append(one_hot)
            df = df.drop(columns=[col])
    
    if encoded_dfs:
        df = pd.concat([df] + encoded_dfs, axis=1)
    
    print(f'Encoding complete! New shape: {df.shape}\n')
    return df


def convert_dates_to_numeric(df):
    """Convert date columns to days difference"""
    print('Converting date columns to numeric features...\n')
    
    date_columns = {
        'currentExpDate': 'daysToCurrentExpDate',
        'accountOpenDate': 'daysSinceAccountOpen',
        'dateOfLastAddressChange': 'daysSinceLastAddressChange'
    }
    
    df['transactionDateTime'] = pd.to_datetime(df['transactionDateTime'], errors='coerce')
    
    for original_col, new_col in date_columns.items():
        if original_col in df.columns:
            df[original_col] = pd.to_datetime(df[original_col], errors='coerce')
            df[new_col] = (df['transactionDateTime'] - df[original_col]).dt.days
            
            if new_col == "daysToCurrentExpDate":
                df[new_col] = -df[new_col]
            
            df = df.drop(columns=[original_col])
    
    df.drop(['transactionDateTime'], axis=1, inplace=True)
    print('Date conversion complete!\n')
    return df


def ordinal_encode_merchant(df):
    """Apply ordinal encoding to merchantName based on fraud probability"""
    print('Applying ordinal encoding to merchantName...\n')
    
    if 'merchantName' not in df.columns:
        print('merchantName column not found - skipping')
        return df
    
    merchant_stats = df.groupby('merchantName').agg({
        'isFraud': ['sum', 'count']
    }).reset_index()
    
    merchant_stats.columns = ['merchantName', 'fraud_count', 'total_count']
    merchant_stats['prob_fraud'] = merchant_stats['fraud_count'] / merchant_stats['total_count']
    merchant_stats['score'] = merchant_stats['prob_fraud']
    merchant_stats = merchant_stats.sort_values('score', ascending=True).reset_index(drop=True)
    merchant_stats['ordinal_rank'] = range(len(merchant_stats))
    
    merchant_to_rank = dict(zip(merchant_stats['merchantName'], merchant_stats['ordinal_rank']))
    df['merchantName_ordinal'] = df['merchantName'].map(merchant_to_rank)
    
    unmapped_count = df['merchantName_ordinal'].isnull().sum()
    if unmapped_count > 0:
        median_rank = merchant_stats['ordinal_rank'].median()
        df['merchantName_ordinal'].fillna(median_rank, inplace=True)
    
    df = df.drop(columns=['merchantName'])
    print(f'Ordinal encoding complete! Total merchants: {len(merchant_stats)}\n')
    return df


def prepare_train_test_split(df, test_size=0.2, random_state=42):
    """Prepare X, y and create stratified train/test split"""
    if 'isFraud' not in df.columns:
        raise KeyError("Column 'isFraud' not found in dataframe")
    
    y = df['isFraud']
    X = df.drop(columns=['isFraud'])
    
    print(f'X shape: {X.shape}')
    print(f'y shape: {y.shape}')
    print(f'Fraud rate: {y.mean():.4f}\n')
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    
    print(f'Train shapes -> X: {X_train.shape}, y: {y_train.shape}')
    print(f'Test shapes  -> X: {X_test.shape}, y: {y_test.shape}')
    print(f'Train fraud rate: {y_train.mean():.4f}')
    print(f'Test fraud rate: {y_test.mean():.4f}\n')
    
    return X_train, X_test, y_train, y_test


def preprocess_pipeline(csv_path='../dataset/transactions.csv'):
    """Full preprocessing pipeline"""
    print('='*60)
    print('STARTING PREPROCESSING PIPELINE')
    print('='*60 + '\n')
    
    # Load data
    df = load_data(csv_path)

    if csv_path != '../dataset/resampled_data.csv':
        # One-hot encoding
        df = one_hot_encode_categorical(df)
        
        # Date conversion
        df = convert_dates_to_numeric(df)
        
        # Merchant encoding
        df = ordinal_encode_merchant(df)
    
    # 2. Preprocesare (scalare)
    # scaler = StandardScaler().set_output(transform="pandas")
    # df_scaled = scaler.fit_transform(df)

    df = df.dropna()

    df_scaled = df / df.max()
    df_scaled = df_scaled.apply(pd.to_numeric, errors='coerce')
    
    # Train/test split
    X_train, X_test, y_train, y_test = prepare_train_test_split(df_scaled)

    print('='*60)
    print('PREPROCESSING COMPLETE')
    print('='*60 + '\n')
    
    return X_train, X_test, y_train, y_test, df


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

X_train, X_test, y_train, y_test, df = preprocess_pipeline('../dataset/transactions.csv')
aux = y_test.copy()

STARTING PREPROCESSING PIPELINE

Loaded dataset with shape: (786363, 30)
Starting one-hot encoding...

Encoding complete! New shape: (786363, 44)

Converting date columns to numeric features...



  df[original_col] = pd.to_datetime(df[original_col], errors='coerce')


Date conversion complete!

Applying ordinal encoding to merchantName...

Ordinal encoding complete! Total merchants: 2490

X shape: (781903, 42)
y shape: (781903,)
Fraud rate: 0.0155

Train shapes -> X: (625522, 42), y: (625522,)
Test shapes  -> X: (156381, 42), y: (156381,)
Train fraud rate: 0.0155
Test fraud rate: 0.0155

PREPROCESSING COMPLETE

<class 'pandas.core.frame.DataFrame'>
Index: 625522 entries, 509005 to 259779
Data columns (total 42 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   accountNumber                              625522 non-null  float64
 1   availableMoney                             625522 non-null  float64
 2   transactionAmount                          625522 non-null  float64
 3   posEntryMode                               625522 non-null  float64
 4   posConditionCode                           625522 non-null  float64
 5   cardCVV                

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# from scipy.stats import pointbiserialr

# df['isFraud'] = df['isFraud'].astype('int64')
# numerical_features = df.select_dtypes(include=['float64', 'int64']).columns.drop('isFraud')

# # 1. Plot istograme pentru fiecare feature numeric, diferențiat pe fraud / non-fraud:
# for col in numerical_features:
#     plt.figure(figsize=(8, 4))
#     sns.histplot(data=df, x=col, hue="isFraud", bins=50, kde=True, stat="density", element="step")
#     plt.title(f'Distribuție {col} pe fraude vs non-fraude')
#     plt.show()

# # 2. Heatmap corelații între variabile numerice și target
# correlations = {}
# for col in numerical_features:
#     corr, _ = pointbiserialr(df[col].fillna(0), df['isFraud'])
#     correlations[col] = corr

# corr_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['corr_with_target']).sort_values(by='corr_with_target', ascending=False)
# plt.figure(figsize=(6, len(corr_df)//2))
# sns.heatmap(corr_df, annot=True, cmap='coolwarm')
# plt.title('Corelația fiecărei variabile cu targetul')
# plt.show()

In [121]:

# def soft_f1_loss(y_true, y_pred, epsilon=1e-7):
#     y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
#     tp = tf.reduce_sum(y_true * y_pred)
#     fp = tf.reduce_sum((1 - y_true) * y_pred)
#     fn = tf.reduce_sum(y_true * (1 - y_pred))
#     soft_f1 = 2 * tp / (2 * tp + fp + fn + epsilon)
#     return 1 - soft_f1  # Vrem să maximizăm F1, deci minimizăm 1-F1
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.35),
    layers.Dense(16, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.15),
    layers.Dense(1, activation='sigmoid')
])
optimizer=keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['Recall', 'Precision', keras.metrics.AUC()]
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=10,
)
# Predict with custom threshold
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba >= 0.6).astype(int)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4887/4887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - Precision: 0.0166 - Recall: 0.0234 - auc_30: 0.5650 - loss: 0.1332 - val_Precision: 0.0000e+00 - val_Recall: 0.0000e+00 - val_auc_30: 0.7125 - val_loss: 0.0750
Epoch 2/10
[1m4887/4887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - Precision: 0.0000e+00 - Recall: 0.0000e+00 - auc_30: 0.6878 - loss: 0.0764 - val_Precision: 0.0000e+00 - val_Recall: 0.0000e+00 - val_auc_30: 0.7563 - val_loss: 0.0729
Epoch 3/10
[1m4887/4887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - Precision: 0.5000 - Recall: 1.0307e-04 - auc_30: 0.7210 - loss: 0.0749 - val_Precision: 0.0000e+00 - val_Recall: 0.0000e+00 - val_auc_30: 0.7574 - val_loss: 0.0726
Epoch 4/10
[1m4887/4887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - Precision: 0.0000e+00 - Recall: 0.0000e+00 - auc_30: 0.7307 - loss: 0.0743 - val_Precision: 0.0000e+00 - val_Recall: 0.0000e+00 - val_auc_30: 0.7618 - val_los

In [122]:
from sklearn.metrics import classification_report

print(y_test.isna().sum())
print(y_test)

print(y_test.shape)
print(y_pred.shape)

print(classification_report(y_test, y_pred))

print("F1-score:", f1_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

0
255354    0.0
223774    0.0
512020    0.0
441847    0.0
308933    0.0
         ... 
174404    0.0
636704    0.0
15828     0.0
684132    0.0
118390    0.0
Name: isFraud, Length: 156381, dtype: float64
(156381,)
(156381, 1)
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99    153956
         1.0       0.00      0.00      0.00      2425

    accuracy                           0.98    156381
   macro avg       0.49      0.50      0.50    156381
weighted avg       0.97      0.98      0.98    156381

F1-score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall: 0.0
Precision: 0.0
Accuracy: 0.9844930010679047
AUC: 0.7823672850162304


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [79]:
X_train, X_test, y_train, y_test, df = preprocess_pipeline('../dataset/transactions.csv')

y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba >= 0.5).astype(int)

print(classification_report(y_test, y_pred))

print("F1-score:", f1_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

STARTING PREPROCESSING PIPELINE

Loaded dataset with shape: (786363, 30)
Starting one-hot encoding...

Encoding complete! New shape: (786363, 44)

Converting date columns to numeric features...



  df[original_col] = pd.to_datetime(df[original_col], errors='coerce')


Date conversion complete!

Applying ordinal encoding to merchantName...

Ordinal encoding complete! Total merchants: 2490

X shape: (786363, 42)
y shape: (786363,)
Fraud rate: 0.0158

Train shapes -> X: (629090, 42), y: (629090,)
Test shapes  -> X: (157273, 42), y: (157273,)
Train fraud rate: 0.0158
Test fraud rate: 0.0158

PREPROCESSING COMPLETE

[1m4915/4915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 505us/step
              precision    recall  f1-score   support

       False       0.00      0.00      0.00    154790
        True       0.02      1.00      0.03      2483

    accuracy                           0.02    157273
   macro avg       0.01      0.50      0.02    157273
weighted avg       0.00      0.02      0.00    157273

F1-score: 0.03108490447933098
Recall: 1.0
Precision: 0.015787833893929664
Accuracy: 0.015787833893929664
AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
