# Case Study: Real-Time Credit Card Fraud Detection

## Background: real-time feature engineering with DBSP

**Feature engineering** is the process of transforming raw data into a set of features that can be used to improve the predictive accuracy of an ML model.  Features are often expressed as SQL queries that filter, transform, aggregate, and join raw data.  These queries are evaluated by an RDBMS, e.g., Postgres, and the results are fed to the ML model during training and inference stages.

**Real-time feature engineering** arises in applications where data arrives continuously and requires immediate analysis, such as fraud detection and anomaly detection.  The main challenge in this process is extracting features from constantly changing data.  Although streaming analytics platforms like Flink can handle simple cases, these platforms fall short when it comes to the complex SQL queries that feature engineers commonly use (we will see examples of such queries in this case study!).  A common workaround is to precompute features through periodic batch jobs in an RDBMS such as BigQuery and inject the precomputed features into the real-time data stream.  This approach allows arbitrary feature queries but sacrifices **feature freshness**, resulting in poor prediction accuracy in many real-time ML applications since precomputed features do not reflect the latest input data.

**DBSP aims to provide the benefits of both worlds** by evaluating complex feature queries directly on streaming data, eliminating the need for batch jobs and delivering perfect feature freshness.

## About this case study

Our goal in this case study is two-fold:

1. To illustrate how ML engineers can invoke DBSP from a Jupyter notebook to evaluate feature extraction queries on streaming data during model training, testing, and inference.
1. To empirically prove that DBSP enhances prediction accuracy in real-time ML.  Specifically, we demonstrate that **both complex queries and data freshness are critical for achieving high accuracy in real-time ML applications**.

This case study is based on the credit card fraud detection solution published by the Google Cloud blog:
https://cloud.google.com/blog/products/data-analytics/how-to-build-a-fraud-detection-solution

In [None]:
import os
import tarfile
import gdown
from os import path
import pandas as pd
from datetime import datetime
import geopy
import geopy.distance
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, roc_curve
from collections import Counter
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

## Download datasets from Google Drive

In [None]:
def download_dataset(fileid: str, filepath: str) -> None:
    """Download and unpack a dataset archive unless ``filepath`` already exists.

    Args:
        fileid: Google Drive file id appended to the download URL.
        filepath: Path of the extracted dataset file.  If it is already
            present nothing is downloaded.  Otherwise the archive is saved
            as ``filepath + ".tar.gz"`` and unpacked into the ``fraud_data``
            directory (relative to the current working directory).
    """
    if path.isfile(filepath):
        print("Found dataset " + filepath)
        return

    print("Downloading dataset " + filepath)
    archive_path = filepath + ".tar.gz"
    # Make sure the directory the archive is saved into exists on a first run.
    os.makedirs(path.dirname(archive_path) or ".", exist_ok=True)
    gdown.download("https://drive.google.com/uc?id=" + fileid, archive_path, quiet=False)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(archive_path) as archive:
        archive.extractall(path = "fraud_data")
    

# Pick the dataset variant: each entry pairs a Google Drive file id with the
# CSV file it unpacks to.  The CI branch is selected by the IN_CI environment
# variable and uses the *_ci files.
if 'IN_CI' in os.environ:
    dataset_sources = [
        ("1L-61nquAtBWUpwYcyqn31PyfL_IMh5DF", "fraud_data/train_ci.csv"),
        ("1RUm_24nklQtDPNdPstoviptPvSh22FAj", "fraud_data/test_ci.csv"),
        ("1n94Tll7yY0jrASh8O1-zYRwkylZNkrvf", "fraud_data/simulation_ci.csv"),
    ]
else:
    dataset_sources = [
        ("1pFyoCc1LFnnszA5MknRtoF7saI2GKUyo", "fraud_data/train.csv"),
        ("1u4yC8ypGmCWUI3LhWIW_bynTEXJEWJhI", "fraud_data/test.csv"),
        ("1zTPlCFdkl1slvFSQRf4VkqJQx5_-zXNR", "fraud_data/simulation_short.csv"),
    ]

# Absolute input paths, in the order train / test / simulation.
train_path, test_path, simulation_path = [
    path.abspath(relative_name) for _, relative_name in dataset_sources
]

# Fetch whichever datasets are not on disk yet, in the same order.
for (drive_id, _), local_path in zip(dataset_sources, (train_path, test_path, simulation_path)):
    download_dataset(drive_id, local_path)

# Locations of the remaining input file and of the files the pipelines write.
outputdir = path.abspath("fraud_data/")
demographics_path = path.abspath("fraud_data/demographics.csv")
train_outpath = path.abspath("fraud_data/train_output.csv")
test_outpath = path.abspath("fraud_data/test_output.csv")
simulation_outpath = path.abspath("fraud_data/simulation_output.csv")


# Sample sizes and column-name lists shared by the notebook.
train_sample = 10000
test_sample = 5000
transaction_catagorical_columns = ["category", "merchant"]
demographics_catagorical_columns = ["first", "gender", "state", "street", "city", "job"]
features_names = [
    'day', 'age', 'distance', 'category', 'amt', 'state',
    'job', 'unix_time', 'city_pop', 'merchant', 'is_fraud',
]

env: IN_CI=1
Found dataset /home/leonid/projects/database-stream-processor/demo/demo_notebooks/fraud_data/train_ci.csv
Found dataset /home/leonid/projects/database-stream-processor/demo/demo_notebooks/fraud_data/test_ci.csv
Found dataset /home/leonid/projects/database-stream-processor/demo/demo_notebooks/fraud_data/simulation_ci.csv


## Prepare a feature extraction query

In [None]:
# Define and compile the feature-extraction program.
#
# The SQL below declares two input tables (`demographics`, `transactions`) and
# one view (`features`).  The view joins every transaction with the card
# holder's demographics on `cc_num` and derives, per card:
#   - the day of week of the transaction (column `d`),
#   - the distance between the card holder's and the merchant's coordinates,
#   - the average amount over the preceding week and the preceding 30 days,
#   - the number of transactions in the preceding 24 hours,
# plus a set of columns passed through unchanged, ending with `is_fraud`.
# The window bounds are expressed in seconds over `unix_time`.
#
# NOTE(review): the `age` and `trans_diff` features are commented out in the
# view, while `features_names` (defined in an earlier cell) still lists 'age'
# and 'day' -- confirm which column list is authoritative.
from dbsp import DBSPConnection

# presumably a DBSP server must already be listening on this address -- confirm
dbsp = DBSPConnection("http://localhost:8080")
project = dbsp.create_or_replace_program(name = "fraud_feature_query", sql_code = """
CREATE TABLE demographics (
    cc_num FLOAT64 NOT NULL,
    first STRING,
    gender STRING,
    street STRING,
    city STRING,
    state STRING,
    zip INTEGER,
    lat FLOAT64 NOT NULL,
    long FLOAT64 NOT NULL,
    city_pop INTEGER,
    job STRING,
    dob STRING
    --dob DATE
);

CREATE TABLE transactions (
    trans_date_trans_time TIMESTAMP NOT NULL,
    cc_num FLOAT64 NOT NULL,
    merchant STRING,
    category STRING,
    amt FLOAT64,
    trans_num STRING,
    unix_time INTEGER NOT NULL,
    merch_lat FLOAT64 NOT NULL,
    merch_long FLOAT64 NOT NULL,
    is_fraud INTEGER
);

CREATE VIEW features as
    SELECT
        DAYOFWEEK(trans_date_trans_time) AS d,
        -- TIMESTAMPDIFF(YEAR, trans_date_trans_time, CAST(dob as TIMESTAMP)) AS age,
        ST_DISTANCE(ST_POINT(long,lat), ST_POINT(merch_long,merch_lat)) AS distance,
        -- TIMESTAMPDIFF(MINUTE, trans_date_trans_time, last_txn_date) AS trans_diff,
        AVG(amt) OVER(
            PARTITION BY   CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 week is 604800  seconds
            RANGE BETWEEN 604800  PRECEDING AND 1 PRECEDING) AS
        avg_spend_pw,
        AVG(amt) OVER(
            PARTITION BY  CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 month(30 days) is 2592000 seconds
            RANGE BETWEEN 2592000 PRECEDING AND 1 PRECEDING) AS
        avg_spend_pm,
        COUNT(*) OVER(
            PARTITION BY  CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 day is 86400  seconds
            RANGE BETWEEN 86400  PRECEDING AND 1 PRECEDING ) AS
        trans_freq_24,
        category,
        amt,
        state,
        job,
        unix_time,
        city_pop,
        merchant,
        is_fraud
    FROM (
        SELECT t1.*, t2.*
               -- , LAG(trans_date_trans_time, 1) OVER (PARTITION BY t1.cc_num  ORDER BY trans_date_trans_time ASC) AS last_txn_date
        FROM  transactions AS t1
        JOIN  demographics AS t2
        ON t1.cc_num = t2.cc_num);""")
# Compile the program before any pipeline is configured from `project`.
print("Compiling project")
project.compile()
print("done")

Compiling project
done


In [None]:
from dbsp import DBSPPipelineConfig, CsvInputFormatConfig, CsvOutputFormatConfig

def run_query(transaction_file: str, output_file: str):
    """Run the compiled feature program over one transaction file.

    A new pipeline configuration is created from the module-level `project`,
    the demographics file and `transaction_file` are attached as CSV inputs,
    `output_file` is attached as the CSV output of the FEATURES stream, and
    the pipeline is run to completion.

    Args:
        transaction_file: Path of the CSV file fed to the TRANSACTIONS stream.
        output_file: Path of the CSV file that receives the FEATURES stream.
    """
    # NOTE(review): the meaning of the literal 6 is not visible here
    # (presumably a worker count) -- confirm against the DBSP client API.
    pipeline = DBSPPipelineConfig(project, 6)

    # (stream, source file, connector name) for every input, in attach order.
    csv_inputs = (
        ('DEMOGRAPHICS', demographics_path, 'demographics'),
        ('TRANSACTIONS', transaction_file, 'transactions'),
    )
    for stream_name, source_file, connector_name in csv_inputs:
        pipeline.add_file_input(
            stream = stream_name,
            filepath = source_file,
            format = CsvInputFormatConfig(),
            name = connector_name,
        )

    pipeline.add_file_output(
        stream = 'FEATURES',
        filepath = output_file,
        format = CsvOutputFormatConfig(),
        name = 'features',
    )

    pipeline.run_to_completion()

## Compute features on training and testing datasets

In [None]:
# Compute features for the training set first, then for the test set,
# reporting after each pipeline returns.
pipeline_runs = (
    (train_path, train_outpath, "Training pipeline finished"),
    (test_path, test_outpath, "Test pipeline finished"),
)
for source_csv, features_csv, done_message in pipeline_runs:
    run_query(source_csv, features_csv)
    print(done_message)

Training pipeline finished
Test pipeline finished


## Train XGBoost model

In [6]:
def show_data(cm, print_res = 0):
    """Derive precision, recall and fallout from a 2x2 confusion matrix.

    Args:
        cm: Matrix indexable as ``cm[row, col]`` where ``cm[0, 0]`` holds the
            true negatives, ``cm[0, 1]`` the false positives, ``cm[1, 0]`` the
            false negatives and ``cm[1, 1]`` the true positives.
        print_res: When equal to 1, precision, recall and F1 are printed.

    Returns:
        A ``(precision, recall, fallout)`` tuple of floats.  A metric whose
        denominator is zero is undefined and is returned as ``nan``.
    """
    def ratio(numerator, denominator):
        # Guard explicitly so an empty denominator gives nan, not an error.
        if not denominator:
            return float('nan')
        return float(numerator) / float(denominator)

    tp = cm[1,1]
    fn = cm[1,0]
    fp = cm[0,1]
    tn = cm[0,0]

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    fallout = ratio(fp, fp + tn)

    if print_res == 1:
        # F1 written as 2TP / (2TP + FP + FN): identical to the harmonic mean
        # of precision and recall when both are non-zero, and correctly 0
        # (instead of nan) when there are no true positives.
        f1 = ratio(2 * tp, 2 * tp + fp + fn)
        print('Precision =     {:.3f}'.format(precision))
        print('Recall (TPR) =  {:.3f}'.format(recall))
        print('F1 = {:.3f}'.format(f1))
    return precision, recall, fallout

# Hyper-parameters of the boosted-tree classifier.
max_depth = 12
n_estimators = 100

# Load the feature files written by the pipelines; only the training rows
# are shuffled.
traindata = pd.read_csv(train_outpath, float_precision='round_trip')
train_dataset = shuffle(traindata)
test_dataset = pd.read_csv(test_outpath, float_precision='round_trip')

# Column layout: the second-to-last column is the label, every column before
# it is a feature, and the last column is not used.  The width of the
# training frame is applied to the test frame as well.
nb_cols = len(train_dataset.columns)
label_index = nb_cols - 2

X_train = train_dataset.iloc[:, :label_index].values
y_train = train_dataset.iloc[:, label_index].values.astype(int)

X_test = test_dataset.iloc[:, :label_index].values
y_test = test_dataset.iloc[:, label_index].values.astype(int)


model = XGBClassifier(
    max_depth = max_depth,
    n_estimators = n_estimators,
    objective = 'binary:logistic',
)
model.verbosity = 0
model.fit(X_train, y_train)

# Report precision / recall / F1 on the training data first, then on the
# test data; after the loop `predictions` and `cm` refer to the test data.
for eval_features, eval_labels in ((X_train, y_train), (X_test, y_test)):
    y_pred = model.predict(eval_features)
    predictions = [round(value) for value in y_pred]
    cm = confusion_matrix(eval_labels, predictions)
    show_data(cm, print_res = 1)

accuracy = accuracy_score(y_test, predictions)
print("XGBoost Accuracy: %.2f%%" % (accuracy * 100.0))

Precision =     1.000
Recall (TPR) =  1.000
F1 = 1.000
Precision =     0.976
Recall (TPR) =  0.915
F1 = 0.944
XGBoost Accuracy: 99.83%


## Inference

In [7]:
# Run the feature query over the simulation transactions; the resulting
# features are written to `simulation_outpath` for the inference step.
run_query(simulation_path, simulation_outpath)

In [8]:
# Score the simulation features chunk by chunk and print the metrics for
# every chunk.
chunksize = 1024*10
print(f"reading data from {simulation_outpath}")
try:
    simulation = pd.read_csv(simulation_outpath, iterator=True, chunksize=chunksize)
    for simulation_batch in simulation:
        # Same column layout as in training: the second-to-last column is the
        # label and everything before it is a feature.
        nb_cols = len(simulation_batch.columns)
        X_simulation = simulation_batch.iloc[:, 0:nb_cols - 2].values
        y_simulation = simulation_batch.iloc[:, nb_cols-2].values.astype(int)

        y_pred = model.predict(X_simulation)
        predictions = [round(value) for value in y_pred]
        # Pin both classes so the matrix is always 2x2, even for a chunk in
        # which only one class occurs; otherwise show_data would index out
        # of bounds.
        cm = confusion_matrix(y_simulation, predictions, labels=[0, 1])
        show_data(cm, print_res = 1)
        accuracy = accuracy_score(y_simulation, predictions)
        print("XGBoost Accuracy: %.2f%%" % (accuracy * 100.0))
# Only failures to open or parse the file are reported as read errors; any
# other exception propagates with its real traceback instead of being
# mislabelled and silently ending the loop.
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
    print(f"Error: cannot read from the specified source {err}")

reading data from /home/leonid/projects/database-stream-processor/demo/demo_notebooks/fraud_data/simulation_output.csv
Precision =     0.893
Recall (TPR) =  0.885
F1 = 0.889
XGBoost Accuracy: 99.76%
Precision =     0.959
Recall (TPR) =  0.946
F1 = 0.952
XGBoost Accuracy: 99.86%
Precision =     0.993
Recall (TPR) =  0.888
F1 = 0.938
XGBoost Accuracy: 99.81%
Precision =     1.000
Recall (TPR) =  0.881
F1 = 0.937
XGBoost Accuracy: 99.80%
Precision =     0.975
Recall (TPR) =  0.885
F1 = 0.928
XGBoost Accuracy: 99.80%
