# This example mimics the following benchmark:

How to build a serverless real-time credit card fraud detection solution
https://cloud.google.com/blog/products/data-analytics/how-to-build-a-fraud-detection-solution

In [1]:
import os
import pandas as pd
from datetime import datetime
import geopy
import geopy.distance
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, roc_curve
from collections import Counter
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

# Download transaction data from https://drive.google.com/drive/folders/1FVmDB5ZsH_UH-L70uNOkWqZXnnNaXRVK?usp=sharing

Untar the downloaded files.


Store the extracted CSV files in the `fraud_data` folder that the path variables in the next cell point to.

In [2]:
# Directory that holds the downloaded input CSVs and receives the pipeline outputs.
# Set the FRAUD_DATA_DIR environment variable to relocate it; when the variable is
# unset or empty the original location is used, so existing setups are unaffected.
# NOTE(review): the original paths were all absolute - presumably the pipeline
# needs absolute paths; confirm before pointing this at a relative directory.
outputdir            = os.environ.get("FRAUD_DATA_DIR") or "/home/leonid/projects/database-stream-processor/demo/demo_notebooks/fraud_data/"

# Every file lives directly inside outputdir; deriving the paths from it keeps the
# eight locations consistent instead of repeating the prefix by hand.
train_path           = os.path.join(outputdir, "train.csv")
train_outpath        = os.path.join(outputdir, "train_output.csv")
test_path            = os.path.join(outputdir, "test.csv")
test_outpath         = os.path.join(outputdir, "test_output.csv")
demographics_path    = os.path.join(outputdir, "demographics.csv")
simulation_path      = os.path.join(outputdir, "simulation_short.csv")
simulation_outpath   = os.path.join(outputdir, "simulation_output.csv")

# NOTE(review): the constants below are not referenced by any visible cell.
train_sample = 10000
test_sample  = 5000
# The "catagorical" spelling is kept so that any code using these names still works.
transaction_catagorical_columns = ["category", "merchant"]
demographics_catagorical_columns = ["first", "gender","state", "street","city", "job"]
# NOTE(review): this list names 'day' and 'age', while the SQL `features` view emits
# the day of week as `d`, has the age expression commented out and adds window
# aggregates that are not listed here - confirm whether the list is still current.
features_names= ['day', 'age', 'distance', 'category', 'amt', 'state', 'job', 'unix_time',  'city_pop', 'merchant', 'is_fraud']

In [3]:
# NOTE(review): this import lives in its own cell rather than in the import cell
# at the top of the notebook.
from dbsp import DBSPConnection

# Connect using DBSPConnection's default arguments.
dbsp = DBSPConnection()
# Create (or replace) the project "fraud_feature_query" from the SQL below.
# The SQL declares two tables and one view:
#   * demographics - card-holder attributes keyed by cc_num; dob is declared as
#     STRING (the DATE variant is commented out).
#   * transactions - transaction records with merchant coordinates and the
#     is_fraud column.
#   * features     - transactions joined (plain JOIN, i.e. inner) to demographics
#     on cc_num. It emits the day of week (column `d`), the ST_DISTANCE between
#     the (long, lat) and (merch_long, merch_lat) points, and three window
#     aggregates partitioned by cc_num and ordered by unix_time: AVG(amt) over
#     the preceding 604800 s and 2592000 s, and COUNT(*) over the preceding
#     86400 s. Each window ends at "1 PRECEDING", so the current row and rows
#     with the same unix_time are not included. The remaining columns are passed
#     through. The `age` and `trans_diff` expressions are commented out.
# NOTE(review): cc_num is FLOAT64 and is cast to NUMERIC only inside PARTITION BY;
# the join compares the raw FLOAT64 values - confirm this is intended.
project = dbsp.create_or_replace_project(name = "fraud_feature_query", sql_code = """
CREATE TABLE demographics (
    cc_num FLOAT64 NOT NULL,
    first STRING,
    gender STRING,
    street STRING,
    city STRING,
    state STRING,
    zip INTEGER,
    lat FLOAT64 NOT NULL,
    long FLOAT64 NOT NULL,
    city_pop INTEGER,
    job STRING,
    dob STRING
    --dob DATE
);

CREATE TABLE transactions (
    trans_date_trans_time TIMESTAMP NOT NULL,
    cc_num FLOAT64 NOT NULL,
    merchant STRING,
    category STRING,
    amt FLOAT64,
    trans_num STRING,
    unix_time INTEGER NOT NULL,
    merch_lat FLOAT64 NOT NULL,
    merch_long FLOAT64 NOT NULL,
    is_fraud INTEGER
);

CREATE VIEW features as
    SELECT
        DAYOFWEEK(trans_date_trans_time) AS d,
        -- TIMESTAMPDIFF(YEAR, trans_date_trans_time, CAST(dob as TIMESTAMP)) AS age,
        ST_DISTANCE(ST_POINT(long,lat), ST_POINT(merch_long,merch_lat)) AS distance,
        -- TIMESTAMPDIFF(MINUTE, trans_date_trans_time, last_txn_date) AS trans_diff,
        AVG(amt) OVER(
            PARTITION BY   CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 week is 604800  seconds
            RANGE BETWEEN 604800  PRECEDING AND 1 PRECEDING) AS
        avg_spend_pw,
        AVG(amt) OVER(
            PARTITION BY  CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 month(30 days) is 2592000 seconds
            RANGE BETWEEN 2592000 PRECEDING AND 1 PRECEDING) AS
        avg_spend_pm,
        COUNT(*) OVER(
            PARTITION BY  CAST(cc_num AS NUMERIC)
            ORDER BY unix_time
            -- 1 day is 86400  seconds
            RANGE BETWEEN 86400  PRECEDING AND 1 PRECEDING ) AS
        trans_freq_24,
        category,
        amt,
        state,
        job,
        unix_time,
        city_pop,
        merchant,
        is_fraud
    FROM (
        SELECT t1.*, t2.*
               -- , LAG(trans_date_trans_time, 1) OVER (PARTITION BY t1.cc_num  ORDER BY trans_date_trans_time ASC) AS last_txn_date
        FROM  transactions AS t1
        JOIN  demographics AS t2
        ON t1.cc_num = t2.cc_num);""")
# Compile the project, printing a marker before and after the call so the cell
# output shows when compilation has returned.
print("Compiling project")
project.compile()
print("done")

Compiling project
done


In [4]:
from dbsp import DBSPPipelineConfig, CsvInputFormatConfig, CsvOutputFormatConfig

def run_query(transaction_file: str, output_file: str):
    """Run the compiled feature project once over a transactions CSV file.

    A fresh pipeline configuration is built for the notebook-level ``project``.
    The demographics file (``demographics_path``) and ``transaction_file`` are
    attached as CSV inputs to the DEMOGRAPHICS and TRANSACTIONS streams, the
    FEATURES stream is written as CSV to ``output_file``, and the pipeline is
    then run to completion.

    Args:
        transaction_file: Path of the CSV fed to the TRANSACTIONS stream.
        output_file: Path of the CSV that receives the FEATURES stream.

    Returns:
        None.
    """
    # NOTE(review): the literal 6 is presumably the worker count - confirm
    # against the DBSPPipelineConfig signature.
    pipeline = DBSPPipelineConfig(project, 6)

    # Inputs are registered in the same order as before: demographics first.
    csv_inputs = (
        ('DEMOGRAPHICS', demographics_path),
        ('TRANSACTIONS', transaction_file),
    )
    for stream_name, csv_path in csv_inputs:
        pipeline.add_file_input(stream = stream_name, filepath = csv_path, format_ = CsvInputFormatConfig())

    pipeline.add_file_output(stream = 'FEATURES', filepath = output_file, format_ = CsvOutputFormatConfig())

    pipeline.run_to_completion()

In [5]:
# Compute the feature files for the training set and then the test set,
# reporting after each pipeline run returns.
for stage_label, source_csv, features_csv in (
    ("Training", train_path, train_outpath),
    ("Test", test_path, test_outpath),
):
    run_query(source_csv, features_csv)
    print(f"{stage_label} pipeline finished")

Training pipeline finished
Test pipeline finished


# Training XGBoost model

In [6]:
def show_data(cm, print_res = 0):
    """Derive precision, recall and fallout from a 2x2 confusion matrix.

    Args:
        cm: Confusion matrix read as ``cm[actual, predicted]`` with class 1 as
            the positive class. It must support two-dimensional ``cm[i, j]``
            indexing (for example a NumPy array).
        print_res: When equal to 1, print precision, recall and F1 with three
            decimals.

    Returns:
        Tuple ``(precision, recall, fallout)``, where fallout is the
        false-positive rate ``fp / (fp + tn)``. A ratio whose denominator is
        zero is reported as 0.0 rather than nan or a ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # A zero denominator means there are no samples of that kind in this
        # matrix (e.g. nothing was predicted positive); report 0.0 for it.
        return numerator / denominator if denominator else 0.0

    tp, fn = cm[1,1], cm[1,0]
    fp, tn = cm[0,1], cm[0,0]

    # Each ratio is computed once and reused for both printing and the result.
    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    fallout = ratio(fp, fp + tn)

    if print_res == 1:
        print('Precision =     {:.3f}'.format(precision))
        print('Recall (TPR) =  {:.3f}'.format(recall))
        # F1 is the harmonic mean of precision and recall; 0.0 when both are 0.
        print('F1 = {:.3f}'.format(ratio(2 * (precision * recall), precision + recall)))
    return precision, recall, fallout

# Hyper-parameters of the gradient-boosted tree classifier.
max_depth = 12
n_estimators = 100
# Fixed seed so the shuffled row order passed to fit() is the same on every
# Restart & Run All.
shuffle_seed = 42

# Feature files produced by the pipeline runs; round_trip parsing keeps the
# float values exactly as they were written.
traindata     = pd.read_csv(train_outpath, float_precision='round_trip')
train_dataset = shuffle(traindata, random_state=shuffle_seed)

test_dataset  = pd.read_csv(test_outpath, float_precision='round_trip')

nb_cols = len(train_dataset.columns.tolist())

# Columns [0, nb_cols-2) are used as features and column nb_cols-2 as the label;
# the last column is not used.
# NOTE(review): presumably the last column is bookkeeping added by the CSV
# output (e.g. a weight) - confirm against the files.
X_train = train_dataset.iloc[:, 0:nb_cols - 2].values
y_train = train_dataset.iloc[:, nb_cols-2].values.astype(int)

# The test frame is sliced with the training column count, so both files are
# expected to share the same layout.
X_test = test_dataset.iloc[:, 0:nb_cols - 2].values
y_test = test_dataset.iloc[:, nb_cols-2].values.astype(int)


# verbosity is passed to the constructor instead of being patched on afterwards.
model = XGBClassifier(max_depth = max_depth,  n_estimators = n_estimators, objective = 'binary:logistic', verbosity = 0)
model.fit(X_train, y_train)

# evaluate train data
y_pred = model.predict(X_train)
predictions = [round(value) for value in y_pred]
# labels=[0, 1] pins the matrix to 2x2 even if one class is absent, which is
# the shape show_data indexes into.
cm = confusion_matrix(y_train, predictions, labels=[0, 1])
show_data(cm, print_res = 1)

# evaluate for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
show_data(cm, print_res = 1)

accuracy = accuracy_score(y_test, predictions)
print("XGBoost Accuracy: %.2f%%" % (accuracy * 100.0))

Precision =     1.000
Recall (TPR) =  1.000
F1 = 1.000
Precision =     0.160
Recall (TPR) =  0.792
F1 = 0.266
XGBoost Accuracy: 95.29%


# Inference

In [7]:
# Run the feature pipeline over the simulation transactions; the resulting
# features are written to simulation_outpath for the inference cell.
run_query(simulation_path, simulation_outpath)

In [8]:
# Number of feature rows scored per batch.
chunksize = 1024*10
print(f"reading data from {simulation_outpath}")
try:
    # Stream the simulation features in batches instead of loading the whole file.
    simulation = pd.read_csv(simulation_outpath, iterator=True, chunksize=chunksize)
    for simulation_batch in simulation:
        nb_cols = len(simulation_batch.columns.tolist())
        # Same column split as used for training: features first, label at
        # position nb_cols-2, last column unused.
        X_simulation = simulation_batch.iloc[:, 0:nb_cols - 2].values
        y_simulation = simulation_batch.iloc[:, nb_cols-2].values.astype(int)

        y_pred = model.predict(X_simulation)
        predictions = [round(value) for value in y_pred]
        # labels=[0, 1] keeps the matrix 2x2 for batches that contain (and
        # predict) only one class; without it show_data would fail on cm[1,1].
        cm = confusion_matrix(y_simulation, predictions, labels=[0, 1])
        show_data(cm, print_res = 1)
        accuracy = accuracy_score(y_simulation, predictions)
        print("XGBoost Accuracy: %.2f%%" % (accuracy * 100.0))

# Only failures to open or parse the CSV are reported as read errors; any other
# exception (for example from the model) propagates with its real traceback
# instead of being mislabelled and silently ending the loop.
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
    print(f"Error: cannot read from the specified source {err}")

reading data from /home/leonid/projects/database-stream-processor/demo/demo_notebooks/fraud_data/simulation_output.csv
Precision =     0.046
Recall (TPR) =  0.945
F1 = 0.088
XGBoost Accuracy: 82.59%
Precision =     0.075
Recall (TPR) =  0.839
F1 = 0.138
XGBoost Accuracy: 83.55%
Precision =     0.078
Recall (TPR) =  0.874
F1 = 0.144
XGBoost Accuracy: 83.86%
Precision =     0.087
Recall (TPR) =  0.831
F1 = 0.158
XGBoost Accuracy: 84.15%
Precision =     0.074
Recall (TPR) =  0.815
F1 = 0.136
XGBoost Accuracy: 82.51%
Precision =     0.063
Recall (TPR) =  0.815
F1 = 0.117
XGBoost Accuracy: 83.79%
Precision =     0.066
Recall (TPR) =  0.837
F1 = 0.122
XGBoost Accuracy: 82.77%
Precision =     0.074
Recall (TPR) =  0.862
F1 = 0.137
XGBoost Accuracy: 83.84%
Precision =     0.072
Recall (TPR) =  0.843
F1 = 0.133
XGBoost Accuracy: 82.95%
Precision =     0.064
Recall (TPR) =  0.830
F1 = 0.119
XGBoost Accuracy: 83.03%
Precision =     0.079
Recall (TPR) =  0.848
F1 = 0.144
XGBoost Accuracy: 83.16%
P

Precision =     0.144
Recall (TPR) =  0.783
F1 = 0.243
XGBoost Accuracy: 94.28%
Precision =     0.128
Recall (TPR) =  0.780
F1 = 0.220
XGBoost Accuracy: 94.10%
Precision =     0.108
Recall (TPR) =  0.747
F1 = 0.189
XGBoost Accuracy: 94.06%
Precision =     0.087
Recall (TPR) =  0.712
F1 = 0.155
XGBoost Accuracy: 94.45%
Precision =     0.116
Recall (TPR) =  0.725
F1 = 0.200
XGBoost Accuracy: 94.85%
Precision =     0.119
Recall (TPR) =  0.739
F1 = 0.205
XGBoost Accuracy: 94.84%
Precision =     0.136
Recall (TPR) =  0.719
F1 = 0.228
XGBoost Accuracy: 95.77%
Precision =     0.167
Recall (TPR) =  0.755
F1 = 0.274
XGBoost Accuracy: 96.16%
Precision =     0.202
Recall (TPR) =  0.821
F1 = 0.324
XGBoost Accuracy: 96.08%
Precision =     0.178
Recall (TPR) =  0.814
F1 = 0.293
XGBoost Accuracy: 96.08%
Precision =     0.176
Recall (TPR) =  0.804
F1 = 0.289
XGBoost Accuracy: 95.68%
Precision =     0.203
Recall (TPR) =  0.792
F1 = 0.323
XGBoost Accuracy: 95.00%
Precision =     0.173
Recall (TPR) =  0.

Precision =     0.180
Recall (TPR) =  0.783
F1 = 0.293
XGBoost Accuracy: 95.58%
Precision =     0.189
Recall (TPR) =  0.764
F1 = 0.303
XGBoost Accuracy: 95.19%
Precision =     0.183
Recall (TPR) =  0.810
F1 = 0.299
XGBoost Accuracy: 95.32%
Precision =     0.205
Recall (TPR) =  0.708
F1 = 0.318
XGBoost Accuracy: 95.44%
Precision =     0.175
Recall (TPR) =  0.778
F1 = 0.286
XGBoost Accuracy: 95.57%
Precision =     0.177
Recall (TPR) =  0.869
F1 = 0.294
XGBoost Accuracy: 95.63%
Precision =     0.187
Recall (TPR) =  0.852
F1 = 0.307
XGBoost Accuracy: 95.68%
Precision =     0.121
Recall (TPR) =  0.753
F1 = 0.208
XGBoost Accuracy: 95.46%
Precision =     0.148
Recall (TPR) =  0.705
F1 = 0.244
XGBoost Accuracy: 95.95%
Precision =     0.167
Recall (TPR) =  0.806
F1 = 0.277
XGBoost Accuracy: 95.77%
Precision =     0.102
Recall (TPR) =  0.842
F1 = 0.182
XGBoost Accuracy: 95.79%
Precision =     0.117
Recall (TPR) =  0.706
F1 = 0.201
XGBoost Accuracy: 96.28%
Precision =     0.153
Recall (TPR) =  0.

Precision =     0.163
Recall (TPR) =  0.765
F1 = 0.268
XGBoost Accuracy: 95.31%
Precision =     0.124
Recall (TPR) =  0.756
F1 = 0.214
XGBoost Accuracy: 95.11%
Precision =     0.143
Recall (TPR) =  0.757
F1 = 0.241
XGBoost Accuracy: 95.01%
Precision =     0.150
Recall (TPR) =  0.760
F1 = 0.251
XGBoost Accuracy: 95.39%
Precision =     0.157
Recall (TPR) =  0.790
F1 = 0.263
XGBoost Accuracy: 95.45%
Precision =     0.132
Recall (TPR) =  0.771
F1 = 0.226
XGBoost Accuracy: 95.71%
Precision =     0.111
Recall (TPR) =  0.776
F1 = 0.195
XGBoost Accuracy: 95.80%
Precision =     0.131
Recall (TPR) =  0.742
F1 = 0.223
XGBoost Accuracy: 95.09%
Precision =     0.136
Recall (TPR) =  0.710
F1 = 0.228
XGBoost Accuracy: 95.31%
Precision =     0.152
Recall (TPR) =  0.781
F1 = 0.255
XGBoost Accuracy: 95.72%
Precision =     0.183
Recall (TPR) =  0.803
F1 = 0.299
XGBoost Accuracy: 94.95%
Precision =     0.189
Recall (TPR) =  0.778
F1 = 0.304
XGBoost Accuracy: 95.31%
Precision =     0.203
Recall (TPR) =  0.

Precision =     0.222
Recall (TPR) =  0.807
F1 = 0.349
XGBoost Accuracy: 95.11%
Precision =     0.177
Recall (TPR) =  0.785
F1 = 0.289
XGBoost Accuracy: 95.10%
Precision =     0.174
Recall (TPR) =  0.806
F1 = 0.287
XGBoost Accuracy: 94.95%
Precision =     0.135
Recall (TPR) =  0.766
F1 = 0.230
XGBoost Accuracy: 95.29%
Precision =     0.155
Recall (TPR) =  0.740
F1 = 0.256
XGBoost Accuracy: 95.62%
Precision =     0.171
Recall (TPR) =  0.792
F1 = 0.281
XGBoost Accuracy: 95.81%
Precision =     0.140
Recall (TPR) =  0.784
F1 = 0.238
XGBoost Accuracy: 95.67%
Precision =     0.112
Recall (TPR) =  0.696
F1 = 0.192
XGBoost Accuracy: 95.49%
Precision =     0.130
Recall (TPR) =  0.741
F1 = 0.221
XGBoost Accuracy: 95.86%
Precision =     0.132
Recall (TPR) =  0.821
F1 = 0.227
XGBoost Accuracy: 95.41%
Precision =     0.157
Recall (TPR) =  0.752
F1 = 0.260
XGBoost Accuracy: 95.61%
Precision =     0.176
Recall (TPR) =  0.822
F1 = 0.290
XGBoost Accuracy: 95.36%
Precision =     0.220
Recall (TPR) =  0.