In [1]:
#import preliminaries 

import pandas as pd
import numpy as np
import random
# import cuml 
import matplotlib.pyplot as plt
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.svm import SVC #for windows to use on cpu cores 
#from cuml.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve,confusion_matrix, ConfusionMatrixDisplay
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


In [2]:
# Read the training and test CSV files into DataFrames (train first, then test).
# NOTE(review): both locations are absolute, machine-specific Windows paths;
# the notebook will not run elsewhere until they are made configurable.

train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\kbarn\ubuntu_shared\fraudTrain.csv",
        r"C:\Users\kbarn\ubuntu_shared\fraudTest.csv",
    )
)


In [3]:
# Preprocess both frames in place: derive an hour-of-day column from the
# transaction timestamp, then drop the raw timestamp and the card number.
#
# The cell is written to be safe to re-run: once the timestamp column has been
# dropped, a second execution is a no-op instead of raising KeyError.

for df in [train_df, test_df]:
    # Only derive 'hour' while the source timestamp column is still present.
    if 'trans_date_trans_time' in df.columns:
        df['hour'] = pd.to_datetime(df['trans_date_trans_time']).dt.hour
    # errors='ignore' skips columns that an earlier run already removed.
    df.drop(columns=['trans_date_trans_time', 'cc_num'], inplace=True, errors='ignore')
    

In [4]:
# Columns used as model inputs, grouped by the preprocessing they receive.

numerical_features = [
    "amt",
    "hour",        # derived from the transaction timestamp in the preprocessing cell
    "merch_lat",
    "merch_long",
]
categorical_features = [
    "category",
    "job",
]

In [5]:
# Column-wise preprocessing: standardize the numerical columns and one-hot
# encode the categorical ones. handle_unknown="ignore" keeps transform() from
# raising on categories that were not seen during fit.
# Columns not named in either list are not forwarded (ColumnTransformer's
# default remainder is 'drop').

preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", StandardScaler(), numerical_features),
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [6]:
# Assemble the imblearn pipeline: preprocess -> undersample -> SMOTE
# oversample -> SVM. An imblearn Pipeline is used because the samplers are
# pipeline steps, which a plain sklearn Pipeline does not accept.
# NOTE(review): the classes are rebalanced by resampling AND the SVC uses
# class_weight='balanced'; confirm that applying both corrections is intended.

pipeline = ImbPipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("under_sampler", RandomUnderSampler(sampling_strategy=0.1, random_state=42)),
        ("over_sampler", SMOTE(sampling_strategy=0.5, random_state=42)),
        # probability=True is required for the predict_proba call made later.
        ("svm", SVC(probability=True, class_weight="balanced", random_state=42)),
    ]
)

In [7]:
# Separate the 'is_fraud' target from the remaining columns for both the
# training frame and the test frame.

X_train, Y_train = train_df.drop(columns='is_fraud'), train_df['is_fraud']
X_test, Y_test = test_df.drop(columns='is_fraud'), test_df['is_fraud']

In [8]:
# Search grid for the pipeline's 'svm' step (keys follow the step__param form).
# It currently holds a single candidate: C=0.3 with a linear kernel, the
# kernel having been chosen to keep compute cheap.

hyper_parameters = dict(
    svm__C=[0.3],             # candidate values for the regularization strength C
    svm__kernel=['linear'],   # candidate kernels
)

                    

In [None]:
# Run a successive-halving grid search over the pipeline, scored by ROC-AUC
# with 3-fold cross-validation, then keep the refit best pipeline.
#
# random_state is fixed so the resource subsampling the halving search does
# between iterations is reproducible, matching the seed used by the samplers
# and the SVC. With the current single-candidate grid the search runs one
# iteration on all samples, so the seed only matters once the grid grows.

halving_grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=hyper_parameters,
    factor=2,
    cv=3,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,  # use all available CPU cores
    verbose=1)

halving_grid_search.fit(X_train, Y_train)
optimal_model = halving_grid_search.best_estimator_


n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 1296675
max_resources_: 1296675
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1
n_resources: 1296675
Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [None]:
# Score the test set with the tuned pipeline.
# y_proba takes column 1 of predict_proba (presumably the fraud class —
# confirm against optimal_model.classes_); y_pred holds the hard labels.

y_proba = optimal_model.predict_proba(X_test)[:, 1]
y_pred = optimal_model.predict(X_test)

In [None]:
# Evaluate the fitted model on the test set: text report, ROC-AUC,
# confusion matrix, ROC curve, precision-recall curve and the FP/FN ratio.

# Per-class precision / recall / F1 computed from the hard predictions.
print(classification_report(Y_test, y_pred, digits=6))

# Area under the ROC curve from the predicted probabilities; computed once
# and reused in the ROC plot legend.
roc_auc = roc_auc_score(Y_test, y_proba)
print(f"ROC-AUC Score: {roc_auc}")

# Confusion matrix from the same hard predictions as the report above.
# SVC's probability estimates can disagree with predict(), so thresholding
# y_proba at 0.5 could describe a different classifier than the report does.
# labels=[0, 1] pins a 2x2 layout so the [0, 1] / [1, 0] lookups below are
# valid even if one class is missing from the predictions.
confusionmatrix = confusion_matrix(Y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=confusionmatrix, display_labels=["Not Fraud", "Fraud"])
disp.plot(cmap="Blues", values_format="d")
disp.ax_.set_title("Confusion matrix (test set)")
plt.show()

# ROC curve; the legend call is what makes the AUC label visible.
fpr, tpr, _ = roc_curve(Y_test, y_proba)
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve (test set)")
plt.legend()
plt.show()

# Precision-recall curve.
precision, recall, _ = precision_recall_curve(Y_test, y_proba)
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-recall curve (test set)")
plt.show()

# Ratio of false positives to false negatives, read from the confusion matrix
# (rows are true labels, columns are predicted labels).
false_positive = int(confusionmatrix[0, 1])
false_negative = int(confusionmatrix[1, 0])
if false_negative:
    fp_fn_ratio = false_positive / false_negative
else:
    # No false negatives: the ratio is unbounded, or undefined when there are
    # no false positives either.
    fp_fn_ratio = float("inf") if false_positive else float("nan")
fn_fp = fp_fn_ratio  # original (misleadingly ordered) name kept for any later cells
print (f'ratio of false positives to false negatives {fn_fp}')