In [1]:
# Configuration

# Set to True when the notebook runs on Google Colaboratory (Colab). The
# dataset is then read from a mounted Google Drive, which requires Drive access.
USE_GOOGLE_COLLAB = False

# If set to True, the dataset is oversampled with SMOTE before training so that
# the minority class becomes more common, which can help the models learn it.
USE_SMOTE = True


In [2]:
# Resolve where the dataset CSV lives, depending on where the notebook runs.
DATA_PATH = ''
if not USE_GOOGLE_COLLAB:
    # Local run: the CSV is expected relative to the working directory.
    DATA_PATH = 'dataset/creditcard.csv'
else:
    # Colab run: mount Google Drive and read the CSV from the Drive root.
    # google.colab is imported here because it is only needed on this branch.
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_PATH = '/content/drive/MyDrive/creditcard.csv'

You can download the dataset [here](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)

In [3]:
import time
import pandas as pd
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# Utilities and metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Preprocessing and normalizing
from sklearn.preprocessing import StandardScaler
if USE_SMOTE:
    from imblearn.over_sampling import SMOTE

In [7]:
# Load the transactions and separate the features from the 'Class' label.
data = pd.read_csv(DATA_PATH)

ax = data.drop('Class', axis=1)  # every column except the label
ay = data['Class']               # the label column

if USE_SMOTE:
    # Oversample with SMOTE. The seed is fixed so that re-running the notebook
    # generates the same synthetic samples (the original call was unseeded).
    # NOTE(review): the resampling is applied to the whole dataset, and the
    # train/test split happens afterwards, so synthetic samples also land in
    # the test set. Consider resampling only the training portion.
    smote = SMOTE(random_state=42)
    ax, ay = smote.fit_resample(ax, ay)
# When USE_SMOTE is False, ax / ay are used unchanged. The former else-branch
# only created x_res / y_res, which nothing in the notebook reads.

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    ax,
    ay,
    test_size=0.3,
    random_state=42,
)

# Normalize the features before any training. The scaler is fitted on the
# training portion only and then applied to both portions.
scaler = StandardScaler()
x_train = scaler.fit(x_train).transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
# Run a particular model and return a summary of its properties.
def evaluate_model(model, x_train, x_test, y_train, y_test) -> dict:
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        model: Object exposing ``fit(x, y)`` and ``predict(x)``.
        x_train: Features used for fitting.
        x_test: Features used for prediction.
        y_train: Labels used for fitting.
        y_test: Labels the predictions are compared against.

    Returns:
        A dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1' and
        'Time'. 'Time' is the number of seconds spent in ``model.fit`` only;
        prediction and scoring are not included.
    """
    # perf_counter() is meant for measuring intervals; time.time() is the
    # wall clock and can jump if the system time is adjusted during the fit.
    fit_start = time.perf_counter()
    model.fit(x_train, y_train)
    fit_seconds = time.perf_counter() - fit_start

    y_pred = model.predict(x_test)

    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'Time': fit_seconds,
    }

### Results

In [10]:
# Per-model metrics, filled in by the evaluation loop (keyed by model name).
results = {}

# Models to compare, keyed by the name shown in the results table.
# Estimators that use randomness are seeded so that their scores are
# reproducible across runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    # The entries below are currently disabled; uncomment to include them.
    #'Random Forest': RandomForestClassifier(random_state=42),
    #'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    #'SVC': SVC(),
}

In [12]:
# Train and score every configured model, printing progress as each one runs.
for name, model in models.items():
    print(f'Running: {name} .... ', end='')
    results.update({name: evaluate_model(model, x_train, x_test, y_train, y_test)})
    print('Done')

# The frame has one column per model; transposing gives one row per model.
results_df = pd.DataFrame(results)
results_df.transpose()  # Last expression, so Jupyter renders it as a table.

Running: Logistic Regression .... 

 Done.
Running: Decision Tree ....  Done.


Unnamed: 0,Accuracy,Precision,Recall,F1,Time
Logistic Regression,0.980057,0.990834,0.969148,0.979871,7.009765
Decision Tree,0.998241,0.997336,0.999157,0.998246,111.446795
