In [1]:
import pandas as pd

# Load the preprocessed table from disk and preview its first five rows.
processed_csv = "../data/processed_data.csv"
data = pd.read_csv(processed_csv)
data.head(n=5)

Unnamed: 0,amount,transaction_hour,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud,merchant_category_Electronics,merchant_category_Food,...,merchant_category_Travel,security_level_Medium Security,security_level_High Security,transaction_freq_Medium Transaction,transaction_freq_High Transaction,amount_type_Medium Amount,amount_type_High Amount,transaction_time_Afternoon Transaction,transaction_time_Late Afternoon Transaction,transaction_time_Evening Transaction
0,4.448165,22,0,0,66,3,40,0,1,0,...,0,0,1,0,0,1,0,0,0,1
1,6.296778,3,1,0,87,1,64,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,5.472313,17,0,0,49,1,61,0,0,0,...,0,1,0,0,0,1,0,0,1,0
3,5.107943,4,0,1,72,3,34,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,3.450939,15,0,0,79,0,44,0,0,1,...,0,0,1,0,0,0,0,0,1,0


### Splitting the data

In [2]:
# Split the table into the label column (`is_fraud`) and everything else.
target_column = "is_fraud"
y = data[target_column]
X = data.drop(columns=[target_column])

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; `stratify=y` keeps the label
# proportions the same in both splits and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((7500, 20), (2500, 20), (7500,), (2500,))

### Scaling features and fixing class imbalance

In [4]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the scaling.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the test split is not resampled.
smt = SMOTE(random_state=1)
resampled_features, resampled_labels = smt.fit_resample(X_train, y_train)

# The resampled data is exposed both as X_final/y_final and as the new
# X_train/y_train used by the cells that follow.
X_final = X_train = resampled_features
y_final = y_train = resampled_labels

# Show the per-class row counts after resampling.
print(y_final.value_counts())

is_fraud
0    7387
1    7387
Name: count, dtype: int64


### Finding the model

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score


# Candidate classifiers to compare, keyed by the label printed in the report.
models = {
    "Random Forest": RandomForestClassifier(max_depth=10, random_state=1),
    "KNN": KNeighborsClassifier(),
    "SVC": LinearSVC(random_state=1),
}

def evaluate_model(X_train, X_test, y_train, y_test):
    """Fit every estimator in the module-level ``models`` dict and print its scores.

    For each ``(label, model)`` pair the model is fitted on the training data,
    then the following are printed: a header containing the label, the
    classification report on the test data, ``model.score`` on the test data,
    and the F1 score on both the training and the test data.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Features and labels used for the report and the test scores.

    Returns
    -------
    None
        Results are printed only. The estimators stored in ``models`` are
        fitted in place.
    """
    for label, model in models.items():
        model.fit(X_train, y_train)

        # Predict each split exactly once and reuse the result below; the
        # test-set prediction feeds both the report and the test F1 score.
        y_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

        print(f"--- {label} Model Scores ---")
        print(classification_report(y_test, y_pred))
        print(f"Score: {model.score(X_test, y_test)}")

        # A large gap between these two numbers indicates overfitting.
        print("Train F1:", f1_score(y_train, y_train_pred))
        print("Test F1:", f1_score(y_test, y_pred))

In [9]:
# Fit and score every candidate model on the current train/test splits.
evaluate_model(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

--- Random Forest Model Scores ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2462
           1       1.00      0.68      0.81        38

    accuracy                           1.00      2500
   macro avg       1.00      0.84      0.91      2500
weighted avg       1.00      1.00      0.99      2500

Score: 0.9952
Train F1: 1.0
Test F1: 0.8125
--- KNN Model Scores ---
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2462
           1       0.29      0.58      0.39        38

    accuracy                           0.97      2500
   macro avg       0.64      0.78      0.69      2500
weighted avg       0.98      0.97      0.98      2500

Score: 0.9724
Train F1: 0.9920096689720003
Test F1: 0.3893805309734513
--- SVC Model Scores ---
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      2462
           1       0.35      0.97    

### Tuning the model

In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the random forest: 4 * 4 * 3 * 3 = 144 combinations,
# of which the randomized search samples `n_iter` candidates.
grid = {"max_depth": [None, 10, 40, 60],
        "n_estimators": [10, 100, 200, 500],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

# Seed both the forest and the candidate sampler so that re-running this cell
# yields the same "Best params"; this matches the random_state=1 used by the
# split, SMOTE and model cells elsewhere in this notebook.
# NOTE(review): no `scoring` is given, so the estimator's default score is
# used for model selection -- confirm that is the intended criterion.
# NOTE(review): X_train was already resampled with SMOTE before this point,
# so the CV validation folds presumably contain synthetic rows and the CV
# scores may be optimistic -- TODO confirm / consider resampling inside CV.
rsc = RandomizedSearchCV(RandomForestClassifier(random_state=1),
                        grid,
                        cv=5,
                        n_iter=13,
                        verbose=True,
                        n_jobs=-1,
                        random_state=1)

rsc.fit(X_train, y_train)
print(f"Best params:  {rsc.best_params_}")

Fitting 5 folds for each of 13 candidates, totalling 65 fits
Best params:  {'n_estimators': 100, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 60}


### Lowering the decision threshold to improve recall

In [13]:
# Hyper-parameters for the final random forest (hard-coded here rather than
# read from the search object), plus a fixed seed.
rf_params = {
    "n_estimators": 100,
    "min_samples_leaf": 4,
    "min_samples_split": 4,
    "max_depth": 60,
    "random_state": 1,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Take the second probability column and flag a row as positive whenever
# that probability reaches the threshold.
# NOTE(review): the threshold is evaluated on the test split here; consider
# choosing it on a separate validation split.
y_pred_proba = model.predict_proba(X_test)[:, 1]
threshold = 0.30
y_pred = (y_pred_proba >= threshold).astype(int)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2462
           1       0.54      0.84      0.66        38

    accuracy                           0.99      2500
   macro avg       0.77      0.92      0.83      2500
weighted avg       0.99      0.99      0.99      2500

