# Random Forest Model

In [1]:
import pandas as pd

In [2]:
# Load the stage-2 dataset from the working directory and echo the frame
# as the cell's output so the columns can be inspected.
df = pd.read_csv(filepath_or_buffer='data-stage2.csv')
df

Unnamed: 0,category,amt,is_fraud,hour,trans_count_7d,trans_count_30d,time_diff
0,misc_net,4.97,0,1,0.0,0.0,0.000000
1,grocery_pos,107.23,0,1,0.0,0.0,0.000000
2,entertainment,220.11,0,1,0.0,0.0,0.000000
3,gas_transport,45.00,0,1,0.0,0.0,0.000000
4,misc_pos,41.96,0,1,0.0,0.0,0.000000
...,...,...,...,...,...,...,...
1852389,health_fitness,43.77,0,1,39.0,167.0,4.619444
1852390,kids_pets,111.84,0,1,62.0,272.0,2.706389
1852391,kids_pets,86.88,0,1,67.0,277.0,0.201111
1852392,travel,7.99,0,1,36.0,192.0,3.340278


In [3]:
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Ask scikit-learn to render estimators as diagrams when displayed.
set_config(display="diagram")

# Column-wise preprocessing, selected by dtype:
#   * float64 columns -> MinMaxScaler
#   * object columns  -> OneHotEncoder
#   * everything else is passed through unchanged (remainder='passthrough')
preprocessor = ColumnTransformer(
    [
        ("num", MinMaxScaler(), make_column_selector(dtype_include="float64")),
        ("cat", OneHotEncoder(), make_column_selector(dtype_include="object")),
    ],
    remainder="passthrough",
)

preprocessor

In [4]:
# Split the frame into the feature columns and the `is_fraud` target.
# The target is kept as a single-column DataFrame (double brackets).
features = df.drop(columns=['is_fraud'])
y = df[['is_fraud']]

# NOTE(review): the preprocessor is fitted on every row, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
X = preprocessor.fit_transform(features)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of `y` in both partitions. `random_state` is fixed so the
# split -- and every metric reported below -- is reproducible when the
# notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1481915, 19) (370479, 19) (1481915, 1) (370479, 1)


## Model 1: Imbalanced

Train the classifier directly on the imbalanced training data, without undersampling or oversampling.

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Model 1: fit a random forest on the untouched training split.
# `n_jobs=-1` uses all available cores; `random_state` is fixed so the
# fitted forest is the same on every run.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

# `y_train` is a single-column DataFrame; flatten it to the 1-D label
# array that `fit` expects.
clf.fit(X_train, y_train.to_numpy().ravel())

In [7]:
from sklearn.metrics import classification_report

print("Model 1: Imbalanced\n")

# Report precision/recall/F1 for the current `clf` on each split in turn.
for heading, features, labels in (
    ("On Training Data\n", X_train, y_train),
    ("On Testing Data\n", X_test, y_test),
):
    print(heading)
    print(classification_report(labels, clf.predict(features)))

Model 1: Imbalanced

On Training Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1474194
           1       1.00      1.00      1.00      7721

    accuracy                           1.00   1481915
   macro avg       1.00      1.00      1.00   1481915
weighted avg       1.00      1.00      1.00   1481915

On Testing Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368549
           1       0.88      0.68      0.77      1930

    accuracy                           1.00    370479
   macro avg       0.94      0.84      0.88    370479
weighted avg       1.00      1.00      1.00    370479



## Model 2: Undersampling

In [8]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the training split only; the test split is left
# untouched. `random_state` is fixed so the same rows are kept on every
# run and the results below are reproducible.
X_us, y_us = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)

print(X_us.shape, y_us.shape)

(15442, 19) (15442, 1)


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Model 2: fit a fresh random forest on the undersampled training data.
# Note this rebinds `clf`, replacing the previously fitted model.
# `random_state` is fixed for reproducibility.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Flatten the single-column label frame to the 1-D array `fit` expects.
clf.fit(X_us, y_us.to_numpy().ravel())

In [14]:
from sklearn.metrics import classification_report

print("Model 2: Undersampling\n")

# Evaluate the current `clf` on the resampled data it was fitted on, then
# on the original training split, then on the held-out test split.
for heading, features, labels in (
    ("On Undersampled Training Data\n", X_us, y_us),
    ("On Original Training Data\n", X_train, y_train),
    ("On Testing Data\n", X_test, y_test),
):
    print(heading)
    print(classification_report(labels, clf.predict(features)))

Model 2: Undersampling

On Undersampled Training Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7721
           1       1.00      1.00      1.00      7721

    accuracy                           1.00     15442
   macro avg       1.00      1.00      1.00     15442
weighted avg       1.00      1.00      1.00     15442

On Original Training Data

              precision    recall  f1-score   support

           0       1.00      0.97      0.99   1474194
           1       0.16      1.00      0.28      7721

    accuracy                           0.97   1481915
   macro avg       0.58      0.99      0.63   1481915
weighted avg       1.00      0.97      0.98   1481915

On Testing Data

              precision    recall  f1-score   support

           0       1.00      0.97      0.99    368549
           1       0.16      0.96      0.27      1930

    accuracy                           0.97    370479
   macro avg       0.58      

## Model 3: Oversampling

In [15]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only with SMOTE; the test split is left
# untouched.
# * `n_jobs` is intentionally not passed: SMOTE deprecated that argument in
#   imbalanced-learn 0.10 and removed it in 0.12, where passing it raises
#   a TypeError.
# * `random_state` is fixed so the synthetic samples are reproducible.
X_os, y_os = SMOTE(random_state=42).fit_resample(X_train, y_train)

print(X_os.shape, y_os.shape)

(2948388, 19) (2948388, 1)


In [16]:
from sklearn.ensemble import RandomForestClassifier

# Model 3: fit a fresh random forest on the SMOTE-oversampled training data.
# Note this rebinds `clf`, replacing the previously fitted model.
# `random_state` is fixed for reproducibility.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Flatten the single-column label frame to the 1-D array `fit` expects.
clf.fit(X_os, y_os.to_numpy().ravel())

In [17]:
from sklearn.metrics import classification_report

print("Model 3: Oversampling\n")

# Evaluate the current `clf` on the resampled data it was fitted on, then
# on the original training split, then on the held-out test split.
for heading, features, labels in (
    ("On Oversampled Training Data\n", X_os, y_os),
    ("On Original Training Data\n", X_train, y_train),
    ("On Testing Data\n", X_test, y_test),
):
    print(heading)
    print(classification_report(labels, clf.predict(features)))

Model 3: Oversampling

On Oversampled Training Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1474194
           1       1.00      1.00      1.00   1474194

    accuracy                           1.00   2948388
   macro avg       1.00      1.00      1.00   2948388
weighted avg       1.00      1.00      1.00   2948388

On Original Training Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1474194
           1       1.00      1.00      1.00      7721

    accuracy                           1.00   1481915
   macro avg       1.00      1.00      1.00   1481915
weighted avg       1.00      1.00      1.00   1481915

On Testing Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368549
           1       0.53      0.82      0.64      1930

    accuracy                           1.00    370479
   macro avg       0.76      0.