In [2]:
import os
import sys
from pathlib import Path

# Make the parent folder importable: it is the folder containing the
# `paysim_analysis` package directory used by the imports below.
lib_dir = Path("..")
module_path = os.path.abspath(lib_dir)
if module_path not in sys.path:
    # Append (rather than prepend) so already-importable modules keep priority.
    sys.path.append(module_path)

In [3]:
import pandas as pd
from paysim_analysis.classification import *
from paysim_analysis.utils import get_project_folder
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, matthews_corrcoef, roc_auc_score, plot_roc_curve

In [5]:
# Name of the label column in every dataset loaded below.
target = "isFraud"
# Data folder: the "data" directory next to the project folder
# reported by the project helper `get_project_folder()`.
dir_ = get_project_folder().parent / "data"

In [6]:
# Datasets used in this notebook. The Tomek-links variant
# ("balanced_datasets/tomek_links_balanced.csv") is currently not loaded.
smote_df = pd.read_csv(dir_ / "balanced_datasets/smote_balanced.csv")
near_miss3_df = pd.read_csv(dir_ / "balanced_datasets/near_miss3_balanced.csv")
# Despite living under "balanced_datasets", this file is used below as the
# unbalanced (only normalized) data.
df_unbalanced = pd.read_csv(dir_ / "balanced_datasets/normalized.csv")

# Row counts, one per line, in the order: SMOTE, NearMiss-3, unbalanced.
print(len(smote_df), len(near_miss3_df), len(df_unbalanced), sep="\n")

400000
16426
2770409


## SVM on the unbalanced dataset

We use the original dataset (only z-score normalization is applied) and make the SVM penalize mistakes on the minority class in proportion to how under-represented that class is (`class_weight='balanced'`).

In [None]:
%%time
# Train a class-weighted linear SVM on the full (unbalanced) data and predict
# on a stratified 30% hold-out split.

# Fixed seed so the train/test split (and the solver's internal shuffling)
# is the same on every Restart & Run All; without it the metrics computed
# from y_test / y_pred change from run to run.
RANDOM_STATE = 42

# Features are every column except the label column.
cols = [c for c in df_unbalanced.columns if c != target]
X = df_unbalanced[cols]
# Use `target` rather than a hard-coded column name so the feature/label
# split stays consistent if the label column is ever renamed.
y = df_unbalanced[target]

# stratify=y keeps the fraud / non-fraud ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)
print(len(df_unbalanced))

# class_weight='balanced' penalizes mistakes on the minority class by
# weighting each class inversely to its frequency in y_train.
svc_model = LinearSVC(class_weight='balanced', random_state=RANDOM_STATE)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)

2770409


In [None]:
%%time
# Run the project helper on the unbalanced frame. It returns three values,
# bound here as the model (presumably already fitted — confirm in the helper),
# the predictions, and the labels they are compared against.
# NOTE(review): this rebinds `y_pred` and `y_test` from the previous cell, so
# the metrics cells below describe this run, not the manual LinearSVC run.
model, y_pred, y_test = svm_on_unbalanced_df(df_unbalanced, target)

In [None]:
# Macro-averaged precision / recall / F1 on the held-out labels.
# precision_recall_fscore_support returns (precision, recall, fscore, support);
# with average='macro' the support entry is always None, so only the first
# three values are printed and the label names exactly those.
print("Precision, Recall, F1 (macro):",
      precision_recall_fscore_support(y_test, y_pred, average='macro')[:3])
print("MCC", matthews_corrcoef(y_test, y_pred))
# NOTE(review): y_pred holds hard class labels, so this AUC is based on a
# single operating point rather than a ranking of decision scores. Passing
# the model's decision_function output for the same test rows would give the
# usual ROC AUC — those rows are not available in this cell.
print("AUC", roc_auc_score(y_test, y_pred))

In [None]:
# Confusion matrix for the predictions above. In scikit-learn's convention
# rows are the true classes and columns the predicted classes.
confusion_matrix(y_test, y_pred)

## SVM on balanced datasets

### NearMiss 3

In [6]:
%%time
# Evaluate the SVM on the NearMiss-3 dataset via the project helper.
# It returns two values; `mean_scores` is displayed as the cell output.
# NOTE(review): the displayed keys (fit_time, score_time, test_*) look like
# averaged cross-validation results — confirm in svm_on_balanced_df.
scores, mean_scores = svm_on_balanced_df(near_miss3_df, target, verbose=False)
mean_scores



CPU times: user 79.6 ms, sys: 147 ms, total: 227 ms
Wall time: 11.4 s




{'fit_time': 9.32864682674408,
 'score_time': 0.016270828247070313,
 'test_precision_macro': 0.7987156568678276,
 'test_recall_macro': 0.7720398392560257,
 'test_f1_macro': 0.7658166637439404,
 'test_roc_auc': 0.8442913166736077,
 'test_matthews_corrcoef': 0.5698885417800353}

### Hybrid Approach

In [7]:
%%time
# Evaluate the SVM on the SMOTE-based dataset (`smote_df`) via the same
# project helper; `mean_scores` is displayed as the cell output.
# NOTE(review): this rebinds `scores` and `mean_scores` from the NearMiss-3
# cell, so those earlier results are no longer available afterwards.
scores, mean_scores = svm_on_balanced_df(smote_df, target, verbose=False)
mean_scores



CPU times: user 118 ms, sys: 85.6 ms, total: 204 ms
Wall time: 3min 25s




{'fit_time': 204.1373915910721,
 'score_time': 0.13260843753814697,
 'test_precision_macro': 0.9178860781562082,
 'test_recall_macro': 0.9167249999999999,
 'test_f1_macro': 0.9166670908707187,
 'test_roc_auc': 0.9753736264999999,
 'test_matthews_corrcoef': 0.8346102638494888}