In [1]:
import os, pickle, json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression
from sklearn.metrics import recall_score, confusion_matrix, classification_report

RAW_CSV   = Path('data/raw/ps_raw.csv')

In [2]:
df_raw = pd.read_csv(RAW_CSV)
print(df_raw.shape)
df_raw.head()

(6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
cat_cols = ['type','nameOrig','nameDest']
encoders = {}
df = df_raw.copy()
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,3,9839.64,757869,170136.0,160296.36,1662094,0.0,0.0,0,0
1,1,3,1864.28,2188998,21249.0,19384.72,1733924,0.0,0.0,0,0
2,1,4,181.0,1002156,181.0,0.0,439685,0.0,0.0,1,0
3,1,1,181.0,5828262,181.0,0.0,391696,21182.0,0.0,1,0
4,1,3,11668.14,3445981,41554.0,29885.86,828919,0.0,0.0,0,0


In [4]:
X = df.drop('isFraud', axis=1).values
y = df['isFraud'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42, shuffle=True)
print({ 'train': X_train.shape, 'test': X_test.shape })
print('Fraud rate train %.4f%% | test %.4f%%' % (y_train.mean()*100, y_test.mean()*100))

{'train': (5090096, 10), 'test': (1272524, 10)}
Fraud rate train 0.1291% | test 0.1291%


In [9]:
def pipe(model):
    return Pipeline([('scaler', StandardScaler()), ('model', model)])

In [10]:
models = {
    'knn': pipe(KNeighborsClassifier(n_neighbors=5, weights='uniform', n_jobs=-1)), # dol el knn
    'lda': pipe(LinearDiscriminantAnalysis()),
    'linreg': pipe(LinearRegression()) #redefined yep sklearn yep
}
for name, mdl in models.items():
    mdl.fit(X_train, y_train)
    print(f'{name} fitted')


knn fitted
lda fitted
linreg fitted


In [12]:
def lr_scores(model, X):
    # pipeline steps: scaler then model
    scaler = model.named_steps['scaler']
    reg    = model.named_steps['model']
    Xs = scaler.transform(X)
    return reg.predict(Xs)

# metrics = {}
# for name, mdl in models.items():
#     if name == 'linreg':
#         # threshold at mean of test scores
#         scores = lr_scores(mdl, X_test)
#         y_pred = (scores > scores.mean()).astype(int)
#     else:
#         y_pred = mdl.predict(X_test)
#     rec = recall_score(y_test, y_pred)
#     metrics[name] = rec
# metrics

In [13]:
knn_pred = models['knn'].predict(X_test)
lda_pred = models['lda'].predict(X_test)
lr_score = lr_scores(models['linreg'], X_test)
lr_mean  = lr_score.mean()

ensemble = np.zeros_like(knn_pred)
for i in range(len(knn_pred)):
    # If "non-fraud" is indicated by either KNN or LDA
    if knn_pred[i] == 0 or lda_pred[i] == 0:
        if lr_score[i] < lr_mean:
            ensemble[i] = 0  # Predict non-fraud
    # If "fraud" is indicated by either KNN or LDA
    elif knn_pred[i] == 1 or lda_pred[i] == 1:
        if lr_score[i] > lr_mean:
            ensemble[i] = 1  # Predict fraud
    # Otherwise, allocate predicted values from KNN to remaining cases
    else:
        ensemble[i] = knn_pred[i]

print('Ensemble recall:', recall_score(y_test, ensemble))
print(confusion_matrix(y_test, ensemble))

Ensemble recall: 0.3377967133292757
[[1270881       0]
 [   1088     555]]


In [9]:
ART_DIR = Path('artifacts'); ART_DIR.mkdir(exist_ok=True)
import joblib, numpy as np
for name, mdl in models.items():
    joblib.dump(mdl, ART_DIR / f'{name}.pkl')
np.save(ART_DIR / 'y_true.npy', y_test)
np.save(ART_DIR / 'y_pred.npy', ensemble)
print('Saved models and predictions to artifacts/')

Saved models and predictions to artifacts/


zero matrix with same shape