In [1]:
# Import Libraries
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import cufflinks as cf
import plotly
import datetime
import math
import matplotlib
import sklearn
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Print versions of libraries
print(f"Numpy version : Numpy {np.__version__}")
print(f"Pandas version : Pandas {pd.__version__}")
print(f"Matplotlib version : Matplotlib {matplotlib.__version__}")
print(f"Seaborn version : Seaborn {sns.__version__}")
print(f"SkLearn version : SkLearn {sklearn.__version__}")
# print(f"Cufflinks version : cufflinks {cf.__version__}")
print(f"Plotly version : plotly {plotly.__version__}")

# Magic Functions for In-Notebook Display
%matplotlib inline

# Setting seabon style
sns.set(style='darkgrid', palette='colorblind')

Numpy version : Numpy 1.23.5
Pandas version : Pandas 1.5.3
Matplotlib version : Matplotlib 3.7.0
Seaborn version : Seaborn 0.12.2
SkLearn version : SkLearn 1.2.1
Plotly version : plotly 5.13.0


In [2]:
seed = 42
accuracy = 'average_precision' # equals to area under recall precision curve
lr_max_iterations = 10000 # increased max_iter to allow lbfgs-solver converging
results = []

In [3]:
df = pd.read_parquet("/home/onyxia/work/df_cct_final.parquet")

In [4]:
import random
random.seed(42)
df_fraud = df[df["Is Fraud?"] == 1]
df_not_fraud = df[df["Is Fraud?"] == 0]
df_fraud_list = df_fraud.index.tolist()
df_not_fraud_list = df_not_fraud.index.tolist()

df_fraud_rs_index = random.sample(df_fraud_list, int(len(df_fraud_list)*0.1))
df_fraud_rs = df_fraud.loc[df_fraud_rs_index]
df_not_fraud_rs_index = random.sample(df_not_fraud_list, int(len(df_not_fraud_list)*0.1))
df_not_fraud_rs = df_not_fraud.loc[df_not_fraud_rs_index]
df = pd.concat([df_not_fraud_rs,df_fraud_rs], axis=0).reset_index(drop=True)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Pour "Use Chip":

enc = OneHotEncoder(handle_unknown='ignore')

features_array = enc.fit_transform(df[["Use Chip"]]).toarray()

features_labels = np.hstack(np.array(enc.categories_))

df = pd.concat([df, pd.DataFrame(features_array, columns = features_labels)], axis = 1)
# Pour "day_of_week" : 

enc = OneHotEncoder(handle_unknown='ignore')

features_array = enc.fit_transform(df[["day_of_week"]]).toarray()

features_labels = np.hstack(np.array(enc.get_feature_names_out()))

df = pd.concat([df, pd.DataFrame(features_array, columns = features_labels)], axis = 1)
# Pour "Card Brand" : 

enc = OneHotEncoder(handle_unknown='ignore')

features_array = enc.fit_transform(df[["Card Brand"]]).toarray()

features_labels = np.hstack(np.array(enc.get_feature_names_out()))

df = pd.concat([df, pd.DataFrame(features_array, columns = features_labels)], axis = 1)
# Pour "Card Type" : 

enc = OneHotEncoder(handle_unknown='ignore')

features_array = enc.fit_transform(df[["Card Type"]]).toarray()

features_labels = np.hstack(np.array(enc.get_feature_names_out()))

df = pd.concat([df, pd.DataFrame(features_array, columns = features_labels)], axis = 1)

selection = ['Card', 'Year', 'Month', 'Day', 'Hours', 'Amount',
       'Credit Limit', 'Year PIN last Changed',
       'delta_t_s', 'delta_t_s_card',
       'amt/daily_income', 'Retired', 'daily_amount',
       'nb_daily_declines_card', 'nb_weekly_declines_card',
       'nb_monthly_declines_card', 'bad_pin', 'insufficient_balance',
       'hr_nbt/last_30d_av_hr_nbt', 'day_nbt/last_30d_av_day_nbt',
       'last_3d_amt/nbt', 'Chip Transaction',
       'Online Transaction', 'Swipe Transaction', 'day_of_week_Friday',
       'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday',
       'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday',
       'Card Brand_Amex', 'Card Brand_Discover', 'Card Brand_Mastercard',
       'Card Brand_Visa', 'Card Type_Credit', 'Card Type_Debit',
       'Card Type_Debit (Prepaid)', 'Is Fraud?']

estimators = ['Card', 'Year', 'Month', 'Day', 'Hours', 'Amount',
       'Credit Limit', 'Year PIN last Changed',
       'delta_t_s', 'delta_t_s_card',
       'amt/daily_income', 'Retired', 'daily_amount',
       'nb_daily_declines_card', 'nb_weekly_declines_card',
       'nb_monthly_declines_card', 'bad_pin', 'insufficient_balance',
       'hr_nbt/last_30d_av_hr_nbt', 'day_nbt/last_30d_av_day_nbt',
       'last_3d_amt/nbt', 'Chip Transaction',
       'Online Transaction', 'Swipe Transaction', 'day_of_week_Friday',
       'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday',
       'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday',
       'Card Brand_Amex', 'Card Brand_Discover', 'Card Brand_Mastercard',
       'Card Brand_Visa', 'Card Type_Credit', 'Card Type_Debit',
       'Card Type_Debit (Prepaid)']
df = df[selection]
#df_selec = df_selec[np.isfinite(df_selec).all(1)]



In [6]:
from sklearn.model_selection import train_test_split

X_orig = df.drop('Is Fraud?', axis=1)
y_orig = df["Is Fraud?"]

X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig, test_size=0.3,
                                                    random_state=seed, stratify=y_orig)
test_data = [X_test, y_test]

In [7]:
print(f"Training data class counts:\n{y_train.value_counts()}")
print('')
print(f"Test data class counts:\n{y_test.value_counts()}")
print('')
assert(y_test.shape[0]/y_orig.shape[0] > 0.19)

Training data class counts:
0    1705000
1       2082
Name: Is Fraud?, dtype: int64

Test data class counts:
0    730714
1       893
Name: Is Fraud?, dtype: int64



In [8]:
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [9]:
preprocessor = ColumnTransformer(
        remainder='passthrough', #passthough features not listed
        transformers=[
            ('std', RobustScaler(), ['Card', 'Year', 'Month', 'Day', 'Hours', 'Amount',
       'Credit Limit', 'Year PIN last Changed',
       'delta_t_s', 'delta_t_s_card',
       'amt/daily_income', 'daily_amount',
       'nb_daily_declines_card', 'nb_weekly_declines_card',
       'nb_monthly_declines_card',
       'hr_nbt/last_30d_av_hr_nbt', 'day_nbt/last_30d_av_day_nbt',
       'last_3d_amt/nbt']), 
        ])

## Model Baseline with class weight

In [10]:
model = Pipeline(steps = [('pr', preprocessor), ('lr', LogisticRegression(class_weight = "balanced", random_state=seed))])
model.fit(X_train,y_train)

In [11]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
cnf_matrix = metrics.confusion_matrix(y_test,y_pred)
cnf_matrix

              precision    recall  f1-score   support

           0       1.00      0.82      0.90    730714
           1       0.01      0.75      0.01       893

    accuracy                           0.82    731607
   macro avg       0.50      0.79      0.45    731607
weighted avg       1.00      0.82      0.90    731607



array([[597315, 133399],
       [   220,    673]])

## Oversampling (SMOTE)

In [16]:
oversampler = SMOTE(sampling_strategy='minority', n_jobs=-1, random_state=seed)
log_reg_os  = LogisticRegressionCV(Cs=5, scoring=accuracy, max_iter=lr_max_iterations, n_jobs=-1, random_state=seed)


In [13]:
!pip install imblearn
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.under_sampling import RandomUnderSampler



In [17]:
model_smote = imbPipeline([
    ('sampler', oversampler),
    ('pr', preprocessor), 
    ('estimator', log_reg_os)])

model_smote.fit(X_train, y_train)

In [18]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

y_pred_smote = model_smote.predict(X_test)
print(metrics.classification_report(y_test, y_pred_smote))
cnf_matrix_smote = metrics.confusion_matrix(y_test,y_pred_smote)
cnf_matrix_smote

              precision    recall  f1-score   support

           0       1.00      0.84      0.91    730714
           1       0.01      0.71      0.01       893

    accuracy                           0.84    731607
   macro avg       0.50      0.78      0.46    731607
weighted avg       1.00      0.84      0.91    731607



array([[616434, 114280],
       [   258,    635]])

## Undersampling

In [19]:
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=seed)
log_reg_us   = LogisticRegressionCV(Cs=5, scoring=accuracy, max_iter=lr_max_iterations, n_jobs=-1, random_state=seed)

model_us = imbPipeline([
        ('sampler', undersampler),
        ('pr', preprocessor),
        ('estimator', log_reg_us)])
model_us.fit(X_train, y_train)

In [20]:
y_pred_us = model_us.predict(X_test)
print(metrics.classification_report(y_test, y_pred_us))
cnf_matrix_us = metrics.confusion_matrix(y_test,y_pred_us)
cnf_matrix_us

              precision    recall  f1-score   support

           0       1.00      0.83      0.90    730714
           1       0.01      0.75      0.01       893

    accuracy                           0.83    731607
   macro avg       0.50      0.79      0.46    731607
weighted avg       1.00      0.83      0.90    731607



array([[603875, 126839],
       [   223,    670]])

## Combined Sampling

In [14]:
combined_sampler = SMOTETomek(n_jobs=-1, random_state=seed)
log_reg_comb   = LogisticRegressionCV(Cs=5, scoring=accuracy, max_iter=lr_max_iterations, n_jobs=-1, random_state=seed)

model_comb = imbPipeline([
        ('sampler', combined_sampler),
        ('pr', preprocessor),
        ('estimator', log_reg_comb)])

model_comb.fit(X_train, y_train)


In [15]:
y_pred_comb = model_comb.predict(X_test)
print(metrics.classification_report(y_test, y_pred_comb))
cnf_matrix_comb = metrics.confusion_matrix(y_test,y_pred_comb)
cnf_matrix_comb

              precision    recall  f1-score   support

           0       1.00      0.84      0.92    730714
           1       0.01      0.71      0.01       893

    accuracy                           0.84    731607
   macro avg       0.50      0.78      0.46    731607
weighted avg       1.00      0.84      0.91    731607



array([[616505, 114209],
       [   259,    634]])