In [39]:
import time
from typing import List

import eli5
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from category_encoders import TargetEncoder
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier




In [6]:
# plt.rcParams['figure.facecolor'] = 'white'

In [7]:
# %load_ext autoreload
# %autoreload 2

#### Custom Transformers

In [8]:
# Define custom transformer to compute ratio of two columns
class FraudFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            handle_unknown: str = "ignore",
            unknown_value: float = 0.0,
            ratio_upper_threshold: float = 100,
            ratio_lower_threshold: float = 0,
            algorithm: str = "linear",
    ):
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self.ratio_upper_threshold = ratio_upper_threshold
        self.ratio_lower_threshold = ratio_lower_threshold
        self.algorithm = algorithm

        self.epsilon = 1e-4
        self.numeric_features = [
            'amount',
            'oldbalanceOrg',
            'newbalanceOrig',
            'oldbalanceDest',
            'newbalanceDest',
        ]

        self.medians = None
        self.means = None
        self.feature_names_out = None

    def _compute_ratio_(self, X: pd.DataFrame, x_col: str, y_col: str) -> pd.Series:
        ratios = X[x_col] / X[y_col]

        # return pd.Series(np.where(ratios < self.ratio_threshold, self.ratio_threshold, ratios))
        return ratios.apply(lambda x: np.where(x > self.ratio_upper_threshold, self.ratio_upper_threshold, x)).apply(
            lambda x: np.where(x < self.ratio_lower_threshold, self.ratio_lower_threshold, x))

    def fit(self, X, y=None) -> None:
        self.means = {feat: X[feat].mean() for feat in self.numeric_features}
        self.medians = {feat: X[feat].median() for feat in self.numeric_features}

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:

        new_df = pd.DataFrame()

        for feat in self.numeric_features:
            if feat != 'amount':
                new_df[f'{feat}_amount_ratio'] = self._compute_ratio_(X, feat, 'amount')

            if self.algorithm == 'linear':
                new_df[f'{feat}_is_zero'] = (X[feat] > self.epsilon).astype(int)
                new_df[f'{feat}_is_bigger_mean'] = (X[feat] > self.means[feat]).astype(int)
                new_df[f'{feat}_is_bigger_median'] = (X[feat] > self.medians[feat]).astype(int)

        new_df['dest_is_bigger_org'] = (X['oldbalanceDest'] > X['oldbalanceOrg']).astype(int)

        self.feature_names_out = new_df.columns.tolist()

        return new_df

    def get_feature_names_out(self, input_features=None):
        return self.feature_names_out

In [9]:
class LogTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, handle_inf="median", fill_value=0):
        self.handle_inf = handle_inf
        self.fill_value = fill_value
        self.feature_names_out = None

    def fit(self, X, y=None):

        if self.handle_inf == "median":
            logged_X = np.log(X)
            self.fill_value = np.nanmedian(logged_X, axis=0)
        elif self.handle_inf == "mean":
            logged_X = np.log(X)
            self.fill_value = np.nanmean(logged_X, axis=0)
        elif self.handle_inf == "const":
            pass

        self.feature_names_out = [f"{feat}_log" for feat in X.columns]

        return self

    def transform(self, X):
        logged_X = np.log(X)

        return np.where(np.isinf(logged_X), np.expand_dims(self.fill_value, axis=0), logged_X)

    def get_feature_names_out(self, input_features=None):
        if input_features is not None:
            return input_features

        return self.feature_names_out


In [10]:
class TargetEncoderFixed(TargetEncoder):
    def get_feature_names_out(self, *arg, **kargs) -> List[str]:
        return self.feature_names_out_

#### Loading data

In [11]:
fraud = pd.read_csv(r'onlinefraud\onlinefraud.csv', nrows=500000)  #nrows=100000

In [12]:
d = fraud.head(1000)

In [13]:
data = fraud

#### Features

In [14]:
features_to_exclude = [
    'isFlaggedFraud',
    'nameOrig',
    'nameDest',
]

In [15]:
features_to_exclude

['isFlaggedFraud', 'nameOrig', 'nameDest']

In [16]:
target_features = ['isFraud', 'isFlaggedFraud']

target = "isFraud"

In [17]:
features = [col for col in data.columns if col not in target_features + features_to_exclude]

In [18]:
data.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [19]:
numeric_features = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]

In [20]:
nominal_features = [
    "type",
]

In [21]:
data[features].head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0


In [22]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


#### Pipelines

In [23]:
linear_generator = ColumnTransformer(
    transformers=[
        ('ffg', FraudFeatureGenerator(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown="ignore"), nominal_features),
    ],
    remainder='drop'
)

linear_preprocessor = ColumnTransformer(
    transformers=[
        ('log', LogTransformer(handle_inf='const', fill_value=0), numeric_features),
        ('scl', StandardScaler(), numeric_features),
    ],
    remainder='drop'
)

liner_data_pipeline = ColumnTransformer(
    transformers=[
        ('generator', linear_generator, features),
        ('preprocessor', linear_preprocessor, features),
    ],
    remainder='drop'
)

# Define the pipeline
pipeline = Pipeline([
    ('general_proc', liner_data_pipeline),
    ('regressor', LogisticRegression())
])

In [24]:
tree_generator = ColumnTransformer(
    transformers=[
        ('ffg', FraudFeatureGenerator(algorithm='tree'), numeric_features),
        ('cat', TargetEncoderFixed(handle_unknown="median"), nominal_features),
    ],
    remainder='drop'
)

tree_preprocessor = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(fill_value=-1), numeric_features),
    ],
    remainder='drop'
)

tree_data_pipeline = ColumnTransformer(
    transformers=[
        ('generator', tree_generator, features),
        ('preprocessor', tree_preprocessor, features),
    ],
    remainder='drop'
)

# Define the pipeline
pipeline = Pipeline([
    ('general_proc', tree_data_pipeline),
    ('regressor', RandomForestClassifier())
])


##### test transformers

In [25]:
tree_generator.fit_transform(d, d['isFraud'])

array([[17.2908765 , 16.2908765 ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [11.39796597, 10.39796597,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.0403924 ],
       ...,
       [ 5.45425101,  4.45425101,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 7.7631187 ,  6.7631187 ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.23593371,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

In [26]:
tree_generator.get_feature_names_out()

array(['ffg__oldbalanceOrg_amount_ratio',
       'ffg__newbalanceOrig_amount_ratio',
       'ffg__oldbalanceDest_amount_ratio',
       'ffg__newbalanceDest_amount_ratio', 'ffg__dest_is_bigger_org',
       'cat__type'], dtype=object)

In [23]:
# generator.fit_transform(d)

In [24]:
# generator.get_feature_names_out()

In [25]:
# preprocessor.fit_transform(d)

In [26]:
pipeline.fit(d, d['isFraud'])

In [27]:
eli5.explain_weights_df(pipeline[-1], feature_names=pipeline[:-1].get_feature_names_out())

Unnamed: 0,feature,weight,std
0,generator__ffg__oldbalanceOrg_amount_ratio,0.311519,0.311484
1,preprocessor__imputer__amount,0.162009,0.16902
2,preprocessor__imputer__oldbalanceDest,0.123631,0.182555
3,generator__ffg__oldbalanceDest_amount_ratio,0.116195,0.185613
4,preprocessor__imputer__oldbalanceOrg,0.079014,0.133722
5,generator__cat__type,0.064649,0.108893
6,generator__ffg__newbalanceDest_amount_ratio,0.045492,0.097958
7,preprocessor__imputer__newbalanceDest,0.044861,0.084029
8,generator__ffg__newbalanceOrig_amount_ratio,0.02193,0.096987
9,generator__ffg__dest_is_bigger_org,0.021797,0.06491


#### Models

In [28]:
dummy_mf = make_pipeline(
    DummyClassifier(strategy='most_frequent'),
)
dummy_strf = make_pipeline(
    DummyClassifier(strategy='stratified'),
)

In [29]:

lr = Pipeline([
    ('general_proc', liner_data_pipeline),
    ('regressor', LogisticRegression())
])

svm = Pipeline([
    ('general_proc', liner_data_pipeline),
    ('regressor', SVC())
])

mlp = Pipeline([
    ('general_proc', liner_data_pipeline),
    ('regressor', MLPClassifier())
])

In [30]:


ctb = Pipeline([
    ('general_proc', tree_data_pipeline),
    ('regressor', CatBoostClassifier())
])
xgb = Pipeline([
    ('general_proc', tree_data_pipeline),
    ('regressor', XGBClassifier())
])
lgbm = Pipeline([
    ('general_proc', tree_data_pipeline),
    ('regressor', LGBMClassifier())
])
rfc = Pipeline([
    ('general_proc', tree_data_pipeline),
    ('regressor', RandomForestClassifier())
])

In [58]:
pipelines = [
    ("Dummy_mf", dummy_mf),
    ("Dummy_strf", dummy_strf),
    ("LinReg", lr),
    ("XGB", xgb),
    ("LGBM", lgbm),
    ("RFC", rfc),
    ("CatBoost", ctb),
    ("MLP", mlp),
    # ("SVM", svm),
]

#### Data split

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, stratify=data[target])

In [70]:
X_train.shape, X_test.shape

((400000, 7), (100000, 7))

In [34]:
y_train.value_counts()

0    399814
1       186
Name: isFraud, dtype: int64

In [68]:
y_test.value_counts()

0    99953
1       47
Name: isFraud, dtype: int64

In [36]:
dummy_mf.fit(X_train, y_train)
dummy_strf.fit(X_train, y_train)

In [42]:
model = mlp

In [43]:
model.fit(X_train, y_train)

  result = func(self.values, **kwargs)


In [44]:
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))

print("Confusion matrix:")
print(cm)



  result = func(self.values, **kwargs)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     99953
           1       1.00      0.62      0.76        47

    accuracy                           1.00    100000
   macro avg       1.00      0.81      0.88    100000
weighted avg       1.00      1.00      1.00    100000

Confusion matrix:
[[99953     0]
 [   18    29]]


In [45]:
y_pred = model.predict(X_train)
cm = confusion_matrix(y_train, y_pred)

print(classification_report(y_train, y_pred))

print("Confusion matrix:")
print(cm)

  result = func(self.values, **kwargs)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    399814
           1       0.97      0.58      0.73       186

    accuracy                           1.00    400000
   macro avg       0.99      0.79      0.86    400000
weighted avg       1.00      1.00      1.00    400000

Confusion matrix:
[[399811      3]
 [    78    108]]


In [55]:
weights = eli5.explain_weights_df(model[-1], feature_names=model[:-1].get_feature_names_out())

In [56]:
weights

#### Advanced (don't run)

In [59]:
from notebooks.Dzim.data_mining.validation.training import estimate_multiple_models
from notebooks.Dzim.data_mining.validation.training import estimate_model

#### training
model_names, model_pipelines = zip(*pipelines)
metrics, weights = estimate_multiple_models(
    model_pipelines,
    lambda pipeline, _: estimate_model(
        pipeline,
        (X_train, X_test, y_train, y_test),
    ),
    model_names
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  result = func(self.values, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


Learning rate set to 0.13305
0:	learn: 0.2696743	total: 49.7ms	remaining: 49.7s
1:	learn: 0.1087424	total: 96.3ms	remaining: 48s
2:	learn: 0.0412317	total: 140ms	remaining: 46.7s
3:	learn: 0.0180308	total: 191ms	remaining: 47.7s
4:	learn: 0.0086005	total: 243ms	remaining: 48.4s
5:	learn: 0.0049101	total: 293ms	remaining: 48.6s
6:	learn: 0.0029461	total: 347ms	remaining: 49.3s
7:	learn: 0.0019898	total: 394ms	remaining: 48.9s
8:	learn: 0.0014412	total: 441ms	remaining: 48.6s
9:	learn: 0.0012119	total: 489ms	remaining: 48.4s
10:	learn: 0.0010730	total: 530ms	remaining: 47.6s
11:	learn: 0.0009188	total: 579ms	remaining: 47.7s
12:	learn: 0.0008855	total: 629ms	remaining: 47.8s
13:	learn: 0.0007995	total: 680ms	remaining: 47.9s
14:	learn: 0.0007460	total: 725ms	remaining: 47.6s
15:	learn: 0.0007060	total: 772ms	remaining: 47.5s
16:	learn: 0.0006729	total: 817ms	remaining: 47.2s
17:	learn: 0.0006645	total: 865ms	remaining: 47.2s
18:	learn: 0.0006446	total: 914ms	remaining: 47.2s
19:	learn: 0

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


In [60]:
metrics

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Dummy_mf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dummy_strf,0.005,0.005,0.005,0.005,0.022,0.022,0.023,0.021
LinReg,0.588,0.588,0.93,0.43,0.658,0.657,0.923,0.511
XGB,0.981,0.981,1.0,0.962,0.956,0.956,1.0,0.915
LGBM,0.04,0.039,0.028,0.07,0.0,-0.001,0.0,0.0
RFC,1.0,1.0,1.0,1.0,0.956,0.956,1.0,0.915
CatBoost,0.989,0.989,1.0,0.978,0.805,0.805,0.875,0.745
MLP,0.714,0.714,0.955,0.57,0.72,0.72,0.964,0.574


In [49]:
print(weights)

                                                       weight
model    feature                                             
LinReg   generator__ffg__oldbalanceOrg_amount_ratio     5.090
         generator__cat__type_TRANSFER                  4.133
         generator__ffg__oldbalanceOrg_is_bigger_mean   2.925
         generator__cat__type_CASH_OUT                  2.765
         preprocessor__scl__oldbalanceOrg               1.874
...                                                       ...
CatBoost preprocessor__imputer__oldbalanceDest          0.063
         generator__ffg__newbalanceDest_amount_ratio    0.061
         generator__ffg__newbalanceOrig_amount_ratio    0.045
         preprocessor__imputer__newbalanceOrig          0.031
         generator__ffg__dest_is_bigger_org             0.017

[69 rows x 1 columns]


In [56]:
for name, _ in pipelines:
    try:
        print("#" * 80)
        print(name)
        print(weights.loc[name])
    except KeyError:
        print('Not Supported')

################################################################################
Dummy_mf
Not Supported
################################################################################
Dummy_strf
Not Supported
################################################################################
LinReg
                                                 weight
feature                                                
generator__ffg__oldbalanceOrg_amount_ratio        5.090
generator__cat__type_TRANSFER                     4.133
generator__ffg__oldbalanceOrg_is_bigger_mean      2.925
generator__cat__type_CASH_OUT                     2.765
preprocessor__scl__oldbalanceOrg                  1.874
generator__ffg__amount_is_bigger_mean             0.620
preprocessor__scl__oldbalanceDest                 0.563
preprocessor__scl__amount                         0.297
preprocessor__log__oldbalanceOrg                  0.186
generator__ffg__dest_is_bigger_org                0.155
generator__ffg__newbalanceDest

##### Hyperparameter optimization

In [96]:
processed_data = tree_data_pipeline.fit_transform(data[features], data[target])

In [97]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_data, data[target], test_size=0.2, stratify=data[target])
# X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

In [98]:
X_train.shape, X_test.shape  # X_cv.shape

((400000, 11), (100000, 11))

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a RandomForestClassifier object
rfc = RandomForestClassifier()

In [None]:

# Create a GridSearchCV object and fit the data
start = time.time()
grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
end = time.time()
print(f"Computation tmie: {end-start}")