In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[?25l[K     |█                               | 10 kB 26.3 MB/s eta 0:00:01[K     |██▏                             | 20 kB 30.6 MB/s eta 0:00:01[K     |███▏                            | 30 kB 34.4 MB/s eta 0:00:01[K     |████▎                           | 40 kB 21.7 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 19.0 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 21.6 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 21.9 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 22.9 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 24.9 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 24.2 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 24.2 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 24.2 MB/s eta 0:00:01[K     |█████████████▉                  | 133 kB 24.2 MB/s eta 0:

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline 

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from collections import Counter
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.utils import resample
from xgboost import XGBClassifier

import optuna

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
train=pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_train.csv")
val=pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_val.csv")
test=pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_test.csv")

In [5]:
MERCHANT_COLUMNS_PROPERTIES=pd.read_excel(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/column_description.xlsx")
def get_list_cat_columns():
    feature_cad = MERCHANT_COLUMNS_PROPERTIES.copy()
    feature_cad = feature_cad[(feature_cad["ORG_SOURCE"].isin(["CAD", "MIS_MRCH_DIM", "MIS_MR_CST_DIM","MIS_CST_FCT","MIS_MRCH_FCT","EXT"]))& (feature_cad["COLUMN_TP"] == "CAT")
    ]
    return feature_cad["COLUMN_NAME"].tolist()

In [6]:
def reduce_memory_usage(df):   
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                    
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [7]:

def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    
    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [8]:
train = reduce_memory_usage(train)
val = reduce_memory_usage(val)
test = reduce_memory_usage(test)

Memory usage of dataframe is 1.166229248046875 MB
Memory usage of dataframe after reduction 0.3381690979003906 MB
Reduced by 71.00320554755986 % 
Memory usage of dataframe is 0.2066650390625 MB
Memory usage of dataframe after reduction 0.058624267578125 MB
Reduced by 71.63319551092735 % 
Memory usage of dataframe is 0.242523193359375 MB
Memory usage of dataframe after reduction 0.06910324096679688 MB
Reduced by 71.50654334969171 % 


In [9]:
train.head()

Unnamed: 0,ID,MER_TYPE,BRANCH,CURR_NUM,DAY_REINSTATED,DAYS_AVGE,FL_LIM1,FL_LIM3,FL_LIM4,GROSS_SALE,...,DAY_CLOSE_YEAR,DAY_CLOSE_MONTH,DAY_CLOSE_DAY,DAY_START_YEAR,DAY_START_MONTH,DAY_START_DAY,DAY_SUBM_YEAR,DAY_SUBM_MONTH,DAY_SUBM_DAY,MERCH_FR
0,2208,5411,126,704,0,1,150,0,5000,671981120.0,...,2099,12,31,1970,1,1,1970,1,1,1
1,53,7011,753,704,0,2,150,0,0,0.0,...,2016,11,14,1970,1,1,1970,1,1,0
2,52,5411,128,704,0,2,150,0,5000,204670160.0,...,2099,12,31,1970,1,1,1970,1,1,0
3,163,5977,741,704,0,3,150,0,0,0.0,...,2016,10,3,1970,1,1,1970,1,1,0
4,657,5411,721,704,0,1,150,0,5000,0.0,...,2020,7,24,1970,1,1,1970,1,1,0


In [10]:
useful_features = [c for c in train.columns if ((c not in ['ID', 'MERCH_FR', 'BRANCH']) and ("DAY_" not in c))] 
# list_cat_org = get_list_cat_columns() 
# object_cols = [c for c in useful_features if c in list_cat_org]
object_cols = ['CURR_NUM', 'MP_IND', 'PAY_METHOD', 'HAS_TXN_LESS_15S']
num_cols = [c for c in useful_features if c not in object_cols]
# df_test = val[useful_features]

In [11]:
x_train = train[useful_features]
x_val = val[useful_features]
x_test = test[useful_features]
y_train = train["MERCH_FR"]
y_val = val["MERCH_FR"]
y_test = test["MERCH_FR"]

In [12]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, StandardScaler, OneHotEncoder
from imblearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler() # ('scaler', MinMaxScaler()    
    )])

# cat_label_cols = ['CST_PERF_ST', 'CST_MKT_SEG']
cat_ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_val')),
    ('encoder', OrdinalEncoder() 
    )])

# cat_onehot_cols = [f for f in cat_cols if f not in cat_label_cols]
cat_onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_val')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse=False) 
    )])

# over sampling 
oversample = SMOTE() # ADASYN()
preprocessor = ColumnTransformer(
    transformers=[
                  ('num', numeric_transformer, num_cols), 
                  #('cat_label', cat_ordinal_transformer, cat_label_cols)
                  #('cat_onehot', cat_onehot_transformer, ['CLOSE_RSN','CHIP_YN']) # ['CLOSE_RSN', 'COMM_FREQ']
                 ], remainder='drop')

In [13]:
# preprocessor.fit(train[useful_features])
preprocessor.fit(x_train)

x_train_tf = pd.DataFrame(preprocessor.transform(x_train), columns = get_feature_names(preprocessor)) 
x_val_tf = pd.DataFrame(preprocessor.transform(x_val), columns = get_feature_names(preprocessor)) 
x_test_tf = pd.DataFrame(preprocessor.transform(x_test), columns = get_feature_names(preprocessor)) 



In [14]:
def run(trial):
    param = {
        # 'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 4000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = XGBClassifier(**param)      
    
    pipeline = Pipeline(steps=[('sampling', oversample),
                      ('classifier', model)])
    
    pipeline.fit(x_train_tf, y_train, classifier__eval_set=[(x_val_tf,y_val)], classifier__early_stopping_rounds=100, classifier__verbose=False)
    preds_val = model.predict(x_val_tf)
    score = f1_score(y_val, preds_val)
    return score

In [15]:
study = optuna.create_study(direction="maximize")
study.optimize(run, n_trials=100)

[32m[I 2022-04-13 03:43:43,069][0m A new study created in memory with name: no-name-2b486453-a46c-466e-b525-92c6a0461663[0m
[32m[I 2022-04-13 03:43:43,842][0m Trial 0 finished with value: 0.7115384615384616 and parameters: {'lambda': 0.10980336375293036, 'alpha': 0.004078482441728063, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.009, 'max_depth': 17, 'random_state': 48, 'min_child_weight': 128}. Best is trial 0 with value: 0.7115384615384616.[0m
[32m[I 2022-04-13 03:43:44,249][0m Trial 1 finished with value: 0.8613861386138613 and parameters: {'lambda': 0.014533270884926888, 'alpha': 0.0038117975970672633, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 5, 'random_state': 48, 'min_child_weight': 204}. Best is trial 1 with value: 0.8613861386138613.[0m
[32m[I 2022-04-13 03:43:45,166][0m Trial 2 finished with value: 0.916256157635468 and parameters: {'lambda': 0.2981103287578025, 'alpha': 0.10391178269531394, 'colsample_bytree':

In [16]:
'''
Trial 30 finished with value: 0.964467005076142
{'max_depth': 10,
 'max_leaf_nodes': 5,
 'min_samples_split': 4,
 'n_estimators': 316}

 '''

study.best_params

{'alpha': 0.7881845480038765,
 'colsample_bytree': 1.0,
 'lambda': 0.0038818469818502105,
 'learning_rate': 0.02,
 'max_depth': 15,
 'min_child_weight': 1,
 'random_state': 2020,
 'subsample': 0.6}

In [17]:
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

Number of finished trials: 100
Best trial: {'lambda': 0.0038818469818502105, 'alpha': 0.7881845480038765, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 1}


In [21]:
best_params = study.best_params
model = XGBClassifier(
        #random_state=42,        
        **best_params
    )
pipeline = Pipeline(steps=[('sampling', oversample),
                    ('classifier', model)])

pipeline.fit(x_train_tf, y_train)
preds_test = model.predict(x_test_tf)
f1score = f1_score(y_test, preds_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       225
           1       0.96      0.96      0.96       113

    accuracy                           0.97       338
   macro avg       0.97      0.97      0.97       338
weighted avg       0.97      0.97      0.97       338



In [22]:
#plot_optimization_histor: shows the scores from all trials as well as the best score so far at each point.
# optuna.visualization.plot_optimization_history(study)

In [23]:
#plot_parallel_coordinate: interactively visualizes the hyperparameters and scores
# optuna.visualization.plot_parallel_coordinate(study)

In [24]:
'''plot_slice: shows the evolution of the search. You can see where in the hyperparameter space your search
went and which parts of the space were explored more.'''
# optuna.visualization.plot_slice(study)

'plot_slice: shows the evolution of the search. You can see where in the hyperparameter space your search\nwent and which parts of the space were explored more.'

In [25]:
#plot_contour: plots parameter interactions on an interactive chart. You can choose which hyperparameters you would like to explore.
# optuna.visualization.plot_contour(study, params=['alpha',
#                             #'max_depth',
#                             'lambda',
#                             'subsample',
#                             'learning_rate',
#                             'subsample'])

In [26]:
#Visualize parameter importances.
# optuna.visualization.plot_param_importances(study)

In [27]:
#Visualize empirical distribution function
# optuna.visualization.plot_edf(study)

# Pipepline 2, handle category value

In [28]:
# cat_label_cols = ['CST_PERF_ST', 'CST_MKT_SEG']
cat_ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('encoder', OrdinalEncoder() 
    )])

# cat_onehot_cols = [f for f in cat_cols if f not in cat_label_cols]
cat_onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_val')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse=False) 
    )])

preprocessor = ColumnTransformer(
    transformers=[
                  ('num', numeric_transformer, num_cols), 
                  ('cat_label', cat_ordinal_transformer, object_cols)
                  #('cat_onehot', cat_onehot_transformer, ['CLOSE_RSN','CHIP_YN']) # ['CLOSE_RSN', 'COMM_FREQ']
                 ], remainder='drop')

In [29]:
preprocessor.fit(x_train)

x_train_tf = pd.DataFrame(preprocessor.transform(x_train), columns = get_feature_names(preprocessor)) 
x_val_tf = pd.DataFrame(preprocessor.transform(x_val), columns = get_feature_names(preprocessor)) 
x_test_tf = pd.DataFrame(preprocessor.transform(x_test), columns = get_feature_names(preprocessor)) 



In [30]:
def run(trial):
    param = {
        # 'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 4000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = XGBClassifier(**param)      
    
    pipeline = Pipeline(steps=[('sampling', oversample),
                      ('classifier', model)])
    
    pipeline.fit(x_train_tf, y_train, classifier__eval_set=[(x_val_tf,y_val)], classifier__early_stopping_rounds=100, classifier__verbose=False)
    preds_val = model.predict(x_val_tf)
    score = f1_score(y_val, preds_val)
    return score

In [31]:
study = optuna.create_study(direction="maximize")
study.optimize(run, n_trials=100)

[32m[I 2022-04-13 04:19:10,937][0m A new study created in memory with name: no-name-e39f7f0f-41d4-4f62-a2b1-d108c15e0212[0m
[32m[I 2022-04-13 04:19:11,250][0m Trial 0 finished with value: 0.0 and parameters: {'lambda': 0.014627511306210766, 'alpha': 0.0019237498198748158, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.009, 'max_depth': 7, 'random_state': 24, 'min_child_weight': 198}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-04-13 04:19:11,650][0m Trial 1 finished with value: 0.0 and parameters: {'lambda': 2.084092446820115, 'alpha': 0.00864363837210922, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.018, 'max_depth': 13, 'random_state': 2020, 'min_child_weight': 298}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-04-13 04:19:13,983][0m Trial 2 finished with value: 0.9489795918367346 and parameters: {'lambda': 3.0323617707398065, 'alpha': 0.0336419107669075, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.016, 'max_depth': 

In [32]:
'''
0.9595959595959597 
{'alpha': 0.020230097237001404,
 'colsample_bytree': 0.3,
 'lambda': 0.276674464175535,
 'learning_rate': 0.008,
 'max_depth': 11,
 'min_child_weight': 6,
 'random_state': 48,
 'subsample': 0.8}

'''
study.best_params

{'alpha': 0.027965096689938844,
 'colsample_bytree': 0.3,
 'lambda': 0.14078870210197214,
 'learning_rate': 0.018,
 'max_depth': 11,
 'min_child_weight': 1,
 'random_state': 48,
 'subsample': 0.6}

In [35]:
best_params = study.best_params
model = XGBClassifier(        
        **best_params
    )
pipeline = Pipeline(steps=[('sampling', oversample),
                    ('classifier', model)])

pipeline.fit(x_train_tf, y_train)
preds_test = model.predict(x_test_tf)
f1score = f1_score(y_test, preds_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       225
           1       0.97      0.99      0.98       113

    accuracy                           0.99       338
   macro avg       0.98      0.99      0.98       338
weighted avg       0.99      0.99      0.99       338

