In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[?25l[K     |█                               | 10 kB 27.3 MB/s eta 0:00:01[K     |██▏                             | 20 kB 15.1 MB/s eta 0:00:01[K     |███▏                            | 30 kB 7.0 MB/s eta 0:00:01[K     |████▎                           | 40 kB 6.4 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 4.3 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 5.0 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 5.2 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 5.3 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 5.9 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 5.1 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 5.1 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 5.1 MB/s eta 0:00:01[K     |█████████████▉                  | 133 kB 5.1 MB/s eta 0:00:01[K  

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline 

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from collections import Counter
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.utils import resample
from xgboost import XGBClassifier

import optuna

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
train=pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_train.csv")
val=pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_val.csv")
test=pd.read_csv(r"/content/drive/MyDrive/BIDV/CARD_FRAUD/data/trainning/funix_test.csv")

In [5]:
def reduce_memory_usage(df):   
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                    
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [6]:

def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    
    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [7]:
train = reduce_memory_usage(train)
val = reduce_memory_usage(val)
test = reduce_memory_usage(test)

Memory usage of dataframe is 1.166229248046875 MB
Memory usage of dataframe after reduction 0.3381690979003906 MB
Reduced by 71.00320554755986 % 
Memory usage of dataframe is 0.2066650390625 MB
Memory usage of dataframe after reduction 0.058624267578125 MB
Reduced by 71.63319551092735 % 
Memory usage of dataframe is 0.242523193359375 MB
Memory usage of dataframe after reduction 0.06910324096679688 MB
Reduced by 71.50654334969171 % 


In [8]:
train.head()

Unnamed: 0,ID,MER_TYPE,BRANCH,CURR_NUM,DAY_REINSTATED,DAYS_AVGE,FL_LIM1,FL_LIM3,FL_LIM4,GROSS_SALE,...,DAY_CLOSE_YEAR,DAY_CLOSE_MONTH,DAY_CLOSE_DAY,DAY_START_YEAR,DAY_START_MONTH,DAY_START_DAY,DAY_SUBM_YEAR,DAY_SUBM_MONTH,DAY_SUBM_DAY,MERCH_FR
0,2208,5411,126,704,0,1,150,0,5000,671981120.0,...,2099,12,31,1970,1,1,1970,1,1,1
1,53,7011,753,704,0,2,150,0,0,0.0,...,2016,11,14,1970,1,1,1970,1,1,0
2,52,5411,128,704,0,2,150,0,5000,204670160.0,...,2099,12,31,1970,1,1,1970,1,1,0
3,163,5977,741,704,0,3,150,0,0,0.0,...,2016,10,3,1970,1,1,1970,1,1,0
4,657,5411,721,704,0,1,150,0,5000,0.0,...,2020,7,24,1970,1,1,1970,1,1,0


In [9]:
useful_features = [c for c in train.columns if ((c not in ['ID', 'MERCH_FR', 'BRANCH']) and ("DAY_" not in c))] 
# list_cat_org = get_list_cat_columns() 
# object_cols = [c for c in useful_features if c in list_cat_org]
object_cols = ['CURR_NUM', 'MP_IND', 'PAY_METHOD', 'HAS_TXN_LESS_15S']
num_cols = [c for c in useful_features if c not in object_cols]
# df_test = val[useful_features]

In [10]:
x_train = train[useful_features]
x_val = val[useful_features]
x_test = test[useful_features]
y_train = train["MERCH_FR"]
y_val = val["MERCH_FR"]
y_test = test["MERCH_FR"]

In [11]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, StandardScaler, OneHotEncoder
from imblearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler() # ('scaler', MinMaxScaler()    
    )])

# cat_label_cols = ['CST_PERF_ST', 'CST_MKT_SEG']
cat_ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_val')),
    ('encoder', OrdinalEncoder() 
    )])

# cat_onehot_cols = [f for f in cat_cols if f not in cat_label_cols]
cat_onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_val')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse=False) 
    )])

# over sampling 
oversample = SMOTE() # ADASYN()
preprocessor = ColumnTransformer(
    transformers=[
                  ('num', numeric_transformer, num_cols), 
                  #('cat_label', cat_ordinal_transformer, cat_label_cols)
                  #('cat_onehot', cat_onehot_transformer, ['CLOSE_RSN','CHIP_YN']) # ['CLOSE_RSN', 'COMM_FREQ']
                 ], remainder='drop')

In [12]:
# preprocessor.fit(train[useful_features])
preprocessor.fit(x_train)

x_train_tf = pd.DataFrame(preprocessor.transform(x_train), columns = get_feature_names(preprocessor)) 
x_val_tf = pd.DataFrame(preprocessor.transform(x_val), columns = get_feature_names(preprocessor)) 
x_test_tf = pd.DataFrame(preprocessor.transform(x_test), columns = get_feature_names(preprocessor)) 



In [13]:
# get a list of base models
from numpy import hstack
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

def get_models():
	models = list()
	models.append(('svc', SVC(C=1000, gamma= 0.01, kernel='rbf')))
	models.append(('XGBClassifier', XGBClassifier(alpha=0.004016238677167677, colsample_bytree=0.3, subsample=0.6, learning_rate=0.02, max_depth=9, random_state=48, min_child_weight=2, reg_lambda=4.248186489909531)))
	models.append(('RandomForestClassifier', RandomForestClassifier(max_depth=9,max_leaf_nodes=8,min_samples_split=7, n_estimators=175)))
	models.append(('lr', LogisticRegression()))
	models.append(('bayes', GaussianNB()))
	return models
 
# fit the blending ensemble
def fit_ensemble(models, X_train, X_val, y_train, y_val):
	# fit all models on the training set and predict on hold out set
	meta_X = list()
	for name, model in models:
		# fit in training set
		model.fit(X_train, y_train)
		# predict on hold out set
		yhat = model.predict(X_val)
		# reshape predictions into a matrix with one column
		yhat = yhat.reshape(len(yhat), 1)
		# store predictions as input for blending
		meta_X.append(yhat)
	# create 2d array from predictions, each set is an input feature
	meta_X = hstack(meta_X)
	# define blending model
	blender = LogisticRegression()
	# fit on predictions from base models
	blender.fit(meta_X, y_val)
	return blender
 
# make a prediction with the blending ensemble
def predict_ensemble(models, blender, X_test):
	# make predictions with base models
	meta_X = list()
	for name, model in models:
		# predict with base model
		yhat = model.predict(X_test)
		# reshape predictions into a matrix with one column
		yhat = yhat.reshape(len(yhat), 1)
		# store prediction
		meta_X.append(yhat)
	# create 2d array from predictions, each set is an input feature
	meta_X = hstack(meta_X)
	# predict
	return blender.predict(meta_X)

In [14]:
# create the base models
models = get_models()
# train the blending ensemble
blender = fit_ensemble(models, x_train_tf, x_val_tf, y_train, y_val)
# make predictions on test set
yhat = predict_ensemble(models, blender, x_test_tf)
# evaluate predictions
score = accuracy_score(y_test, yhat)

print('Blending Accuracy: %.3f' % (score*100))

Blending Accuracy: 98.521


In [15]:
f1_score = f1_score(y_test, yhat)

print('Blending Accuracy: %.3f' % (f1_score*100))

Blending Accuracy: 97.797


In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       225
           1       0.97      0.98      0.98       113

    accuracy                           0.99       338
   macro avg       0.98      0.98      0.98       338
weighted avg       0.99      0.99      0.99       338

