## General information

In this kernel I work with IEEE Fraud Detection competition.

EEE-CIS works across a variety of AI and machine learning areas, including deep neural networks, fuzzy systems, evolutionary computation, and swarm intelligence. Today they’re partnering with the world’s leading payment service company, Vesta Corporation, seeking the best solutions for fraud prevention industry, and now you are invited to join the challenge.

We have a binary classification problem with a heavy imbalance which is an inherent property of such problems.
At first I'll explore the data and try to find valuable insights, maybe I'll do some feature engineering and then it wil be time to build models.

![](https://cis.ieee.org/images/files/slideshow/abstract01.jpg)

*Work in progress*

In [1]:
import sys,os
__script_path=os.path.abspath(globals().get('__file__','.'))
__script_dir = os.path.dirname(__script_path)
root_dir = os.path.abspath(f'{__script_dir}/../..')
print(root_dir)
for lib in [root_dir][::-1]:
    if lib in sys.path:
        sys.path.remove(lib)
    sys.path.insert(0,lib)


c:\Users\Admin\Data\ads_fraud_detection


In [2]:
from config.config import *
from libs.common import *
init_notebook_mode(connected=True)
print(tf.__version__) # requires version >= 1.9.0

c:/Users/Admin/Data/ads_fraud_detection


2.13.0


In [3]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# import eli5
import shap
from IPython.display import HTML
import json
# import altair as alt

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# alt.renderers.enable('notebook')

%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [4]:
import os
import time
import datetime
import json
import gc
from numba import jit

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn import metrics

from itertools import product

# import altair as alt
# from altair.vega import v5
from IPython.display import HTML

# using ideas from this kernel: https://www.kaggle.com/notslush/altair-visualization-2018-stackoverflow-survey
def prepare_altair():
    """
    Helper function to prepare altair for working.
    """

    vega_url = 'https://cdn.jsdelivr.net/npm/vega@' + v5.SCHEMA_VERSION
    vega_lib_url = 'https://cdn.jsdelivr.net/npm/vega-lib'
    vega_lite_url = 'https://cdn.jsdelivr.net/npm/vega-lite@' + alt.SCHEMA_VERSION
    vega_embed_url = 'https://cdn.jsdelivr.net/npm/vega-embed@3'
    noext = "?noext"
    
    paths = {
        'vega': vega_url + noext,
        'vega-lib': vega_lib_url + noext,
        'vega-lite': vega_lite_url + noext,
        'vega-embed': vega_embed_url + noext
    }
    
    workaround = f"""    requirejs.config({{
        baseUrl: 'https://cdn.jsdelivr.net/npm/',
        paths: {paths}
    }});
    """
    
    return workaround
    

def add_autoincrement(render_func):
    # Keep track of unique <div/> IDs
    cache = {}
    def wrapped(chart, id="vega-chart", autoincrement=True):
        if autoincrement:
            if id in cache:
                counter = 1 + cache[id]
                cache[id] = counter
            else:
                cache[id] = 0
            actual_id = id if cache[id] == 0 else id + '-' + str(cache[id])
        else:
            if id not in cache:
                cache[id] = 0
            actual_id = id
        return render_func(chart, id=actual_id)
    # Cache will stay outside and 
    return wrapped
           

@add_autoincrement
def render(chart, id="vega-chart"):
    """
    Helper function to plot altair visualizations.
    """
    chart_str = """
    <div id="{id}"></div><script>
    require(["vega-embed"], function(vg_embed) {{
        const spec = {chart};     
        vg_embed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
        console.log("anything?");
    }});
    console.log("really...anything?");
    </script>
    """
    return HTML(
        chart_str.format(
            id=id,
            chart=json.dumps(chart) if isinstance(chart, dict) else chart.to_json(indent=None)
        )
    )
    

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                c_prec = df[col].apply(lambda x: np.finfo(x).precision).max()
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max and c_prec == np.finfo(np.float32).precision:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
    

@jit
def fast_auc(y_true, y_prob):
    """
    fast roc_auc computation: https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        nfalse += (1 - y_i)
        auc += y_i * nfalse
    auc /= (nfalse * (n - nfalse))
    return auc


def eval_auc(y_true, y_pred):
    """
    Fast auc eval function for lgb.
    """
    return 'auc', fast_auc(y_true, y_pred), True


def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Fast metric computation for this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric
    """
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()
    

def train_model_regression(X, X_test, y, params, folds=None, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000, splits=None, n_folds=3):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    splits = folds.split(X) if splits is None else splits
    n_splits = folds.n_splits if splits is None else n_folds
    
    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                        'catboost_metric_name': 'MSE',
                        'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    
    result_dict = {}
    
    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    
    # averaged predictions on train data
    prediction = np.zeros(len(X_test))
    
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(splits):
        if verbose:
            print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = n_estimators, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            
            y_pred = model.predict(X_test).reshape(-1,)
        
        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000,  eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred    
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_splits
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
        
    return result_dict
    


def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000, splits=None, n_folds=3, averaging='usual', n_jobs=-1):
    """
    A function to train a variety of classification models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    
    """
    columns = X.columns if columns is None else columns
    n_splits = folds.n_splits if splits is None else n_folds
    X_test = X_test[columns]
    
    # to set up scoring parameters
    metrics_dict = {'auc': {'lgb_metric_name': eval_auc,
                        'catboost_metric_name': 'AUC',
                        'sklearn_scoring_function': metrics.roc_auc_score},
                    }
    
    result_dict = {}
    if averaging == 'usual':
        # out-of-fold predictions on train data
        oof = np.zeros((len(X), 1))

        # averaged predictions on train data
        prediction = np.zeros((len(X_test), 1))
        
    elif averaging == 'rank':
        # out-of-fold predictions on train data
        oof = np.zeros((len(X), 1))

        # averaged predictions on train data
        prediction = np.zeros((len(X_test), 1))

    
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs = n_jobs)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            
            y_pred = model.predict_proba(X_test)
        
        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators, eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=Logloss)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        if averaging == 'usual':
            
            oof[valid_index] = y_pred_valid.reshape(-1, 1)
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
            
            prediction += y_pred.reshape(-1, 1)

        elif averaging == 'rank':
                                  
            oof[valid_index] = y_pred_valid.reshape(-1, 1)
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
                                  
            prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)        
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_splits
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
            result_dict['top_columns'] = cols
        
    return result_dict

# setting up altair
# workaround = prepare_altair()
# HTML("".join((
#     "<script>",
#     workaround,
#     "</script>",
# )))

In [5]:
save_dir=f"{exps_dir}/exp3"
if os.path.exists(save_dir) == False: 
  os.makedirs(save_dir, exist_ok=True)



In [6]:
from fdb.datasets import FraudDatasetBenchmark

# all_keys = ['fakejob', 'vehicleloan', 'malurl', 'ieeecis', 'ccfraud', 'fraudecom', 'twitterbot', 'ipblock'] 
key = 'ieeecis'

obj = FraudDatasetBenchmark(
    key=key,
    load_pre_downloaded=False,  # default
    delete_downloaded=True,  # default
    add_random_values_if_real_na = { 
        "EVENT_TIMESTAMP": True, 
        "LABEL_TIMESTAMP": True,
        "ENTITY_ID": True,
        "ENTITY_TYPE": True,
        "ENTITY_ID": True,
        "EVENT_ID": True
        } # default
    )
print(obj.key)

print('Train set: ')
display(obj.train.head())
print(len(obj.train.columns))
print(obj.train.shape)

print('Test set: ')
display(obj.test.head())
print(obj.test.shape)

print('Test scores')
display(obj.test_labels.head())
print(obj.test_labels['EVENT_LABEL'].value_counts())
print(obj.train['EVENT_LABEL'].value_counts(normalize=True))
print('=========')


Data download location c:\Users\Admin\Data\ads_fraud_detection\src\prj3\1.EDA\tmp
ieeecis
Train set: 


Unnamed: 0_level_0,EVENT_LABEL,transactionamt,productcd,card1,card2,card3,card5,card6,addr1,dist1,p_emaildomain,r_emaildomain,c1,c2,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,v62,v70,v76,v78,v82,v91,v127,v130,v139,v160,v165,v187,v203,v207,v209,v210,v221,v234,v257,v258,v261,v264,v266,v267,v271,v274,v277,v283,v285,v289,v291,v294,id_01,id_02,id_05,id_06,id_09,id_13,id_17,id_19,id_20,devicetype,deviceinfo,EVENT_ID,ENTITY_ID,EVENT_TIMESTAMP,LABEL_TIMESTAMP,ENTITY_TYPE
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
2987000.0,0,68.5,W,13926.0,,150.0,142.0,credit,315.0,19.0,,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,117.0,0.0,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,1.0,1.0,,,,,,,,,,,,128628aa-5f6c-428f-b2b1-2f72fe65e93d,13926.0_315.0_-13.0,2021-01-02T00:00:00Z,2025-02-28T16:44:34Z,user
2987001.0,0,29.0,W,2755.0,404.0,150.0,102.0,credit,325.0,,gmail.com,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,1.0,0.0,,,,,,,,,,,,9dfd05d3-3c80-4d4d-95d4-02dfa8b89666,2755.0_325.0_1.0,2021-01-02T00:00:01Z,2025-02-28T16:44:34Z,user
2987002.0,0,59.0,W,4663.0,490.0,150.0,166.0,debit,330.0,287.0,outlook.com,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,1.0,0.0,,,,,,,,,,,,2c596841-44ee-47b6-88af-39e3af7d1906,4663.0_330.0_1.0,2021-01-02T00:01:09Z,2025-02-28T16:44:34Z,user
2987003.0,0,50.0,W,18132.0,567.0,150.0,117.0,debit,476.0,,yahoo.com,,2.0,5.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1758.0,354.0,,,,,,,,,,,,,,,,,,,,0.0,10.0,0.0,1.0,38.0,,,,,,,,,,,,668b14fb-5649-459f-aa80-062cc9327d4f,18132.0_476.0_-111.0,2021-01-02T00:01:39Z,2025-02-28T16:44:34Z,user
2987004.0,0,50.0,H,4497.0,514.0,150.0,102.0,credit,420.0,,gmail.com,,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,,,,,,,0.0,0.0,0.0,169690.796875,5155.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,70787.0,,,,,166.0,542.0,144.0,mobile,SAMSUNG SM-G892A Build/NRD90M,4f620483-6e1a-4496-8e82-a472ac538bf4,4497.0_420.0_1.0,2021-01-02T00:01:46Z,2025-02-28T16:44:34Z,user


73
(561013, 73)
Test set: 


Unnamed: 0_level_0,transactionamt,productcd,card1,card2,card3,card5,card6,addr1,dist1,p_emaildomain,r_emaildomain,c1,c2,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,v62,v70,v76,v78,v82,v91,v127,v130,v139,v160,v165,v187,v203,v207,v209,v210,v221,v234,v257,v258,v261,v264,v266,v267,v271,v274,v277,v283,v285,v289,v291,v294,id_01,id_02,id_05,id_06,id_09,id_13,id_17,id_19,id_20,devicetype,deviceinfo,EVENT_ID,ENTITY_ID,EVENT_TIMESTAMP,ENTITY_TYPE
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1
3548013.0,125.0,S,15775.0,481.0,150.0,102.0,credit,330.0,,,yahoo.com,5.0,3.0,3.0,0.0,0.0,0.0,8.0,0.0,3.0,5.0,0.0,61.0,5.0,0.0,0.0,,,,,109411.0,2301.0,0.0,2401.0,66104.0,1.0,103183.0,877.0,1961.0,465.0,0.0,73.0,,,,,,,0.0,,,1.0,26.0,1.0,2.0,926.0,-10.0,1411.0,6.0,0.0,0.0,52.0,166.0,633.0,533.0,desktop,Windows,59f9c629-1b64-4bfe-b092-b6df63e82d38,15775.0_330.0_129.0,2021-06-21T23:11:15Z,user
3548014.0,125.0,S,15775.0,481.0,150.0,102.0,credit,330.0,,,yahoo.com,5.0,3.0,3.0,0.0,0.0,0.0,8.0,0.0,3.0,5.0,0.0,61.0,5.0,0.0,0.0,,,,,109536.0,2301.0,0.0,2401.0,66229.0,1.0,103308.0,877.0,1961.0,465.0,0.0,73.0,,,,,,,0.0,,,1.0,26.0,1.0,2.0,927.0,-10.0,693.0,6.0,0.0,0.0,52.0,166.0,633.0,533.0,desktop,Windows,047cecff-0205-4306-b615-5fa9c110dc72,15775.0_330.0_129.0,2021-06-21T23:11:29Z,user
3548015.0,125.0,S,15775.0,481.0,150.0,102.0,credit,330.0,,,yahoo.com,5.0,3.0,3.0,0.0,0.0,0.0,8.0,0.0,3.0,5.0,0.0,61.0,5.0,0.0,0.0,,,,,109661.0,2301.0,0.0,2401.0,66354.0,1.0,103433.0,877.0,1961.0,465.0,0.0,73.0,,,,,,,0.0,,,1.0,26.0,1.0,2.0,928.0,-10.0,1116.0,6.0,0.0,0.0,52.0,166.0,633.0,533.0,desktop,Windows,03cb5cb8-a560-47d1-b1dc-914062736c1b,15775.0_330.0_129.0,2021-06-21T23:11:45Z,user
3548016.0,125.0,S,15775.0,481.0,150.0,102.0,credit,330.0,,,yahoo.com,5.0,3.0,3.0,0.0,0.0,0.0,8.0,0.0,3.0,5.0,0.0,61.0,5.0,0.0,0.0,,,,,109786.0,2301.0,0.0,2401.0,66479.0,1.0,103558.0,877.0,1961.0,465.0,0.0,73.0,,,,,,,0.0,,,1.0,26.0,1.0,2.0,929.0,-10.0,1589.0,6.0,0.0,0.0,52.0,166.0,633.0,533.0,desktop,Windows,c5d38cd3-86ed-46b5-b47b-cc6b13473de6,15775.0_330.0_129.0,2021-06-21T23:12:00Z,user
3548017.0,31.950000762939453,W,9500.0,321.0,150.0,226.0,debit,204.0,74.0,,,3.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,6.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,27.950000762939453,27.950000762939453,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,444899cf-57be-4d5f-9c33-285054830ad2,9500.0_204.0_150.0,2021-06-21T23:12:11Z,user


(29527, 71)
Test scores


Unnamed: 0_level_0,EVENT_LABEL,EVENT_ID
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1
3548013.0,0,59f9c629-1b64-4bfe-b092-b6df63e82d38
3548014.0,0,047cecff-0205-4306-b615-5fa9c110dc72
3548015.0,0,03cb5cb8-a560-47d1-b1dc-914062736c1b
3548016.0,0,c5d38cd3-86ed-46b5-b47b-cc6b13473de6
3548017.0,0,444899cf-57be-4d5f-9c33-285054830ad2


EVENT_LABEL
0    28358
1     1169
Name: count, dtype: int64
EVENT_LABEL
0    0.965252142107224
1    0.034747857892776
Name: proportion, dtype: float64


In [7]:
# train_identity = pd.read_csv(f'{data_dir}/ieee-fraud-detection/train_identity.csv')
# train_transaction = pd.read_csv(f'{data_dir}/ieee-fraud-detection/train_transaction.csv')
# test_identity = pd.read_csv(f'{data_dir}/ieee-fraud-detection/test_identity.csv')
# test_transaction = pd.read_csv(f'{data_dir}/ieee-fraud-detection/test_transaction.csv')
# sub = pd.read_csv(f'{data_dir}/ieee-fraud-detection/sample_submission.csv')
# # let's combine the data and work with the whole dataset
# train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
# test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [8]:
test = obj.test
test = pd.concat([test, obj.test_labels], axis=1)
train = obj.train
train.drop(columns=['LABEL_TIMESTAMP'], inplace=True) 
print(train.shape, test.shape)
  


(561013, 72) (29527, 73)


In [9]:
train_cols = set(train.columns)
test_cols = set(test.columns)

# Cột có trong train nhưng không có trong test
missing_in_test = train_cols - test_cols

# Cột có trong test nhưng không có trong train
missing_in_train = test_cols - train_cols

# In kết quả
if missing_in_test:
    print("Cột có trong train nhưng thiếu trong test:", missing_in_test)
else:
    print("Không có cột nào bị thiếu trong test.")

if missing_in_train:
    print("Cột có trong test nhưng thiếu trong train:", missing_in_train)
else:
    print("Không có cột nào bị thiếu trong train.")


Không có cột nào bị thiếu trong test.
Không có cột nào bị thiếu trong train.


In [10]:
print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test dataset has {test.shape[0]} rows and {test.shape[1]} columns.')

Train dataset has 561013 rows and 72 columns.
Test dataset has 29527 rows and 73 columns.


In [11]:
print(f'There are {train.isnull().any().sum()} columns in train dataset with missing values.')

There are 51 columns in train dataset with missing values.


In [12]:
# train.columns = [col.upper() for col in train.columns]
# test.columns = [col.upper() for col in test.columns]

# train.rename(columns={"EVENT_LABEL": "LOAN_DEFAULT"}, inplace=True)
# test.rename(columns={"EVENT_LABEL": "LOAN_DEFAULT"}, inplace=True)
# # train.rename(columns={"ENTITY_ID": "UNIQUEID"}, inplace=True)
# # test.rename(columns={"ENTITY_ID": "UNIQUEID"}, inplace=True)

# print(train.columns)
# print(test.columns)


In [13]:
# import pandas as pd

# # Thay NaN bằng chuỗi trống để tránh lỗi khi ép kiểu string
# one_value_cols_train = [col for col in train.columns if train[col].fillna("").astype(str).nunique() <= 1]
# one_value_cols_test = [col for col in test.columns if test[col].fillna("").astype(str).nunique() <= 1]

# # So sánh hai danh sách
# print(set(one_value_cols_train) == set(one_value_cols_test))


In [14]:
# print(f'There are {len(one_value_cols)} columns in train dataset with one unique value.')
# print(f'There are {len(one_value_cols_test)} columns in test dataset with one unique value.')

In [15]:
# plt.hist(train['id_01'], bins=77);
# plt.title('Distribution of id_01 variable');

In [16]:
# train['id_03'].value_counts(dropna=False, normalize=True).head()

In [17]:
train.columns

Index(['EVENT_LABEL', 'transactionamt', 'productcd', 'card1', 'card2', 'card3', 'card5', 'card6', 'addr1', 'dist1', 'p_emaildomain', 'r_emaildomain', 'c1', 'c2', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'v62', 'v70', 'v76', 'v78', 'v82', 'v91', 'v127', 'v130', 'v139', 'v160', 'v165', 'v187', 'v203', 'v207', 'v209', 'v210', 'v221', 'v234', 'v257', 'v258', 'v261', 'v264', 'v266', 'v267', 'v271', 'v274', 'v277', 'v283', 'v285', 'v289', 'v291', 'v294', 'id_01', 'id_02', 'id_05', 'id_06', 'id_09', 'id_13', 'id_17', 'id_19', 'id_20', 'devicetype', 'deviceinfo', 'EVENT_ID', 'ENTITY_ID', 'EVENT_TIMESTAMP', 'ENTITY_TYPE'], dtype='object')

In [18]:
train['transactionamt_to_mean_card1'] = train['transactionamt'] / train.groupby(['card1'])['transactionamt'].transform('mean')
train['transactionamt_to_mean_card2'] = train['transactionamt'] / train.groupby(['card2'])['transactionamt'].transform('mean')
train['transactionamt_to_std_card1'] = train['transactionamt'] / train.groupby(['card1'])['transactionamt'].transform('std')
train['transactionamt_to_mean_card3'] = train['transactionamt'] / train.groupby(['card3'])['transactionamt'].transform('mean')
train['transactionamt_to_std_card3'] = train['transactionamt'] / train.groupby(['card3'])['transactionamt'].transform('std')
train['transactionamt_to_mean_card5'] = train['transactionamt'] / train.groupby(['card5'])['transactionamt'].transform('mean')
train['transactionamt_to_std_card5'] = train['transactionamt'] / train.groupby(['card5'])['transactionamt'].transform('std')
train['transactionamt_to_mean_card6'] = train['transactionamt'] / train.groupby(['card6'])['transactionamt'].transform('mean')
train['transactionamt_to_std_card6'] = train['transactionamt'] / train.groupby(['card6'])['transactionamt'].transform('std')
train['transactionamt_to_std_card2'] = train['transactionamt'] / train.groupby(['card2'])['transactionamt'].transform('std')

test['transactionamt_to_mean_card1'] = test['transactionamt'] / test.groupby(['card1'])['transactionamt'].transform('mean')
test['transactionamt_to_mean_card2'] = test['transactionamt'] / test.groupby(['card2'])['transactionamt'].transform('mean')
test['transactionamt_to_std_card1'] = test['transactionamt'] / test.groupby(['card1'])['transactionamt'].transform('std')
test['transactionamt_to_mean_card3'] = test['transactionamt'] / test.groupby(['card3'])['transactionamt'].transform('mean')
test['transactionamt_to_std_card3'] = test['transactionamt'] / test.groupby(['card3'])['transactionamt'].transform('std')
test['transactionamt_to_mean_card5'] = test['transactionamt'] / test.groupby(['card5'])['transactionamt'].transform('mean')
test['transactionamt_to_std_card5'] = test['transactionamt'] / test.groupby(['card5'])['transactionamt'].transform('std')
test['transactionamt_to_mean_card6'] = test['transactionamt'] / test.groupby(['card6'])['transactionamt'].transform('mean')
test['transactionamt_to_std_card6'] = test['transactionamt'] / test.groupby(['card6'])['transactionamt'].transform('std')
test['transactionamt_to_std_card2'] = test['transactionamt'] / test.groupby(['card2'])['transactionamt'].transform('std')


train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_mean_card2'] = train['id_02'] / train.groupby(['card2'])['id_02'].transform('mean')
train['id_02_to_std_card2'] = train['id_02'] / train.groupby(['card2'])['id_02'].transform('std')
train['id_02_to_mean_card3'] = train['id_02'] / train.groupby(['card3'])['id_02'].transform('mean')
train['id_02_to_std_card3'] = train['id_02'] / train.groupby(['card3'])['id_02'].transform('std')
train['id_02_to_mean_card5'] = train['id_02'] / train.groupby(['card5'])['id_02'].transform('mean')
train['id_02_to_std_card5'] = train['id_02'] / train.groupby(['card5'])['id_02'].transform('std')
train['id_02_to_mean_card6'] = train['id_02'] / train.groupby(['card6'])['id_02'].transform('mean')
train['id_02_to_std_card6'] = train['id_02'] / train.groupby(['card6'])['id_02'].transform('std')


test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
test['id_02_to_mean_card2'] = test['id_02'] / test.groupby(['card2'])['id_02'].transform('mean')
test['id_02_to_std_card2'] = test['id_02'] / test.groupby(['card2'])['id_02'].transform('std')
test['id_02_to_mean_card3'] = test['id_02'] / test.groupby(['card3'])['id_02'].transform('mean')
test['id_02_to_std_card3'] = test['id_02'] / test.groupby(['card3'])['id_02'].transform('std')
test['id_02_to_mean_card5'] = test['id_02'] / test.groupby(['card5'])['id_02'].transform('mean')
test['id_02_to_std_card5'] = test['id_02'] / test.groupby(['card5'])['id_02'].transform('std')
test['id_02_to_mean_card6'] = test['id_02'] / test.groupby(['card6'])['id_02'].transform('mean')
test['id_02_to_std_card6'] = test['id_02'] / test.groupby(['card6'])['id_02'].transform('std')





In [19]:
train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['p_emaildomain'].str.split('.', expand=True)
train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['r_emaildomain'].str.split('.', expand=True)
test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['p_emaildomain'].str.split('.', expand=True)
test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['r_emaildomain'].str.split('.', expand=True)

In [20]:
train.to_csv(f'{save_dir}/train_EDA.csv', index=None)
test.to_csv(f'{save_dir}/test_EDA.csv', index=None)

