In [1]:
import pandas as pd
import numpy as np
import src.utils as utils

from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

# Load Config File

In [50]:
# Load the project configuration once; the cells below read file paths and
# the random seed from this mapping.
CONFIG_DATA = utils.config_load()
# Echo the loaded settings as the cell output.
CONFIG_DATA

{'raw_dataset_path': 'data/raw/data.csv',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_column': 'Class',
 'seed': 42,
 'test_size': 0.2,
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'train_clean_path': ['data/output/X_train_clean.pkl',
  'data/output/y_train_clean.pkl'],
 'valid_clean_path': ['data/output/X_valid_clean.pkl',
  'data/output/y_valid_clean.pkl'],
 'test_clean_path': ['data/output/X_test_clean.pkl',
  'data/output/y_test_clean.pkl']}

# Plan Preprocessing

**Summary EDA**:
- No missing values
- Features are uncorrelated
- The `Time` and `Amount` features need scaling

# Load Dataset

In [51]:
def load_dataset(return_file=True):
    """Load the pickled train, valid and test splits and print their shapes.

    File locations are read from CONFIG_DATA['<split>_set_path'], where
    element 0 is the feature file and element 1 is the target file.

    Parameters
    ----------
    return_file : bool, default True
        When True the six loaded objects are returned; otherwise the
        function only prints the shapes and returns None.

    Returns
    -------
    tuple or None
        (X_train, X_valid, X_test, y_train, y_valid, y_test) when
        return_file is True.
    """
    # Read every split in a fixed order: features first, then target.
    loaded = {}
    for split in ('train', 'valid', 'test'):
        for position, prefix in enumerate(('X', 'y')):
            path = CONFIG_DATA[f'{split}_set_path'][position]
            loaded[f'{prefix}_{split}'] = utils.pickle_load(path)

    # Shape report; the labels are padded so the colons line up.
    report = (
        ("X_train shape :", 'X_train'),
        ("y_train shape :", 'y_train'),
        ("X_valid shape :", 'X_valid'),
        ("y_valid shape :", 'y_valid'),
        ("X_test shape  :", 'X_test'),
        ("y_test shape  :", 'y_test'),
    )
    for label, key in report:
        print(label, loaded[key].shape)

    if not return_file:
        return None

    # Callers unpack all X objects first, then all y objects.
    order = ('X_train', 'X_valid', 'X_test', 'y_train', 'y_valid', 'y_test')
    return tuple(loaded[key] for key in order)

In [52]:
# Load all three splits from disk; load_dataset also prints each shape.
X_train, X_valid, X_test, y_train, y_valid, y_test = load_dataset()

X_train shape : (182276, 30)
y_train shape : (182276,)
X_valid shape : (45569, 30)
y_valid shape : (45569,)
X_test shape  : (56962, 30)
y_test shape  : (56962,)


# Preprocess Train

Scale the data

In [53]:
def fit_standardize(data, return_file=True, columns=['Time', 'Amount']):
    """Fit a RobustScaler on the selected columns and pickle it.

    Parameters
    ----------
    data : DataFrame-like
        Object indexed with `data[columns]` to obtain the values to fit on.
    return_file : bool, default True
        When True the fitted scaler is returned; otherwise None is returned.
    columns : list of str, default ['Time', 'Amount']
        Columns the scaler is fitted on.

    Returns
    -------
    RobustScaler or None
        The fitted scaler when return_file is True.

    Side effects
    ------------
    The fitted scaler is written to CONFIG_DATA['standardizer_path'].
    """
    # fit() hands back the estimator itself, so creation and fitting chain.
    scaler = RobustScaler().fit(data[columns])

    # Persist the fitted scaler for reuse outside this session.
    utils.pickle_dump(scaler, CONFIG_DATA['standardizer_path'])

    return scaler if return_file else None

In [54]:
# Fit the scaler on the training features only; fit_standardize also pickles
# it to CONFIG_DATA['standardizer_path'].
standardizer = fit_standardize(data=X_train)

In [55]:
def transform_standardize(data, standardizer, columns=['Time', 'Amount']):
    """Return a copy of `data` with the selected columns scaled.

    The input frame is left untouched, so calling this function (or re-running
    the notebook cell that calls it) repeatedly on the same frame always
    yields the same result instead of scaling already-scaled values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named in `columns`.
    standardizer : object
        Fitted transformer exposing `transform(frame)` and returning one
        output column per input column, in the same order.
    columns : list of str, default ['Time', 'Amount']
        Columns to replace with their scaled values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column order as `data`; only
        `columns` differ from the input.
    """
    columns = list(columns)

    # Work on a copy: writing into `data` would silently change the caller's
    # frame and make the call non-idempotent.
    data_standard = data.copy()

    # Assign the raw values positionally. Going through an intermediate
    # DataFrame would align on the index, which breaks on duplicate labels.
    data_standard[columns] = np.asarray(standardizer.transform(data[columns]))

    return data_standard


In [56]:
# Scale the default columns ('Time' and 'Amount') of the training features
# with the scaler fitted above.
X_train_std = transform_standardize(data = X_train,
                                    standardizer = standardizer)

In [57]:
X_train_std

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
272615,0.945197,-3.017333,3.215950,-2.844590,-1.341856,-0.491730,-1.776197,0.071360,1.470371,0.070087,...,0.338884,0.258544,0.843397,0.086503,0.026228,0.082502,0.112516,0.618475,0.454782,-0.298344
191231,0.522245,2.099809,-0.890100,-2.817319,-1.208673,0.845043,0.138699,0.064751,-0.009282,-0.974776,...,-0.196549,0.567082,1.621804,-0.242970,-1.581675,0.522156,0.415164,-0.070427,-0.118390,0.366002
53595,-0.453943,0.812352,-0.586909,-0.667514,0.962864,0.401248,0.721682,0.343162,0.121239,0.059051,...,0.259607,0.084476,-0.144437,-0.487009,-1.312940,0.806243,-0.202556,-0.027304,0.024049,2.934065
193549,0.534096,-2.621263,-4.439432,-2.595440,-1.117193,2.489633,-2.625322,1.207772,-0.457577,-1.353822,...,2.078601,1.265437,1.872310,1.862566,0.615607,-0.594514,-0.172620,0.199639,0.485647,7.963850
207723,0.611959,2.227359,-1.572316,-0.371772,-1.578679,-1.593467,-0.157863,-1.598295,0.004422,-1.074622,...,-0.362887,0.089499,0.852866,0.098889,-0.305689,-0.178513,0.009100,0.045008,-0.053145,-0.028132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131478,-0.059815,-0.877913,-0.301831,2.735473,1.253404,-1.145942,0.068037,-0.076945,0.179777,0.601691,...,0.478435,0.264666,0.931641,0.143916,1.033013,-0.117342,0.753320,0.104314,0.152018,1.568661
239193,0.766897,-1.662279,-0.278422,2.677875,1.479724,-0.641821,0.421010,-1.162016,0.946243,0.705521,...,0.190299,0.318863,0.747940,-0.328271,-0.004766,0.429288,-0.220109,0.057000,-0.105868,0.223934
67705,-0.376363,-1.061497,0.978902,1.629268,-1.385857,-0.074805,-1.054468,0.893588,-0.304963,0.396304,...,0.406107,-0.191385,-0.143821,-0.080626,0.448429,-0.230180,0.697131,0.306224,0.115183,0.042480
233557,0.738836,2.081836,-0.128730,-1.497688,0.049290,0.463113,-0.260199,0.090636,-0.139337,0.305002,...,-0.142484,-0.300450,-0.762338,0.183893,-1.088678,-0.175748,0.243605,-0.075166,-0.079110,-0.182720


In [58]:
X_train_std.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,...,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0,182276.0
mean,0.118173,-0.000496,0.00062,-0.001459,-9.3e-05,-0.000414,0.001552,-0.001042,-0.002027,0.000986,...,0.001038,-0.000339,-0.000137,0.000653,0.000357,-0.001881,3.8e-05,-0.00038,-0.000129,0.934029
std,0.557778,1.968199,1.661023,1.524433,1.41863,1.398726,1.341856,1.263245,1.213241,1.098672,...,0.775282,0.743648,0.725926,0.618122,0.604849,0.522371,0.482101,0.404368,0.330388,3.568911
min,-0.994952,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.320155,...,-54.49772,-34.830382,-10.933144,-36.666,-2.836627,-7.495741,-2.53433,-9.895244,-15.430084,-0.309175
25%,-0.359214,-0.917797,-0.597934,-0.890934,-0.848321,-0.689116,-0.767962,-0.553619,-0.208026,-0.641897,...,-0.21185,-0.228869,-0.541842,-0.161245,-0.353721,-0.31915,-0.327143,-0.070992,-0.053194,-0.229841
50%,0.0,0.017078,0.067194,0.179143,-0.021268,-0.054961,-0.273748,0.039914,0.022449,-0.049805,...,-0.06263,-0.029942,0.006426,-0.010764,0.040949,0.014265,-0.051215,0.001272,0.01133,0.0
75%,0.640786,1.315437,0.804057,1.02577,0.741525,0.612443,0.399529,0.570656,0.32763,0.599647,...,0.133595,0.186167,0.529241,0.148003,0.439526,0.350675,0.240569,0.091125,0.078348,0.770159
max,1.034606,2.45493,22.057729,4.101716,16.715537,34.801666,73.301626,120.589494,20.007208,10.392889,...,39.420904,27.202839,10.50309,22.528412,4.584549,5.852484,3.517346,31.612198,33.847808,361.067342


Balancing Data

In [59]:
# Show the relative class frequencies of the training target to confirm
# that the classes are heavily imbalanced.
y_train.value_counts(normalize=True)

0    0.99819
1    0.00181
Name: Class, dtype: float64

We will downsample the majority class (training data only).

In [60]:
# !pip install imblearn

In [61]:
from imblearn.under_sampling import RandomUnderSampler

In [62]:
def random_undersampler(X, y):
    """Randomly under-sample (X, y) and report the class counts.

    Parameters
    ----------
    X : features passed to RandomUnderSampler.fit_resample
    y : target passed to fit_resample; must provide `value_counts()`

    Returns
    -------
    tuple
        (X_resample, y_resample) as returned by fit_resample.

    The sampler is seeded with CONFIG_DATA['seed'], so repeated calls on the
    same input select the same rows.
    """
    # Seeded sampler built with the library's default sampling strategy.
    sampler = RandomUnderSampler(random_state=CONFIG_DATA['seed'])
    X_balanced, y_balanced = sampler.fit_resample(X, y)

    # Print the class counts before and after, separated by a blank line.
    sections = (
        ('Distribution before resampling :', y),
        ('Distribution after resampling  :', y_balanced),
    )
    for position, (heading, labels) in enumerate(sections):
        if position:
            print("")
        print(heading)
        print(labels.value_counts())

    return X_balanced, y_balanced

In [63]:
# Balance the standardized training set; the call prints the class counts
# before and after resampling.
X_train_clean, y_train_clean = random_undersampler(X_train_std, y_train)

Distribution before resampling :
0    181946
1       330
Name: Class, dtype: int64

Distribution after resampling  :
0    330
1    330
Name: Class, dtype: int64


Dump all preprocessors

In [64]:
# Bundle the fitted preprocessing objects under named keys and pickle the
# bundle so later steps can load everything from a single file.
preprocessor = dict(standardizer=standardizer)
utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

# Preprocess all

In [65]:
def clean_data(data, standardizer):
    """Apply every cleaning step to `data` and return the result.

    Scaling is currently the only step: the frame is passed to
    transform_standardize with the given fitted `standardizer` and that
    function's default columns.
    """
    return transform_standardize(data, standardizer)

In [66]:
def _preprocess_data(data):
    """Clean `data` using the preprocessor bundle stored on disk.

    The bundle is unpickled from CONFIG_DATA['preprocessor_path'] on every
    call; its 'standardizer' entry is handed to clean_data together with
    `data`, and clean_data's result is returned.
    """
    fitted = utils.pickle_load(CONFIG_DATA['preprocessor_path'])
    return clean_data(data, fitted['standardizer'])

# Preprocess all

Generate preprocessor

In [67]:
def generate_preprocessor(return_file=True):
    """Fit the preprocessing objects on the training features and pickle them.

    Only the feature file of the training split is read; no step here needs
    the target, so it is not loaded.

    Parameters
    ----------
    return_file : bool, default True
        When True the preprocessor dict is returned; otherwise None.

    Returns
    -------
    dict or None
        {'standardizer': <fitted scaler>} when return_file is True.

    Side effects
    ------------
    Writes the dict to CONFIG_DATA['preprocessor_path']. fit_standardize
    additionally writes the scaler to CONFIG_DATA['standardizer_path'].
    """
    # Load the training features (element 0 of the train path pair).
    X = utils.pickle_load(CONFIG_DATA['train_set_path'][0])

    # Generate preprocessor: standardizer
    standardizer = fit_standardize(X)

    # Dump file
    preprocessor = {
        'standardizer': standardizer
    }
    utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

    if return_file:
        return preprocessor
    

In [68]:
# Rebuild the preprocessor from the pickled training split: this re-fits the
# scaler and writes the bundle to CONFIG_DATA['preprocessor_path'].
preprocessor = generate_preprocessor()

Preprocess one split (train, valid, or test)

In [69]:
def preprocess_data(type='train', return_file=True):
    """Preprocess one pickled split and store the cleaned result.

    Parameters
    ----------
    type : str, default 'train'
        Split name used to build the config keys '<type>_set_path' (input)
        and '<type>_clean_path' (output). The name shadows the builtin but
        is kept because callers pass it by keyword.
    return_file : bool, default True
        When True the cleaned pair is returned; otherwise None.

    Returns
    -------
    tuple or None
        (X_clean, y_clean) when return_file is True.

    Side effects
    ------------
    Prints the cleaned shapes and pickles both objects to the two paths in
    CONFIG_DATA['<type>_clean_path'].
    """
    # Element 0 of each path pair is the feature file, element 1 the target.
    raw_key = f'{type}_set_path'
    features = utils.pickle_load(CONFIG_DATA[raw_key][0])
    target = utils.pickle_load(CONFIG_DATA[raw_key][1])

    # Apply the stored preprocessor; the target passes through unchanged.
    features = _preprocess_data(features)

    # Resampling is applied to the training split only, so the validation
    # and test splits keep their original class distribution.
    if type == 'train':
        features, target = random_undersampler(features, target)

    print("X clean shape:", features.shape)
    print("y clean shape:", target.shape)

    # Persist features first, then target, mirroring the input layout.
    clean_key = f'{type}_clean_path'
    for position, payload in enumerate((features, target)):
        utils.pickle_dump(payload, CONFIG_DATA[clean_key][position])

    return (features, target) if return_file else None

In [70]:
# Preprocess the training split: scale, undersample (training split only),
# then pickle the cleaned X and y.
X_train_clean, y_train_clean = preprocess_data(type = 'train')

Distribution before resampling :
0    181946
1       330
Name: Class, dtype: int64

Distribution after resampling  :
0    330
1    330
Name: Class, dtype: int64
X clean shape: (660, 30)
y clean shape: (660,)


In [71]:
# Preprocess the validation split: scaling only, no resampling; the cleaned
# X and y are pickled.
X_valid_clean, y_valid_clean = preprocess_data(type = 'valid')

X clean shape: (45569, 30)
y clean shape: (45569,)


In [72]:
# Preprocess the test split: scaling only, no resampling; the cleaned X and
# y are pickled.
X_test_clean, y_test_clean = preprocess_data(type = 'test')

X clean shape: (56962, 30)
y clean shape: (56962,)


In [73]:
X_train_clean.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,...,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,0.038514,-2.387756,1.885442,-3.583184,2.237929,-1.544177,-0.677305,-2.84745,0.327506,-1.313329,...,0.184407,0.365218,-0.002914,-0.032933,-0.060062,0.01495,0.05544,0.081707,0.027006,1.049348
std,0.575477,5.458339,3.586012,6.221344,3.269201,4.217382,1.789254,5.868419,4.75981,2.352173,...,1.02852,2.745772,1.143233,1.275867,0.578009,0.697991,0.475536,1.037127,0.438481,3.128954
min,-0.993155,-30.55238,-8.402154,-31.103685,-3.690732,-22.105532,-6.406267,-43.557242,-41.044261,-13.320155,...,-4.128186,-22.797604,-8.887017,-19.254328,-2.069044,-4.781606,-1.149923,-7.263482,-2.497253,-0.309175
25%,-0.469612,-2.788129,-0.129752,-5.344561,-0.266842,-1.890041,-1.625389,-3.152655,-0.173673,-2.250351,...,-0.202201,-0.17559,-0.541429,-0.251073,-0.402638,-0.314386,-0.260445,-0.06776,-0.066925,-0.286247
50%,-0.072272,-0.809079,1.029414,-1.367758,1.278598,-0.417002,-0.617747,-0.676872,0.208383,-0.701103,...,0.02413,0.12474,-0.004375,-0.030624,0.000484,0.058284,0.010535,0.043815,0.030451,-0.0659
75%,0.590229,1.023998,2.9087,0.219726,4.374232,0.602139,0.045117,0.22136,0.895188,0.118295,...,0.427775,0.655246,0.562087,0.188441,0.384534,0.392876,0.352495,0.443936,0.18876,1.075922
max,1.024963,2.313804,22.057729,2.996757,12.114672,11.095089,6.474115,5.431271,20.007208,8.009215,...,11.059004,27.202839,8.316275,5.46623,1.091435,2.208209,2.745261,3.599204,2.081454,29.593698


In [74]:
X_train_clean.columns.tolist()

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount']