In [None]:
# ! pip install pandas seaborn scikit-learn imblearn

In [2]:
import pandas as pd
import src.utils as utils
import os

from sklearn.model_selection import train_test_split

# Load Configuration File

In [4]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/data.csv',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_column': 'Class',
 'seed': 42,
 'test_size': 0.2,
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'train_clean_path': ['data/output/X_train_clean.pkl',
  'data/output/y_train_clean.pkl'],
 'valid_clean_path': ['data/output/X_valid_clean.pkl',
  'data/output/y_valid_clean.pkl'],
 'test_clean_path': ['data/output/X_test_clean.pkl',
  'data/output/y_test_clean.pkl']}

In [6]:
CONFIG_DATA['raw_dataset_path']

'data/raw/data.csv'

## Download using Kaggle API


In [7]:
# !pip install kaggle

In [9]:
! kaggle datasets list

ref                                                               title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
alphiree/cardiovascular-diseases-risk-prediction-dataset          Cardiovascular Diseases Risk Prediction Dataset       5MB  2023-07-03 12:12:19           1570         71  1.0              
nelgiriyewithana/countries-of-the-world-2023                      Global Country Information Dataset - 2023            23KB  2023-07-08 20:37:33            531         29  1.0              
arnavsmayan/netflix-userbase-dataset                              Netflix Userbase Dataset                             25KB  2023-07-04 07:38:41           1716         48  1.0              
aaditshukla/flipkart-fasion-products-dataset      

In [12]:
! kaggle datasets download -d mlg-ulb/creditcardfraud

Downloading creditcardfraud.zip to c:\Users\david\Documents\pacmann\anomaly_detection




  0%|          | 0.00/66.0M [00:00<?, ?B/s]
  2%|▏         | 1.00M/66.0M [00:01<01:34, 720kB/s]
  3%|▎         | 2.00M/66.0M [00:02<01:03, 1.05MB/s]
  5%|▍         | 3.00M/66.0M [00:02<00:51, 1.29MB/s]
  6%|▌         | 4.00M/66.0M [00:03<00:44, 1.46MB/s]
  8%|▊         | 5.00M/66.0M [00:03<00:41, 1.52MB/s]
  9%|▉         | 6.00M/66.0M [00:04<00:41, 1.50MB/s]
 11%|█         | 7.00M/66.0M [00:05<00:43, 1.42MB/s]
 12%|█▏        | 8.00M/66.0M [00:06<00:50, 1.21MB/s]
 14%|█▎        | 9.00M/66.0M [00:07<00:48, 1.23MB/s]
 15%|█▌        | 10.0M/66.0M [00:08<00:47, 1.23MB/s]
 17%|█▋        | 11.0M/66.0M [00:08<00:44, 1.31MB/s]
 18%|█▊        | 12.0M/66.0M [00:09<00:45, 1.25MB/s]
 20%|█▉        | 13.0M/66.0M [00:10<00:43, 1.28MB/s]
 21%|██        | 14.0M/66.0M [00:11<00:44, 1.21MB/s]
 23%|██▎       | 15.0M/66.0M [00:12<00:43, 1.22MB/s]
 24%|██▍       | 16.0M/66.0M [00:13<00:40, 1.29MB/s]
 26%|██▌       | 17.0M/66.0M [00:14<00:39, 1.29MB/s]
 27%|██▋       | 18.0M/66.0M [00:14<00:38, 1.29MB/s]
 2

In [15]:
from zipfile import ZipFile

In [20]:
with ZipFile("./creditcardfraud.zip", 'r') as file:
    file.extractall(
        path="./data/raw")

In [21]:
os.rename("data/raw/creditcard.csv", CONFIG_DATA['raw_dataset_path'])

# Data Collection

In [7]:
CONFIG_DATA['raw_dataset_path']

'data/raw/data.csv'

In [10]:
CONFIG_DATA['data_set_path']

'data/output/data.pkl'

In [8]:
def read_data(return_file=True):
    # Read data
    data = pd.read_csv(CONFIG_DATA['raw_dataset_path'], 
                       sep=',',
                    #    index_col=CONFIG_DATA['index_column'])
    )
    # Print data
    print('data shape   :', data.shape)

    # Dump data
    utils.pickle_dump(data, CONFIG_DATA['data_set_path'])

    # Return data
    if return_file:
        return data

In [9]:
data = read_data()
data.head()

data shape   : (284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [27]:
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


# Data Splitting

In [11]:
def split_input_output(return_file=True):
    # Read data
    data = utils.pickle_load(CONFIG_DATA['data_set_path'])

    # Split input & output
    y = data[CONFIG_DATA['output_column']]
    X = data.drop([CONFIG_DATA['output_column']], axis=1)

    # Print splitting
    print('Input shape  :', X.shape)
    print('Output shape :', y.shape)
    print('Input NAN    :')
    print(X.isnull().sum())
    print('Benchmark    :')
    print(y.value_counts(normalize=True))
    
    # Dump file
    utils.pickle_dump(X, CONFIG_DATA['input_set_path'])
    utils.pickle_dump(y, CONFIG_DATA['output_set_path'])
    utils.pickle_dump(X.columns, CONFIG_DATA['input_columns_path'])     # dump input columns

    if return_file:
        return X, y

In [12]:
X, y = split_input_output()

Input shape  : (284807, 30)
Output shape : (284807,)
Input NAN    :
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64
Benchmark    :
0    0.998273
1    0.001727
Name: Class, dtype: float64


In [13]:
CONFIG_DATA['input_set_path']

'data/output/input.pkl'

In [19]:
CONFIG_DATA['test_size']

0.2

In [14]:
def split_train_test(return_file=True):
    # Load data
    X = utils.pickle_load(CONFIG_DATA['input_set_path'])
    y = utils.pickle_load(CONFIG_DATA['output_set_path'])

    # Split test & rest (train & valid)
    X_train, X_test, y_train, y_test = train_test_split( 
                                            X,
                                            y,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Split train & valid
    X_train, X_valid, y_train, y_valid = train_test_split(
                                            X_train,
                                            y_train,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Print splitting
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_valid shape  :', X_valid.shape)
    print('y_valid shape  :', y_valid.shape)
    print('X_test shape  :', X_test.shape)
    print('y_test shape  :', y_test.shape)

    # Dump file
    utils.pickle_dump(X_train, CONFIG_DATA['train_set_path'][0])
    utils.pickle_dump(y_train, CONFIG_DATA['train_set_path'][1])
    utils.pickle_dump(X_valid, CONFIG_DATA['valid_set_path'][0])
    utils.pickle_dump(y_valid, CONFIG_DATA['valid_set_path'][1])
    utils.pickle_dump(X_test, CONFIG_DATA['test_set_path'][0])
    utils.pickle_dump(y_test, CONFIG_DATA['test_set_path'][1])

    if return_file:
        return X_train, X_valid, X_test, y_train, y_valid, y_test

In [15]:
X_train, X_valid, X_test, y_train, y_valid, y_test = split_train_test()

X_train shape : (182276, 30)
y_train shape : (182276,)
X_valid shape  : (45569, 30)
y_valid shape  : (45569,)
X_test shape  : (56962, 30)
y_test shape  : (56962,)


Get sample for testing

In [16]:
import numpy as np

In [17]:
np.random.seed(123)
y_sample_0 = y_test[y_test==0].sample(10)
y_sample_1 = y_test[y_test==1].sample(10)

y_sample = pd.concat((y_sample_0, y_sample_1), axis=0)
y_sample

108307    0
58290     0
187592    0
126386    0
132845    0
241599    0
146035    0
62972     0
124771    0
45387     0
6903      1
88876     1
43061     1
43428     1
79536     1
226877    1
68067     1
15225     1
143188    1
204503    1
Name: Class, dtype: int64

In [18]:
X_sample = X_test.loc[y_sample.index]
X_sample

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
108307,70845.0,-1.132878,0.777117,1.234338,1.095589,-0.785901,1.071346,-1.067017,1.394525,-0.162929,...,-0.325606,0.239899,0.665532,0.234846,-0.274929,-0.546007,-0.373849,-0.168719,-0.169976,4.91
58290,48321.0,1.196288,0.090215,0.536234,0.847762,-0.488749,-0.686884,0.070069,-0.197166,0.316271,...,-0.01992,-0.209673,-0.416199,-0.016219,0.439244,0.480025,0.264399,-0.019545,0.018169,25.77
187592,127594.0,1.705084,0.162441,-1.813428,3.465877,1.112033,0.333638,0.661591,-0.027992,-1.452406,...,-0.044969,0.084504,-0.2658,-0.030647,0.016665,0.102378,-0.059799,-0.11953,-0.045987,151.29
126386,77996.0,-0.652263,0.53344,1.725238,0.854486,0.10007,0.777838,0.028405,0.432383,-0.400891,...,-0.116243,0.37339,1.052172,-0.046815,-0.296924,-0.636518,-0.253741,0.092936,0.176819,38.0
132845,80140.0,-0.332602,1.138471,1.27443,0.047414,0.061579,-0.954802,0.704335,-0.069569,-0.391728,...,0.110713,-0.268818,-0.698578,-0.027589,0.278157,-0.153859,0.075404,0.246484,0.097549,0.89
241599,151091.0,1.808209,-0.984382,-0.871582,-1.533256,-0.680352,-0.653966,-0.289412,-0.147101,1.65154,...,0.169415,-0.110021,-0.346396,0.185815,-0.37689,-0.391614,-0.573311,0.008084,-0.027949,129.28
146035,87377.0,0.135615,0.999324,-0.42837,-0.552395,0.924707,-0.73896,0.947843,-0.06693,-0.002546,...,-0.00029,-0.34003,-0.842,0.107251,0.491009,-0.440728,0.120315,0.222703,0.083007,5.49
62972,50497.0,1.208878,-0.22779,0.584698,-0.125993,-0.554653,0.078591,-0.604316,0.23319,0.329875,...,-0.127648,0.123703,0.365558,-0.027795,-0.239571,0.150338,1.106084,-0.05276,-0.008848,1.0
124771,77418.0,1.248555,0.262007,0.108759,0.990392,0.189618,0.068305,-0.018768,0.037248,0.11764,...,-0.181013,-0.08916,-0.213208,-0.083647,-0.799815,0.558927,-0.339018,0.033715,0.012361,1.98
45387,42319.0,-1.281051,0.403664,2.155111,1.17706,1.078831,-0.684605,0.200719,-0.275082,-0.510493,...,0.284443,-0.148702,-0.024057,-0.448308,0.087173,0.309002,-0.296291,0.177054,-0.108854,15.0


In [41]:
X_sample.to_csv('data/output/X_sample.csv', index=False)

In [42]:
y_sample.to_csv('data/output/y_sample.csv', index=False)