In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
#custom functions
# Project-local helpers: shared SEED, ranking metrics, plotting and
# report/model persistence utilities from the `src` package.
import os, sys
# Put the parent directory first on the import path so `src` resolves
# (assumes the notebook lives one level below the repository root — TODO confirm).
sys.path.insert(0, '..')
from src.models.config import SEED
from src.utils import performance_rank_df, performance_rank_n, performance_rank_f1_opt
from src.utils import plot_precision_recall, plot_loss, plot_auc, plot_history, plot_metrics, plot_rank
from src.utils import plot_label_clusters, plot_label_clusters_vae, plot_label_clusters_cvae
from src.utils import save_report_json, save_report_pandas_to_csv
from src.utils import save_model_joblib, save_model_parameters_pkl, save_model_keras
from src.utils import reset_random_seeds


In [3]:
# Third-party imports, plotting defaults and random seeding for the notebook.
# Relies on SEED, imported from src.models.config in the previous cell.
import warnings
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
# NOTE(review): `pyplot` is the same module as the `plt` alias imported above.
from matplotlib import pyplot
# Default figure size for every plot in the notebook.
mpl.rcParams['figure.figsize'] = (12, 4)
# Colour list taken from the active matplotlib property cycle.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# NOTE(review): this silences *all* warnings, including deprecation notices
# from pandas / TensorFlow — consider narrowing the filter.
warnings.filterwarnings('ignore')
%matplotlib inline
import seaborn as sns

# NOTE(review): `os` is already imported in the previous cell.
import os
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
import numpy as np
# Seed NumPy's global RNG and shorten printed array values to 4 decimals.
np.random.seed(SEED)
np.set_printoptions(precision=4)
import random as python_random
# Seed the standard-library `random` module.
python_random.seed(SEED)
import tensorflow as tf
# Reset Keras' global state, then seed TensorFlow's global RNG.
tf.keras.backend.clear_session()
tf.random.set_seed(SEED)
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, BatchNormalization, Lambda
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras import utils

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
# Report the TensorFlow / Keras versions and the execution mode, one line each,
# so the recorded outputs can be tied to a specific runtime.
runtime_report = (
    ("TensorFlow version: {}", tf.__version__),
    ("TensorFlow keras version: {}", tf.keras.__version__),
    ("Eager execution: {}", tf.executing_eagerly()),
)
for line_template, reported_value in runtime_report:
    print(line_template.format(reported_value))

TensorFlow version: 2.0.0
TensorFlow keras version: 2.2.4-tf
Eager execution: True


# Read Data

In [5]:
# Load the raw credit-card transactions from the local copy of the dataset.
# Alternative remote source kept for reference:
#pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')
raw_csv_path = '../data/raw/creditcard.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
# (rows, columns) of the raw frame as loaded from the CSV.
df.shape

(284807, 31)

In [7]:
# Class balance: count rows with Class == 0 (negative) and Class == 1 (positive),
# then display the positive share of the whole dataset.
class_counts = np.bincount(df['Class'])
neg, pos = class_counts
total = neg + pos
pos / total

0.001727485630620034

# Data pre-processing

In [8]:
# Build the model inputs: separate the label, engineer the amount feature,
# one-hot encode the target and carve out train / validation / test sets.
y = df['Class']

# Feature frame: every column except the label, with `Amount` replaced by its
# log-transform and `Time` removed.
# - `columns=` is used instead of the positional axis (`drop('Class', 1)`),
#   which is deprecated and raises a TypeError on pandas >= 2.0.
# - `np.log1p(x)` is the numerically accurate form of `np.log(x + 1)`.
df_X = (
    df.drop(columns='Class')
      .assign(Amount_log=lambda frame: np.log1p(frame['Amount']))
      .drop(columns=['Amount', 'Time'])
)

le = LabelEncoder().fit(y)
encoded_Y = le.transform(y)  # convert categorical labels to integers
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = utils.to_categorical(encoded_Y)

print(dummy_y[:4])

# Hold out 20% of the rows for testing, then 10% of the remaining rows for
# validation.
# NOTE(review): the splits are not stratified although the positive class is
# rare, and `random_state=42` is hard-coded rather than using the imported
# SEED — left unchanged here so downstream results stay comparable; confirm
# whether `stratify=` / SEED should be adopted.
X_train, X_test, y_train, y_test = train_test_split(df_X, dummy_y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=42)

print("X shape: ", X_train.shape, X_val.shape, X_test.shape)
print("Y shape: ", y_train.shape, y_val.shape, y_test.shape)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]
X shape:  (205060, 29) (22785, 29) (56962, 29)
Y shape:  (205060, 2) (22785, 2) (56962, 2)
