In [1]:
# This section is importing all libraries & data used for the project

# importing all necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
# importing the training and testing data file
train = pd.read_csv("./DataFiles/CreditCard_train.csv", header=1, index_col='ID')
test = pd.read_csv("./DataFiles/CreditCard_test.csv", header=1, index_col='ID')

# From our training data, we want to split a validation set
train, val = train_test_split(train, test_size=0.1)

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

train

21600 train examples
2400 validation examples
6000 test examples


Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11652,130000,2,1,2,30,1,-2,-2,-2,-2,...,-915,-915,-1070,0,0,0,0,0,0,0
12289,30000,1,2,2,35,0,0,2,2,-2,...,0,0,0,3026,0,0,0,0,0,0
9793,150000,2,2,1,28,0,0,0,0,0,...,70371,72257,72536,5008,5000,2499,3000,2789,2439,0
8526,150000,2,2,1,36,1,2,2,4,3,...,139097,138592,141447,9734,14500,0,2000,5300,5000,0
4644,500000,2,1,1,37,0,0,0,0,0,...,287998,298343,308378,10000,15000,15000,15000,15000,15200,0
20720,50000,1,3,2,34,0,0,0,0,0,...,25556,24389,25436,4500,4000,3000,2500,4000,2500,0
2054,140000,1,3,1,37,2,2,2,2,2,...,136438,144979,139314,6400,5000,0,10800,0,5300,0
11666,200000,2,2,2,34,-1,-1,0,-1,-1,...,28300,13227,11202,13805,6797,28314,13291,11258,11912,1
11761,140000,1,1,1,27,1,-2,-2,-2,-2,...,-488,-488,-488,0,0,0,0,0,0,0
18096,300000,1,1,1,38,0,0,0,0,0,...,78887,57682,35493,4511,3300,4505,4200,2500,25151,0


In [3]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('default payment next month') #Our target
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe),labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [4]:
batch_size = 5 
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)

### Let's illustrate the data we feed in through tensorflow dataset

In [5]:
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['AGE'])
    print('A batch of targets:', label_batch )

Every feature: ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
A batch of ages: tf.Tensor([41 33 29 33 36], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int32)


### Next step is to select the features with corresponding data type

In [6]:
feature_columns = []

# Numeric columns
numerics = [
    'LIMIT_BAL', 
    'AGE',
    'BILL_AMT1', 
    'BILL_AMT2', 
    'BILL_AMT3', 
    'BILL_AMT4', 
    'BILL_AMT5', 
    'BILL_AMT6', 
    'PAY_AMT1', 
    'PAY_AMT2', 
    'PAY_AMT3', 
    'PAY_AMT4', 
    'PAY_AMT5', 
    'PAY_AMT6'
]
for header in numerics:
    feature_columns.append(tf.feature_column.numeric_column(header))

# Categorical columns
categoricals = [
    'SEX', 
    'EDUCATION', 
    'MARRIAGE', 
    'PAY_0', 
    'PAY_2', 
    'PAY_3', 
    'PAY_4', 
    'PAY_5', 
    'PAY_6'
]

# Since the categorical variables take so many levles, we need to use embedding 
# to reduce the dimensionality

dimension = [
    2,
    3,
    3,
    15,
    15,
    15,
    15,
    15,
    15
]
i = 0
for header in categoricals:
    category = tf.feature_column.categorical_column_with_identity(
        header, len(train[header].unique()), default_value=0
    )
    category_one_hot = tf.feature_column.indicator_column(category)
    feature_columns.append(category_one_hot)
    
#     Now let's add the emedding layer
    category_embedding = tf.feature_column.embedding_column(category, dimension=dimension[i])
    feature_columns.append(category_embedding)
    i += 1

In [7]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [8]:
batch_size = 64
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [9]:
model = tf.keras.Sequential([
    feature_layer,
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

sgd = tf.keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, clipnorm=1.)
model.compile(optimizer=sgd,
              loss='binary_crossentropy',
              metrics=['accuracy'],
              run_eagerly=True)

model.fit(train_ds, validation_data=val_ds, epochs=3)


Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x112180890>

In [10]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.789


### Now let's try to improve the model
3 different adjustment will be made:
1. 'AGE' will be transformed into bucket variable, since it makes more sense to treat them like people of different age group.
2. Try to standardize the numeric variables.
3. Do a PCA to the numerics and see if we can trhow away some variables.

In [11]:
# Firstly, 'AGE' as bucket input

feature_columns = []

age = tf.feature_column.numeric_column("AGE")

age_buckets = tf.feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

In [12]:
# Numeric columns to be standardised
raw_numerics = [
    'AGE',
    'LIMIT_BAL', 
    'BILL_AMT1', 
    'BILL_AMT2', 
    'BILL_AMT3', 
    'BILL_AMT4', 
    'BILL_AMT5', 
    'BILL_AMT6', 
    'PAY_AMT1', 
    'PAY_AMT2', 
    'PAY_AMT3', 
    'PAY_AMT4', 
    'PAY_AMT5', 
    'PAY_AMT6'
]

# 9 principal components are kept after PCA
principalColumns = [
    'PC1',
    'PC2',
    'PC3',
    'PC4',
    'PC5',
    'PC6',
    'PC7',
    'PC8', 
    'PC9',
]

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
pca = PCA(0.95)

# Firstly we fit only the train data
scaler.fit(train.loc[:, raw_numerics])

# Then we apply the transformation to all the datasets
train_img = scaler.transform(train.loc[:, raw_numerics])
test_img = scaler.transform(test.loc[:, raw_numerics])
val_img = scaler.transform(val.loc[:, raw_numerics])

pca.fit(train_img)


train_img = pca.transform(train_img)
test_img = pca.transform(test_img)
val_img = pca.transform(val_img)

principalTrain = pd.DataFrame(data=train_img,
                             columns=principalColumns,
                             index=train.index)
principalTest = pd.DataFrame(data=test_img,
                            columns=principalColumns,
                            index=test.index)
principalVal = pd.DataFrame(data=val_img,
                           columns=principalColumns,
                           index=val.index)


train = pd.concat([principalTrain, train], axis=1)
test = pd.concat([principalTest, test], axis=1)
val = pd.concat([principalVal, val], axis=1)

for header in principalColumns:
    feature_columns.append(tf.feature_column.numeric_column(header))
    
# Categorical columns
categoricals = [
    'SEX', 
    'EDUCATION', 
    'MARRIAGE', 
    'PAY_0', 
    'PAY_2', 
    'PAY_3', 
    'PAY_4', 
    'PAY_5', 
    'PAY_6'
]

# Since the categorical variables take so many levles, we need to use embedding 
# to reduce the dimensionality

dimension = [
    4,
    10,
    6,
    15,
    15,
    15,
    15,
    15,
    15
]
i = 0
for header in categoricals:
    category = tf.feature_column.categorical_column_with_identity(
        header, len(train[header].unique()), default_value=0
    )
    category_one_hot = tf.feature_column.indicator_column(category)
    feature_columns.append(category_one_hot)
    
#     Now let's add the emedding layer
    category_embedding = tf.feature_column.embedding_column(category, dimension=dimension[i])
    feature_columns.append(category_embedding)
    i += 1

In [13]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [14]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [15]:
model = tf.keras.Sequential([
    feature_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=True)
model.compile(optimizer=adam,
              loss='binary_crossentropy',
              metrics=['accuracy'],
              run_eagerly=True)

model.fit(train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x112180b50>

In [16]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.8325
