In [158]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

pd.options.display.float_format = lambda x: f"{x:0,.2f}"
plt.rcParams["figure.figsize"] = (14, 7)

# Credit Default Analysis

* **ID**: ID of each client
* **LIMIT_BAL**: Amount of given credit in NT dollars (includes individual and family/supplementary credit
* **SEX**: Gender (1=male, 2=female)
* **EDUCATION**: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
* **MARRIAGE**: Marital status (1=married, 2=single, 3=others)
* **AGE**: Age in years
* **PAY_1**: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
* **PAY_2**: Repayment status in August, 2005 (scale same as above)
* **PAY_3**: Repayment status in July, 2005 (scale same as above)
* **PAY_4**: Repayment status in June, 2005 (scale same as above)
* **PAY_5**: Repayment status in May, 2005 (scale same as above)
* **PAY_6**: Repayment status in April, 2005 (scale same as above)
* **BILL_AMT1**: Amount of bill statement in September, 2005 (NT dollar)
* **BILL_AMT2**: Amount of bill statement in August, 2005 (NT dollar)
* **BILL_AMT3**: Amount of bill statement in July, 2005 (NT dollar)
* **BILL_AMT4**: Amount of bill statement in June, 2005 (NT dollar)
* **BILL_AMT5**: Amount of bill statement in May, 2005 (NT dollar)
* **BILL_AMT6**: Amount of bill statement in April, 2005 (NT dollar)
* **PAY_AMT1**: Amount of previous payment in September, 2005 (NT dollar)
* **PAY_AMT2**: Amount of previous payment in August, 2005 (NT dollar)
* **PAY_AMT3**: Amount of previous payment in July, 2005 (NT dollar)
* **PAY_AMT4**: Amount of previous payment in June, 2005 (NT dollar)
* **PAY_AMT5**: Amount of previous payment in May, 2005 (NT dollar)
* **PAY_AMT6**: Amount of previous payment in April, 2005 (NT dollar)
* **default.payment.next.month**: Default payment (1=yes, 0=no)

In [148]:
credit = pd.read_csv("./UCI_Credit_Card.csv", index_col=0)
credit.rename(columns={"default.payment.next.month": "default",
                       "PAY_0": "PAY_1"}, inplace=True)
credit = pd.get_dummies(credit, columns=["SEX", "EDUCATION", "MARRIAGE"])

We split the data into train and test; set 20% of the observations to be into the test set

In [149]:
credit_train, credit_test = train_test_split(credit, test_size=0.2, random_state=1643)
credit_train.shape

(24000, 34)

In [150]:
credit_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LIMIT_BAL,24000.0,168171.24,129705.96,10000.0,50000.0,140000.0,240000.0,800000.0
AGE,24000.0,35.49,9.2,21.0,28.0,34.0,41.0,75.0
PAY_1,24000.0,-0.02,1.12,-2.0,-1.0,0.0,0.0,8.0
PAY_2,24000.0,-0.14,1.19,-2.0,-1.0,0.0,0.0,7.0
PAY_3,24000.0,-0.17,1.2,-2.0,-1.0,0.0,0.0,8.0
PAY_4,24000.0,-0.22,1.17,-2.0,-1.0,0.0,0.0,8.0
PAY_5,24000.0,-0.27,1.14,-2.0,-1.0,0.0,0.0,8.0
PAY_6,24000.0,-0.29,1.15,-2.0,-1.0,0.0,0.0,8.0
BILL_AMT1,24000.0,51424.47,73518.14,-165580.0,3605.75,22598.0,67723.75,746814.0
BILL_AMT2,24000.0,49372.12,70960.84,-67526.0,2999.75,21419.0,64677.0,743970.0


**Initial Observations:**  
* 60% of observations come from women; 40% men.
* 53% of all credit card holders are married
* 47% attended a university, 35% graduate schools, 16% high school

In [220]:
pd.DataFrame([credit_train[f"PAY_{v}"].value_counts() for v in range(1, 7)])[[-1, *range(1, 9)]]

Unnamed: 0,-1,1,2,3,4,5,6,7,8
PAY_1,4505.0,2949.0,2127.0,262.0,60.0,19.0,8.0,7.0,16.0
PAY_2,4811.0,21.0,3127.0,262.0,74.0,20.0,9.0,17.0,
PAY_3,4717.0,3.0,3031.0,189.0,63.0,18.0,20.0,20.0,3.0
PAY_4,4541.0,1.0,2520.0,139.0,58.0,31.0,3.0,49.0,1.0
PAY_5,4407.0,,2084.0,144.0,72.0,13.0,4.0,47.0,1.0
PAY_6,4578.0,,2205.0,150.0,40.0,12.0,15.0,39.0,1.0


In [226]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from sklearn.preprocessing import Normalizer

In [193]:
def fetch_batches(X, y, n_batches, set_seed=None):
    """
    Retrieve the i-th batch from a random shuffled
    training dataset. Each epoch the training
    dataset gets reshuffled.
    """
    np.random.seed(set_seed)
    batches = np.c_[X, y]
    np.random.shuffle(batches)
    batches = np.array_split(batches, n_batches)
    for batch in batches:
        yield batch[:, :-1], batch[:, -1]

### Construction Phase

In [296]:
norm = Normalizer()
X_train = credit_train.drop("default", axis=1).values
X_train = norm.fit_transform(X_train)

y_train = credit_train["default"].values

In [334]:
tf.reset_default_graph()
nh1 = 200
nh2 = 100
nh3 = 50
nout = 2

X = tf.placeholder(tf.float32, shape=(None, 33))
y = tf.placeholder(tf.int64, shape=(None))

with tf.name_scope("Layers"):
    layer1 = fully_connected(X, nh1, scope="hidden_layer1")
    layer2 = fully_connected(layer1, nh2, scope="hidden_layer2")
    layer3 = fully_connected(layer2, nh3, scope="hidden_layer3")
    output = fully_connected(layer3, nout, activation_fn=None)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                              logits=output)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.0001
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)
    
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(output, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

### Execution Phase

In [358]:
nepochs = 50
batch_size = 50

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(1, nepochs + 1):
        """
        for X_batch, y_batch in fetch_batches(X_train, y_train, batch_size):
            sess.run(train_step, feed_dict={X:X_batch, y:y_batch})
        """
        sess.run(train_step, feed_dict={X:X_train, y:y_train})
        train_acc = sess.run(accuracy, feed_dict={X:X_train, y:y_train})
        end = "\n" if epoch %20 == 0 else "\r"
        print(f"@Epoch {epoch}, accuracy: {train_acc:0.2%}", end=end)
    save_path = saver.save(sess, ".tmp/credit_model.ckpt")
    vals = sess.run(output, feed_dict={X:X_train, y:y_train})

@Epoch 20, accuracy: 77.84%
@Epoch 40, accuracy: 77.84%
@Epoch 50, accuracy: 77.84%

In [359]:
np.mean(np.argmax(vals, axis=1) == y_train)

0.77841666666666665

In [360]:
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression()
lreg.fit(X_train, y_train)
lreg.score(X_train, y_train)

0.77829166666666671

## Evaluating a Model

In [381]:
X_test = norm.transform(credit_test.drop("default", axis=1).values)
y_test = credit_test.default.values

lreg_yhat = lreg.predict(X_test)
lreg.score(X_test, y_test)

0.78016666666666667

In [382]:
X_test

array([[  9.11240073e-01,   7.89741397e-04,  -6.07493382e-05, ...,
          3.03746691e-05,   0.00000000e+00,   0.00000000e+00],
       [  6.26542732e-01,   5.51357604e-04,   0.00000000e+00, ...,
          1.25308546e-05,   0.00000000e+00,   0.00000000e+00],
       [  9.85441568e-01,   1.80664288e-04,  -4.10600654e-06, ...,
          4.10600654e-06,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  4.18610082e-01,   8.37220164e-05,   0.00000000e+00, ...,
          0.00000000e+00,   2.32561157e-06,   0.00000000e+00],
       [  9.79385985e-01,   9.37412300e-04,   1.39912284e-05, ...,
          1.39912284e-05,   0.00000000e+00,   0.00000000e+00],
       [  9.90247323e-01,   3.09452288e-04,  -1.23780915e-05, ...,
          0.00000000e+00,   1.23780915e-05,   0.00000000e+00]])

In [368]:
with tf.Session() as sess:
    saver.restore(sess, save_path)
    test_acc = sess.run(accuracy, feed_dict={X:X_test, y:y_test})
    nn_logits = sess.run(output, feed_dict={X:X_test, y:y_test})
    nn_yhat = np.argmax(nn_logits, axis=1)
print(test_acc)

INFO:tensorflow:Restoring parameters from .tmp/credit_model.ckpt
0.780333


In [370]:
nn_logits

array([[ 0.19291919, -0.1728971 ],
       [ 0.17752418, -0.15250786],
       [ 0.19145267, -0.19433053],
       ..., 
       [ 0.16478929, -0.17826571],
       [ 0.20565552, -0.18298566],
       [ 0.20151311, -0.1928167 ]], dtype=float32)

In [366]:
print(nn_yhat[:1000])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [371]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
nn_precision = precision_score(y_test, nn_yhat)
nn_recall = recall_score(y_test, nn_yhat)
nn_accuracy = accuracy_score(y_test, nn_yhat)

lreg_precision = precision_score(y_test, lreg_yhat)
lreg_recall = recall_score(y_test, lreg_yhat)

print(nn_precision)

0.0


  'precision', 'predicted', average, warn_for)
