## Import Libraries

In [352]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

## Load Datasets

In [362]:
# load dataset
data_set1 = pd.read_csv('dataset/credit_train.csv', header=0)
data_set2 = pd.read_csv('dataset/credit_test.csv', header=0)

In [363]:
data_set1 = data_set1.iloc[:100000]
data_set2 = data_set2.iloc[:10000]

In [364]:
df = data_set1.append(data_set2, ignore_index=True)

## Preprocessing - Drop Columns

In [365]:
# Drop irrelevant columns
drop_cols = ['Loan ID', 'Customer ID', 'Purpose','Monthly Debt',
             'Number of Open Accounts','Maximum Open Credit','Bankruptcies','Tax Liens']

In [366]:
df.drop(drop_cols, axis=1, inplace=True)

## Preprocessing - Drop NaN

In [367]:
df = df[pd.notnull(df['Credit Score'])]
df.isnull().sum()

Annual Income                       0
Credit Score                        0
Current Credit Balance              0
Current Loan Amount                 0
Home Ownership                      0
Loan Status                      8019
Months since last delinquent    46935
Number of Credit Problems           0
Term                                0
Years in current job             3754
Years of Credit History             0
dtype: int64

# Turning Credit Score into Binary

In [368]:
df['Credit Score'].head()

0     709.0
2     741.0
3     721.0
5    7290.0
6     730.0
Name: Credit Score, dtype: float64

In [369]:
ACCEPT = 1
DENY = 0
CUTOFF = df['Credit Score'].median()


# Adding a column for y
df['Credit Score'] = df['Credit Score'].apply(lambda x: 1 if x > CUTOFF else 0)
df.head()

Unnamed: 0,Annual Income,Credit Score,Current Credit Balance,Current Loan Amount,Home Ownership,Loan Status,Months since last delinquent,Number of Credit Problems,Term,Years in current job,Years of Credit History
0,1167493.0,0,228190.0,445412.0,Home Mortgage,Fully Paid,,1.0,Short Term,8 years,17.2
2,2231892.0,1,297996.0,99999999.0,Own Home,Fully Paid,29.0,1.0,Short Term,8 years,14.9
3,806949.0,0,256329.0,347666.0,Own Home,Fully Paid,,0.0,Long Term,3 years,12.0
5,896857.0,1,215308.0,206602.0,Home Mortgage,Charged Off,,0.0,Short Term,10+ years,17.3
6,1184194.0,1,122170.0,217646.0,Home Mortgage,Fully Paid,10.0,1.0,Short Term,< 1 year,19.6


In [370]:
y, X = df['Credit Score'], df.drop(['Credit Score'], axis=1)
X.columns

Index(['Annual Income', 'Current Credit Balance', 'Current Loan Amount',
       'Home Ownership', 'Loan Status', 'Months since last delinquent',
       'Number of Credit Problems', 'Term', 'Years in current job',
       'Years of Credit History'],
      dtype='object')

In [371]:
y = pd.Series(y.loc[:].fillna('').values.astype('str').tolist()).str.join('')
X = pd.Series(X.loc[:].fillna('').values.astype('str').tolist()).str.join('')

In [372]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [373]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2)

# Modeling

In [374]:
# Vectorize
cvec = CountVectorizer()
cvec.fit(X_train)
X_train_vec = cvec.transform(X_train)
X_test_vec = cvec.transform(X_test)
X_valid_vec = cvec.transform(X_valid)

In [375]:
X_train_vec

<56873x74470 sparse matrix of type '<class 'numpy.int64'>'
	with 506658 stored elements in Compressed Sparse Row format>

In [381]:
# Modeling
def simpleNN():
    model = Sequential()
    model.add(Dense(100, input_dim=input_dim, activation='relu', use_bias=True))
    model.add(Dropout(rate=0.6))
    model.add(Dense(output_dim, activation='hard_sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

def multilayerNN():
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu', use_bias=True))
    model.add(Dropout(rate=0.6))
    model.add(Dense(64, activation='relu', use_bias=True))
    model.add(Dropout(rate=0.6))
    model.add(Dense(32, activation='relu', use_bias=True))
    model.add(Dropout(rate=0.6))
    model.add(Dense(output_dim, activation='hard_sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [384]:
# Parameters
input_dim = X_train_vec.shape[1]
output_dim = 1
model = multilayerNN()
epochs = 5
batch_size = 256

In [385]:
history = model.fit(X_train_vec, y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data = (X_valid_vec, y_valid),
                    batch_size=batch_size)
print(model.summary())

loss, accuracy = model.evaluate(X_train_vec, y_train, verbose=False)
print('Training Accuracy: {:.4f}'.format(accuracy))
loss, accuracy = model.evaluate(X_valid_vec, y_valid, verbose=False)
print('Validation Accuracy: {:.4f}'.format(accuracy))
loss, accuracy = model.evaluate(X_test_vec, y_test, verbose=False)
print('Testing Accuracy: {:.4f}'.format(accuracy))

Train on 56873 samples, validate on 14219 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 128)               9532288   
_________________________________________________________________
dropout_18 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_36 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_19 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_37 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_20 (Dropout)         (None, 32)                0         
__________________________________________

In [311]:
seed = 7
np.random.seed(seed)
estimator = KerasClassifier(build_fn=simpleNN,
                            epochs=epochs,
                            batch_size=batch_size,
                            verbose=1)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train_vec, y_train, cv=kfold)
print('K-fold Validation Results: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
11264/56874 [====>.........................] - ETA: 1:13 - loss: 0.4797 - acc: 0.7882

KeyboardInterrupt: 