## Import Libraries

In [244]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

## Load Datasets

In [222]:
# load dataset
data_set1 = pd.read_csv('dataset/credit_train.csv', header=0)
data_set2 = pd.read_csv('dataset/credit_test.csv', header=0)

In [223]:
data_set1 = data_set1.iloc[:100000]
data_set2 = data_set2.iloc[:10000]

In [224]:
df = data_set1.append(data_set2, ignore_index=True)

## Preprocessing - Drop Columns

In [225]:
# Drop irrelevant columns
drop_cols = ['Loan ID', 'Customer ID', 'Purpose','Monthly Debt',
             'Number of Open Accounts','Maximum Open Credit','Bankruptcies','Tax Liens']

In [226]:
df.drop(drop_cols, axis=1, inplace=True)

## Preprocessing - Drop NaN

In [227]:
df = df.dropna()
df.isnull().sum()

Annual Income                   0
Credit Score                    0
Current Credit Balance          0
Current Loan Amount             0
Home Ownership                  0
Loan Status                     0
Months since last delinquent    0
Number of Credit Problems       0
Term                            0
Years in current job            0
Years of Credit History         0
dtype: int64

# Turning Credit Score into Binary

In [228]:
df['Credit Score'].head()

2     741.0
6     730.0
8     678.0
10    728.0
12    740.0
Name: Credit Score, dtype: float64

In [229]:
ACCEPT = 1
DENY = 0
CUTOFF = df['Credit Score'].median()


# Adding a column for y
df['Accept'] = df['Credit Score'] > CUTOFF
df.head()

Unnamed: 0,Annual Income,Credit Score,Current Credit Balance,Current Loan Amount,Home Ownership,Loan Status,Months since last delinquent,Number of Credit Problems,Term,Years in current job,Years of Credit History,Accept
2,2231892.0,741.0,297996.0,99999999.0,Own Home,Fully Paid,29.0,1.0,Short Term,8 years,14.9,True
6,1184194.0,730.0,122170.0,217646.0,Home Mortgage,Fully Paid,10.0,1.0,Short Term,< 1 year,19.6,True
8,2559110.0,678.0,437171.0,548746.0,Rent,Fully Paid,33.0,0.0,Short Term,2 years,22.6,False
10,714628.0,728.0,203965.0,99999999.0,Rent,Fully Paid,76.0,0.0,Short Term,3 years,16.0,True
12,776188.0,740.0,134083.0,99999999.0,Own Home,Fully Paid,25.0,0.0,Short Term,< 1 year,8.5,True


In [230]:
y, X = df['Accept'], df.drop(['Accept', 'Credit Score'], axis=1)

In [231]:
y = pd.Series(y.loc[:].fillna('').values.astype('str').tolist()).str.join('')
X = pd.Series(X.loc[:].fillna('').values.astype('str').tolist()).str.join('')

In [232]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [234]:
X_train.head()

36286    1279745.098648.0437910.0Home MortgageFully Pai...
11838    1745226.0304931.0359260.0Own HomeFully Paid23....
16774    869288.0159467.0214720.0RentFully Paid59.00.0S...
35992    1090372.0130093.099999999.0Own HomeFully Paid4...
31049    1772700.0602015.0194458.0Home MortgageFully Pa...
dtype: object

# Modeling

In [237]:
# Vectorize
cvec = CountVectorizer()
cvec.fit(X_train)
X_train_vec = cvec.transform(X_train)
X_test_vec = cvec.transform(X_test)

In [238]:
X_train_vec

<29268x47393 sparse matrix of type '<class 'numpy.int64'>'
	with 280910 stored elements in Compressed Sparse Row format>

In [242]:
# Modeling
def simpleNN():
    model = Sequential()
    model.add(Dense(100, input_dim=input_dim, activation='relu', use_bias=True))
    model.add(Dropout(rate=0.6))
    model.add(Dense(output_dim, activation='hard_sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [245]:
input_dim = X_train_vec.shape[1]
output_dim = 1
model = simpleNN()

seed = 7
np.random.seed(seed)
estimator = KerasClassifier(build_fn=simpleNN,
                            epochs=5,
                            batch_size=256,
                            verbose=1)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train_vec, y_train, cv=kfold)
print('K-fold Validation Results: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
K-fold Validation Results: 68.64% (0.58%)
