Loan Default Prediction Competition submission
Christian Magpantay

### Install files needed and mRMR libraries

In [None]:
# https://www.tensorflow.org/neural_structured_learning
# https://www.machinecurve.com/index.php/2021/01/07/build-an-lstm-model-with-tensorflow-and-keras/
# https://www.tensorflow.org/tutorials/images/cnn
!pip install -q kaggle
from google.colab import files
files.upload()
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Force-reinstall the Kaggle CLI (leaving its dependencies untouched), then
# download the loan-default-prediction competition files into the working directory.
!pip install --upgrade --force-reinstall --no-deps kaggle
!kaggle competitions download -c loan-default-prediction

In [None]:
!unzip loan-default-prediction.zip 

In [None]:
!unzip test_v2.csv.zip
!unzip train_v2.csv.zip

In [None]:
# install for mRMR
# numpy and Cython are installed first, presumably as build requirements of
# pymrmr -- TODO confirm. Versions are not pinned.
!pip install numpy Cython
!pip install -U pymrmr

### Read in csv files into pandas

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Raw test set, read from the unzipped competition file. It is kept as loaded;
# the only later change to it in this notebook is the added 'loss' column.
tes_d = pd.read_csv('test_v2.csv')

In [None]:
# read data
# Raw training set; the cells below clean it and reduce its rows and columns.
tra_d = pd.read_csv('train_v2.csv')
# (rows, columns) of the raw training frame
print(tra_d.shape)

### Data cleaning by removing categorical columns

In [None]:
# Replace missing values with zeros: the literal string 'NA' first, then real NaNs.
tra_d = tra_d.replace(['NA'], 0)
tra_d = tra_d.replace([np.nan], 0)
# The original also called tra_d.dropna() and discarded the result. With every
# NaN already zero-filled it could not remove anything, so the call is gone.

# Collect every column (skipping the first one, 'id') for which pandas cannot
# compute a mean and a standard deviation; those are treated as categorical.
nonNumFeats = []
for col in tra_d.columns[1:]:
    try:
        tra_d[col].mean()
        tra_d[col].std()
    except (TypeError, ValueError):
        nonNumFeats.append(col)

# Drop the categorical columns. Rebinding instead of inplace=True keeps the
# cell re-runnable: a second run finds nothing left to drop.
tra_d = tra_d.drop(columns=nonNumFeats)

In [None]:
# Shape after the non-numeric columns were dropped.
print(tra_d.shape)

### Downsample the zero-loss rows (keeping only a few hundred) to prepare the data for discretizing

In [None]:
# Build a working set of at most 10,000 rows: every row whose loss is non-zero,
# topped up with the first zero-loss rows (in file order) until 10,000 is reached.
loss_is_zero = tra_d['loss'] == 0

# index labels of the rows with a non-zero loss
nonZeroesLoss = tra_d.index[~loss_is_zero].tolist()

# Number of zero-loss rows still needed. Clamped at 0: with the original
# `x == limit` test a negative limit never matched, so *every* zero-loss row
# was appended when there were already more than 10,000 non-zero rows.
limit = max(0, 10000 - len(nonZeroesLoss))
zeroLoss = tra_d.index[loss_is_zero][:limit].tolist()

# create a list that has the rows selected for mRMR
# and sort it in ascending order
selec_rows = sorted(nonZeroesLoss + zeroLoss)

# Keep only the selected rows. Index labels are used throughout (not 0..n-1
# positions), so re-running the cell on the already reduced frame still works.
tra_d = tra_d.loc[selec_rows]
print(tra_d.shape)

# create a copy for discretizing
tra_discretized = tra_d.copy(deep=True)
print(tra_discretized.head())

# reduce dataset from 100,000+ -> 10,000

In [None]:
# Discretize every feature column (all columns except the first, 'id', and the
# last, 'loss') into three levels based on the column's own mean and std:
#   left edge  = mean - std
#   right edge = mean + std
#   value below the left edge        -> -2
#   value above the right edge       ->  2
#   value between the edges (incl.)  ->  0
# The three masks are computed from the untouched column before anything is
# written, and each column is updated with three vectorized assignments. The
# original visited every (row, column) cell with .at, i.e. millions of scalar
# reads/writes. Every row of tra_discretized is a selected row, so covering
# the whole column is equivalent to looping over selec_rows.
for col in tra_discretized.columns[1:-1]:
    values = tra_discretized[col]
    col_mu = values.mean()
    col_std = values.std()
    left_edge = col_mu - col_std
    right_edge = col_mu + col_std

    below = values < left_edge
    between = (values >= left_edge) & (values <= right_edge)
    above = values > right_edge

    tra_discretized.loc[below, col] = -2
    tra_discretized.loc[between, col] = 0
    tra_discretized.loc[above, col] = 2

print(tra_discretized.head())

   id  f1  f2   f3  f4  f5  f6  ...  f773  f774  f775  f776  f777  f778  loss
0   1   0   2  0.0  -2   0   0  ...   0.0   0.0   0.0     2     0     0     0
1   2   0   2  0.0  -2   0   2  ...   0.0   0.0   0.0     2     0     0     0
2   3   0   2  0.0  -2   0   2  ...   0.0   0.0   0.0     2     0     0     0
3   4   0   2  0.0  -2   0   2  ...   0.0   0.0   0.0     2     0     0     0
4   5  -2   0  0.0   0   0   0  ...   0.0   0.0   0.0     0     0     0     0

[5 rows x 752 columns]


In [None]:
# Shape of the discretized copy (same rows/columns as tra_d at this point).
print(tra_discretized.shape)

(10000, 752)


### Run mRMR and drop the 23 columns it returns

In [None]:
# create a list from the returned list of pymrmr
# use list to remove cols
# The frame passed to mRMR excludes the first ('id') and last ('loss') columns;
# 'MID' is the selection scheme and 23 the number of column names requested.
# NOTE(review): the pymrmr README appears to treat the FIRST column of the
#   input frame as the class/target variable -- confirm. As written the first
#   column is a feature and 'loss' is not part of the input at all.
# NOTE(review): the 23 returned columns are DROPPED from tra_d rather than
#   kept. The later reshape to 27x27 relies on the resulting 729 columns, so
#   changing this requires changing the CNN input shape as well.
import pymrmr
selec_feats = pymrmr.mRMR(tra_discretized[tra_discretized.columns[1:-1]], 'MID', 23)
tra_d.drop(selec_feats[0:], axis = 1, inplace = True)
print(tra_d.shape)

(10000, 729)


### Prepare the discretized table as input for the CNN

In [None]:
# One-column frame holding the 'loss' label of every selected row,
# indexed by the selected row labels.
selected_labels = tra_d.reindex(index=list(selec_rows), columns=['loss'])

In [None]:
# Remove the mRMR-returned columns from the discretized frame as well, so it
# has the same columns as tra_d.
tra_discretized = tra_discretized.drop(columns=list(selec_feats))
# Independent deep copy of the reduced, non-discretized training frame.
select_feats_df = tra_d.copy(deep=True)

In [None]:
#  Prepare data.
X_train = np.array(tra_discretized)
y_train = selected_labels
print(X_train)

### For visuals with tensorboard

In [None]:
# Load the TensorBoard notebook extension so %tensorboard can be used later.
%load_ext tensorboard

In [None]:
# Labels for the classifier.
# The network in this notebook ends in Dense(101, softmax) and is compiled
# with 'sparse_categorical_crossentropy', which expects each label to be an
# integer class index. The original cell divided the loss by 100 and wrote a
# formatted *string* back into the float column, producing fractional labels
# in [0, 1] that cannot serve as class indices (and, depending on the pandas
# version, an object-dtype column). The loss value itself is therefore used
# as the class index.
# Built from selected_labels rather than from y_train so that re-running the
# cell always yields the same result.
y_train = selected_labels.astype('int64')

# Every label must address one of the 101 output units.
assert y_train['loss'].between(0, 100).all(), "loss labels must lie in 0..100"

### CNNs

In [None]:
import datetime
import tensorflow as tf
from tensorflow import keras
from keras import layers
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_step(model, optimizer, x_train, y_train):
  """Apply one optimizer update to `model` using a single batch.

  Args:
    model: callable model exposing `trainable_variables`.
    optimizer: object exposing `apply_gradients`.
    x_train: batch of inputs passed to the model.
    y_train: targets for the batch.

  Reads three module-level names that are not defined in the visible part of
  this notebook and must exist before the function is called: `loss_object`
  (computes the batch loss), `train_loss` and `train_accuracy` (called with
  the loss / the targets and outputs).
  """
  with tf.GradientTape() as tape:
    # Forward pass in training mode, recorded on the tape.
    outputs = model(x_train, training=True)
    batch_loss = loss_object(y_train, outputs)

  # Backward pass and parameter update.
  weights = model.trainable_variables
  gradients = tape.gradient(batch_loss, weights)
  optimizer.apply_gradients(zip(gradients, weights))

  # Feed the running metrics.
  train_loss(batch_loss)
  train_accuracy(y_train, outputs)

# Lay the 729 columns of every sample out as a 27x27 single-channel "image".
X_train = X_train.reshape(len(X_train),27,27,1)

# Three stages of two (1x1 conv + batch-norm) pairs followed by dropout, with
# 16, 32 and 64 filters. Same layers, in the same order, as the original
# hand-unrolled sequence of model.add(...) calls.
model = keras.models.Sequential()
first_layer_kwargs = {'input_shape': (27,27,1)}  # only the very first layer declares the input shape
for n_filters in (16, 32, 64):
    for _ in range(2):
        model.add(layers.Conv2D(n_filters, (1,1), activation='relu', **first_layer_kwargs))
        first_layer_kwargs = {}
        model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.25))

model.add(layers.Flatten())
model.add(layers.Dense(101, activation="softmax")) # labels

model.summary()

# The original built this optimizer but then passed the string 'sgd' to
# compile(), leaving `op` unused. Passing the object makes the learning rate
# that is written here the one that is actually applied.
op = tf.keras.optimizers.SGD(learning_rate = 0.01)
# `metrics` is given as a list; a bare string is not accepted by every Keras version.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=op,
              metrics=['accuracy'])

# One timestamped log directory per run, for TensorBoard.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(X_train, y_train, epochs=1,
                    batch_size=16, callbacks=[tensorboard_callback])


In [None]:
# Open TensorBoard on the log directory written by the training cell.
%tensorboard --logdir logs

### Prep for submission

In [None]:
print(tes_d.shape)
# The submission cell reads a 'loss' column from tes_d, so it is still added here.
tes_d['loss'] = 0

# The network was trained on the 729 columns of tra_discretized (27 * 27).
# The raw test frame was never reduced to those columns, never zero-filled
# and never discretized, so reshaping it to (n, 27, 27, 1) cannot succeed.
# Rebuild the test inputs with the same steps used for training.
train_cols = list(tra_discretized.columns)
feat_cols = train_cols[1:-1]          # everything except 'id' (first) and 'loss' (last)

# Same columns in the same order as training; non-numeric or missing values become 0.
tes_feats = (
    tes_d.reindex(columns=train_cols)
         .apply(pd.to_numeric, errors='coerce')
         .fillna(0)
)

# Discretize with the *training* statistics: tra_d still holds the raw
# (non-discretized) values of exactly these columns.
col_mu = tra_d[feat_cols].mean()
col_std = tra_d[feat_cols].std()
raw_feats = tes_feats[feat_cols]
below = raw_feats.lt(col_mu - col_std, axis='columns')
above = raw_feats.gt(col_mu + col_std, axis='columns')
# below -> -2, above -> 2, everything else -> 0 (int8 keeps the temporary small)
tes_feats[feat_cols] = 2 * (above.astype('int8') - below.astype('int8'))

# NOTE(review): as in training, the 'id' and 'loss' columns are part of the
# 729 inputs; 'loss' is the training target, so this looks like label leakage.
X_test = tes_feats.to_numpy(dtype='float32').reshape(len(tes_feats),27,27,1)
y_model = model.predict(X_test)
print(y_model)

In [None]:
print(y_model.shape)
# Each row of y_model holds one softmax probability per class; the predicted
# class is the index of the largest one.
# The original only recorded a class whose probability was *exactly* 1.0
# (every other row silently stayed 0) and stored j-1, which maps class 0 to
# -1. With sparse categorical labels the class index is the label itself, so
# the index is used directly.
# NOTE(review): confirm that class index j is meant to stand for loss j.
y_pred = np.argmax(y_model, axis=1).tolist()


In [None]:
# Number of predictions; y_pred has one entry per row of y_model.
print(len(y_pred))

### Predictions for testing data

In [None]:
# Build the submission table: one predicted loss per test id.
# The original filled the column row by row through chained indexing
# (res_df['loss'].iat[i] = ...) and stopped three rows early, so the last
# three ids always kept the placeholder 0. Building the frame in one step
# assigns every row and raises if the number of predictions and test rows differ.
res_df = pd.DataFrame({'id': tes_d['id'].to_numpy(), 'loss': y_pred})
print(res_df)
sample_submission = res_df[['id','loss']]
sample_submission.to_csv('sample_submission_rf.csv', index = False)