# Run an LSTM model for the two datasets and explain their performance (extra credits)

In [165]:
# LSTM for international airline passengers problem with regression framing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [166]:
# load the dataset
filename=['credit_risk_small_data_0.02.csv', 'credit_data_sim.csv']
data = read_csv(filename[0])
data

Unnamed: 0,Delinquency,Revolving Credit Percentage,Capital Reserves,Num Late 60,Debt Ratio,Monthly Income,Num Credit Lines,Num Late Past 90,Num Real Estate,Num Late 90,Num Employees
0,0,0.390912,53,0,0.241776,10000,9,0,1,0,2
1,0,0.037574,64,0,0.323588,10500,11,0,2,0,0
2,0,0.002058,59,0,0.458093,16500,12,0,2,0,2
3,0,1.023988,36,0,0.203108,2766,4,0,0,0,3
4,0,0.966603,61,1,0.350550,3000,4,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2400,0,0.058012,35,0,0.348095,4041,8,0,2,0,3
2401,0,0.024283,91,2,0.118773,5800,7,0,1,0,0
2402,0,0.271156,37,1,0.343996,9127,17,0,1,0,1
2403,0,0.923689,35,0,1.890141,2129,9,0,2,0,0


In [167]:
# normalize the dataset using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(data)
data = pd.DataFrame(data = X, columns = data.columns)
data

Unnamed: 0,Delinquency,Revolving Credit Percentage,Capital Reserves,Num Late 60,Debt Ratio,Monthly Income,Num Credit Lines,Num Late Past 90,Num Real Estate,Num Late 90,Num Employees
0,0.0,7.537834e-05,0.432432,0.000000,0.000063,0.083333,0.214286,0.0,0.052632,0.0,0.285714
1,0.0,7.245282e-06,0.581081,0.000000,0.000085,0.087500,0.261905,0.0,0.105263,0.0,0.000000
2,0.0,3.968078e-07,0.513514,0.000000,0.000120,0.137500,0.285714,0.0,0.105263,0.0,0.285714
3,0.0,1.974524e-04,0.202703,0.000000,0.000053,0.023050,0.095238,0.0,0.000000,0.0,0.428571
4,0.0,1.863871e-04,0.540541,0.010204,0.000092,0.025000,0.095238,0.0,0.052632,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
2400,0.0,1.118625e-05,0.189189,0.000000,0.000091,0.033675,0.190476,0.0,0.105263,0.0,0.428571
2401,0.0,4.682493e-06,0.945946,0.020408,0.000031,0.048333,0.166667,0.0,0.052632,0.0,0.000000
2402,0.0,5.228615e-05,0.216216,0.010204,0.000090,0.076058,0.404762,0.0,0.052632,0.0,0.142857
2403,0.0,1.781120e-04,0.189189,0.000000,0.000495,0.017742,0.214286,0.0,0.105263,0.0,0.000000


In [168]:
#Dealing with the imbalance issue with SMOTE
from imblearn.over_sampling import RandomOverSampler, SMOTE

delinquency = data['Delinquency']

sm = SMOTE(random_state = 33)
data, delinquency = sm.fit_resample(data, delinquency)
(n,p) = data.shape
print (n,p)

4496 11


In [169]:
#Remove label from data
labels = data['Delinquency']
X = data.iloc[:,1:p]
X

Unnamed: 0,Revolving Credit Percentage,Capital Reserves,Num Late 60,Debt Ratio,Monthly Income,Num Credit Lines,Num Late Past 90,Num Real Estate,Num Late 90,Num Employees
0,7.537834e-05,0.432432,0.000000,0.000063,0.083333,0.214286,0.000000,0.052632,0.000000,0.285714
1,7.245282e-06,0.581081,0.000000,0.000085,0.087500,0.261905,0.000000,0.105263,0.000000,0.000000
2,3.968078e-07,0.513514,0.000000,0.000120,0.137500,0.285714,0.000000,0.105263,0.000000,0.285714
3,1.974524e-04,0.202703,0.000000,0.000053,0.023050,0.095238,0.000000,0.000000,0.000000,0.428571
4,1.863871e-04,0.540541,0.010204,0.000092,0.025000,0.095238,0.000000,0.052632,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
4491,2.525121e-04,0.660205,0.020274,0.000286,0.049510,0.284774,0.000537,0.367035,0.010204,0.000000
4492,1.925800e-04,0.188908,0.020337,0.000166,0.036018,0.118553,0.030400,0.052267,0.010133,0.142857
4493,3.914688e-07,0.121787,0.000042,0.000036,0.034175,0.095530,0.030487,0.000430,0.010162,0.142857
4494,6.448980e-05,0.268249,0.002893,0.000148,0.099166,0.407951,0.000000,0.128054,0.000000,0.000000


In [170]:
#80-20 split
train_data, test_data, train_data_delinquency, test_data_delinquency = train_test_split(X, labels,test_size=0.2, random_state=4,shuffle = True)#

print (train_data.shape, test_data.shape)

(3596, 10) (900, 10)


In [171]:
features_set = train_data
labels = train_data_delinquency

In [172]:
labels

3822    1.0
2171    0.0
523     0.0
3692    1.0
2001    1.0
       ... 
3671    1.0
709     0.0
2487    1.0
174     0.0
1146    0.0
Name: Delinquency, Length: 3596, dtype: float64

In [173]:
features_set

Unnamed: 0,Revolving Credit Percentage,Capital Reserves,Num Late 60,Debt Ratio,Monthly Income,Num Credit Lines,Num Late Past 90,Num Real Estate,Num Late 90,Num Employees
3822,0.000086,0.337285,0.007757,0.000219,0.053596,0.357143,0.000000,0.132656,0.000000,0.000000
2171,0.000021,0.202703,0.000000,0.000103,0.040000,0.119048,0.000000,0.052632,0.000000,0.428571
523,0.000006,0.445946,0.000000,0.000011,0.087500,0.214286,0.000000,0.000000,0.000000,0.285714
3692,0.000158,0.194501,0.016220,0.000048,0.030678,0.085466,0.002005,0.020687,0.012209,0.571429
2001,0.000185,0.378378,0.000000,0.000171,0.067358,0.285714,0.000000,0.105263,0.000000,0.142857
...,...,...,...,...,...,...,...,...,...,...
3671,0.000114,0.338155,0.001438,0.000110,0.037418,0.143976,0.011163,0.105263,0.000240,0.142857
709,0.000193,0.635135,0.000000,0.000010,0.061850,0.095238,0.000000,0.000000,0.000000,0.000000
2487,0.000095,0.391022,0.009547,0.000108,0.033316,0.162070,0.000657,0.049244,0.000000,0.000000
174,0.000078,0.243243,0.000000,0.000165,0.028900,0.238095,0.000000,0.052632,0.000000,0.285714


In [174]:
features_set, labels = np.array(features_set), np.array(labels)

In [175]:
#Converting the data into the 3-D Format accepted by the LSTM
features_set = np.reshape(features_set, (features_set.shape[0], features_set.shape[1], 1))
features_set.shape

(3596, 10, 1)

In [176]:
#Creating the model
model = Sequential()
#Creating LSTM and Dropout Layers
model.add(LSTM(units=50, return_sequences=True, input_shape=(features_set.shape[1], 1)))
model.add(Dropout(0.2))

model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(units=50))
model.add(Dropout(0.2))

model.add(Dense(units = 1))

In [177]:
#model Compilation
model.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [178]:
#Training
model.fit(features_set, labels, epochs = 100, batch_size = 32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1451881c0>

In [179]:
#convert the test data into the LSTM format
test_data, test_data_delinquency = np.array(test_data), np.array(test_data_delinquency)

test_data = np.reshape(test_data, (test_data.shape[0], test_data.shape[1], 1))
test_data.shape

(900, 10, 1)

In [180]:
predictedDelinquency = model.predict(test_data)

predictedDelinquency.shape

(900, 1)

In [181]:
test_data_delinquency.shape

(900,)

In [182]:
from sklearn.metrics import r2_score

r2 = r2_score(test_data_delinquency, predictedDelinquency)

In [183]:
r2

0.1400230338147973

In [184]:
#Dataset 2

In [185]:
data = read_csv(filename[1])
data

Unnamed: 0,Working capital / Total Assets (WC_TA),Retained Earnings / Total Assets (RE_TA),Earnings Before Interests and Taxes / Total Assets (EBIT_TA),Market Value of Equity / Book Value of Total Debt (MVE_BVTD),Sales / Total Assets (S_TA),Industry sector labels from 1-12,Credit status
0,0.121,0.413,0.057,3.647,0.466,12.0,1.0
1,0.470,0.752,0.070,11.596,1.120,8.0,1.0
2,0.110,0.337,0.045,3.835,0.812,4.0,1.0
3,0.511,0.869,0.106,8.538,0.732,2.0,1.0
4,0.202,0.460,0.064,3.685,0.682,4.0,1.0
...,...,...,...,...,...,...,...
1709,,,,,,,
1710,,,,,,,
1711,,,,,,,
1712,,,,,,,


In [186]:
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(data)
data = pd.DataFrame(data = X, columns = data.columns)
data

Unnamed: 0,Working capital / Total Assets (WC_TA),Retained Earnings / Total Assets (RE_TA),Earnings Before Interests and Taxes / Total Assets (EBIT_TA),Market Value of Equity / Book Value of Total Debt (MVE_BVTD),Sales / Total Assets (S_TA),Industry sector labels from 1-12,Credit status
0,0.765276,0.731264,0.808989,0.030447,0.062000,1.000000,1.0
1,0.878112,0.798299,0.825218,0.097230,0.155429,0.636364,1.0
2,0.761720,0.716235,0.794007,0.032026,0.111429,0.272727,1.0
3,0.891368,0.821436,0.870162,0.071538,0.100000,0.090909,1.0
4,0.791465,0.740558,0.817728,0.030766,0.092857,0.272727,1.0
...,...,...,...,...,...,...,...
1709,,,,,,,
1710,,,,,,,
1711,,,,,,,
1712,,,,,,,


In [187]:
data = data.dropna() #dropping NaN's
data

Unnamed: 0,Working capital / Total Assets (WC_TA),Retained Earnings / Total Assets (RE_TA),Earnings Before Interests and Taxes / Total Assets (EBIT_TA),Market Value of Equity / Book Value of Total Debt (MVE_BVTD),Sales / Total Assets (S_TA),Industry sector labels from 1-12,Credit status
0,0.765276,0.731264,0.808989,0.030447,0.062000,1.000000,1.0
1,0.878112,0.798299,0.825218,0.097230,0.155429,0.636364,1.0
2,0.761720,0.716235,0.794007,0.032026,0.111429,0.272727,1.0
3,0.891368,0.821436,0.870162,0.071538,0.100000,0.090909,1.0
4,0.791465,0.740558,0.817728,0.030766,0.092857,0.272727,1.0
...,...,...,...,...,...,...,...
1665,0.706111,0.509788,0.780275,0.000706,0.023571,0.909091,0.0
1666,0.446169,0.510777,0.600499,0.009124,0.046714,1.000000,0.0
1667,0.678629,0.628436,0.744070,0.003159,0.019714,0.727273,0.0
1668,0.656644,0.453233,0.731586,0.001319,0.006143,0.454545,0.0


In [188]:
#since the data is imbalanced
#using SMOTE to fix the problem
creditstatus = data['Credit status']
sm = SMOTE(random_state = 33)
data, creditstatus = sm.fit_resample(data, creditstatus)

print (data.shape, creditstatus.shape)

(3080, 7) (3080,)


In [189]:
#Remove label from data
(n,p) = data.shape
X = data.iloc[:, :p-1]
X.shape

(3080, 6)

In [190]:
#Splitting the data- 80-20 split
train_data, test_data, train_data_creditstatus, test_data_creditstatus = train_test_split(X, creditstatus,test_size=0.2, random_state=42)

print (train_data.shape)

(2464, 6)


In [191]:
features_set = train_data
labels = train_data_creditstatus
features_set, labels = np.array(features_set), np.array(labels)

features_set

array([[6.97816871e-01, 5.16685799e-01, 7.46101360e-01, 5.60042142e-04,
        8.46711602e-03, 8.18181818e-01],
       [7.02694511e-01, 4.89577252e-01, 7.69053093e-01, 6.44720391e-04,
        3.62587067e-03, 2.72727273e-01],
       [6.83810853e-01, 5.34633805e-01, 7.43925100e-01, 1.61915625e-03,
        1.17908068e-02, 7.27272727e-01],
       ...,
       [8.46750727e-01, 7.39766660e-01, 8.42696629e-01, 1.33751166e-02,
        3.10000000e-02, 9.09090909e-02],
       [8.07953443e-01, 7.37789203e-01, 8.20224719e-01, 1.18880590e-02,
        3.88571429e-02, 9.09090909e-02],
       [8.23472357e-01, 7.38184694e-01, 8.21473159e-01, 2.44902417e-02,
        5.12857143e-02, 2.72727273e-01]])

In [192]:
#Converting the data into the 3-D Format accepted by the LSTM
features_set = np.reshape(features_set, (features_set.shape[0], features_set.shape[1], 1))
features_set.shape

(2464, 6, 1)

In [193]:
#Creating the model
model = Sequential()
#Creating LSTM and Dropout Layers
model.add(LSTM(units=50, return_sequences=True, input_shape=(features_set.shape[1], 1)))
model.add(Dropout(0.2))

model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(units=50))
model.add(Dropout(0.2))

model.add(Dense(units = 1))

In [194]:
#model Compilation
model.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [195]:
#Training
model.fit(features_set, labels, epochs = 100, batch_size = 32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x14a437250>

In [196]:
#convert the test data into the LSTM format
test_data, test_data_creditstatus = np.array(test_data), np.array(test_data_creditstatus)

test_data = np.reshape(test_data, (test_data.shape[0], test_data.shape[1], 1))
test_data.shape

(616, 6, 1)

In [197]:
predictedCreditStatus = model.predict(test_data)

predictedCreditStatus.shape

(616, 1)

In [198]:
from sklearn.metrics import r2_score

r2 = r2_score(test_data_creditstatus, predictedCreditStatus)

In [199]:
r2

0.9685213162444034