In [1]:
#@title Import Relevant Libraries
import pandas as pd
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

print('Imported libraries')

Imported libraries


In [2]:
#@title create model

def create_model(my_learning_rate, my_feature_layer):
  """Create and compile a simple linear regression model."""
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()

  # Add the layer containing the feature columns to the model.
  model.add(my_feature_layer)

  # Describe the topography of the model by calling the tf.keras.layers.Dense
  # method once for each layer. We've specified the following arguments:
  #   * units specifies the number of nodes in this layer.
  #   * activation specifies the activation function (Rectified Linear Unit).
  #   * name is just a string that can be useful when debugging.

  # Define the first hidden layer with 20 nodes.   
  model.add(tf.keras.layers.Dense(units=20, 
                                  activation='relu', 
                                  name='Hidden1'))
  
#   # Define the second hidden layer with 12 nodes. 
#   model.add(tf.keras.layers.Dense(units=12, 
#                                   activation='relu', 
#                                   name='Hidden2'))
  
  # Define the output layer.
  model.add(tf.keras.layers.Dense(units=1,  
                                  name='Output'))                              
  
  model.compile(optimizer=tf.keras.optimizers.Adam(lr=my_learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.MeanSquaredError()])

  return model

print('Defined Model Creator')

Defined Model Creator


In [3]:
#@title create model trainer

def train_model(model, dataset, epochs, label_name,
                batch_size=None):
  """Train the model by feeding it data."""

  # Split the dataset into features and label.
  features = {name:np.array(value) for name, value in dataset.items()}
  label = np.array(features.pop(label_name))
  history = model.fit(x=features, y=label, batch_size=batch_size,
                      epochs=epochs, shuffle=True) 

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch
  
  # To track the progression of training, gather a snapshot
  # of the model's mean squared error at each epoch. 
  hist = pd.DataFrame(history.history)
  mse = hist["mean_squared_error"]

  return epochs, mse

print('Defined model trainer')

Defined model trainer


In [4]:
#@title Define the plotting function.

def plot_the_loss_curve(epochs, mse):
  """Plot a curve of loss vs. epoch."""

  plt.figure()
  plt.xlabel("Epoch")
  plt.ylabel("Mean Squared Error")

  plt.plot(epochs, mse, label="Loss")
  plt.legend()
  plt.ylim([mse.min()*0.95, mse.max() * 1.03])
  plt.show()  

print("Defined the plot_the_loss_curve function.")

Defined the plot_the_loss_curve function.


In [5]:
#@title Extract dataframe from csv
df_train = pd.read_csv(filepath_or_buffer="./Dataset/credit_train.csv")
df_test = pd.read_csv(filepath_or_buffer="./Dataset/credit_test.csv")

print('Extracted data from csv')

Extracted data from csv


In [6]:

print("############ TRAIN_DATA #############")
df_train.info()
print()
print("############ TEST_DATA #############")
df_test.info()

############ TRAIN_DATA #############
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100514 entries, 0 to 100513
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       100000 non-null  object 
 1   Customer ID                   100000 non-null  object 
 2   Loan Status                   100000 non-null  object 
 3   Current Loan Amount           100000 non-null  float64
 4   Term                          100000 non-null  object 
 5   Credit Score                  80846 non-null   float64
 6   Annual Income                 80846 non-null   float64
 7   Years in current job          95778 non-null   object 
 8   Home Ownership                100000 non-null  object 
 9   Purpose                       100000 non-null  object 
 10  Monthly Debt                  100000 non-null  float64
 11  Years of Credit History       100000 non-null  float64
 12  Months

In [7]:
print("############ TRAIN_DATA #############")
df_train = df_train[df_train['Credit Score'].notna()]
df_train = df_train[df_train['Annual Income'].notna()]
df_train = df_train[df_train['Bankruptcies'].notna()]
df_train = df_train[df_train['Tax Liens'].notna()]
df_train = df_train[df_train['Years in current job'].notna()]
df_train = df_train[df_train['Maximum Open Credit'].notna()]
df_train['Months since last delinquent'].fillna(value=0, inplace=True)
df_train.info()

print()
print("############ TEST_DATA #############")
df_test = df_test[df_test['Credit Score'].notna()]
df_test = df_test[df_test['Annual Income'].notna()]
df_test = df_test[df_test['Bankruptcies'].notna()]
df_test = df_test[df_test['Tax Liens'].notna()]
df_test = df_test[df_test['Years in current job'].notna()]
df_test = df_test[df_test['Maximum Open Credit'].notna()]
df_test['Months since last delinquent'].fillna(value=0, inplace=True)
df_test.drop(columns=['Loan ID', 'Customer ID'])
df_test.info()

############ TRAIN_DATA #############
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77271 entries, 0 to 99998
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Loan ID                       77271 non-null  object 
 1   Customer ID                   77271 non-null  object 
 2   Loan Status                   77271 non-null  object 
 3   Current Loan Amount           77271 non-null  float64
 4   Term                          77271 non-null  object 
 5   Credit Score                  77271 non-null  float64
 6   Annual Income                 77271 non-null  float64
 7   Years in current job          77271 non-null  object 
 8   Home Ownership                77271 non-null  object 
 9   Purpose                       77271 non-null  object 
 10  Monthly Debt                  77271 non-null  float64
 11  Years of Credit History       77271 non-null  float64
 12  Months since last deli

In [8]:
df_train.drop(columns=['Loan ID', 'Customer ID'])
df_train['Loan Status'].replace({'Charged Off':0, 'Fully Paid':1}, inplace=True)
df_train['Term'].replace({'Short Term':0, 'Long Term':1}, inplace=True)
df_train['Years in current job'].replace({'< 1 year':0,
                                          '1 year':1,
                                          '2 years':2,
                                          '3 years':3,
                                          '4 years':4,
                                          '5 years':5,
                                          '6 years':6,
                                          '7 years':7,
                                          '8 years':8,
                                          '9 years':9,
                                          '10+ years': 10},
                                        inplace=True)
df_train['Home Ownership'].replace({'HaveMortgage':1,
                                    'Home Mortgage':2,
                                    'Own Home':3,
                                    'Rent':4},
                                  inplace=True)
df_train['Purpose'].replace({'Business Loan':1,
                             'Buy a Car':2,
                             'Buy House':3,
                             'Debt Consolidation':4,
                             'Educational Expenses':5,
                             'Home Improvements':6,
                             'major_purchase':7,
                             'Medical Bills':8,
                             'moving':9,
                             'renewable_energy':10,
                             'small_business':11,
                             'Take a Trip':12,
                             'vacation':13,
                             'wedding':14,
                             'other':15,
                             'Other':15},
                           inplace=True)
df_train.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,1,445412.0,0,709.0,1167493.0,8,2,6,5214.74,17.2,0.0,6.0,1.0,228190.0,416746.0,1.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,1,99999999.0,0,741.0,2231892.0,8,3,4,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,1,347666.0,1,721.0,806949.0,3,3,4,8741.9,12.0,0.0,9.0,0.0,256329.0,386958.0,0.0,0.0
5,89d8cb0c-e5c2-4f54-b056-48a645c543dd,4ffe99d3-7f2a-44db-afc1-40943f1f9750,0,206602.0,0,7290.0,896857.0,10,2,4,16367.74,17.3,0.0,6.0,0.0,215308.0,272448.0,0.0,0.0
6,273581de-85d8-4332-81a5-19b04ce68666,90a75dde-34d5-419c-90dc-1e58b04b3e35,1,217646.0,0,730.0,1184194.0,0,2,4,10855.08,19.6,10.0,13.0,1.0,122170.0,272052.0,1.0,0.0


In [9]:

df_train.describe()


Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
count,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0,77271.0
mean,0.790167,14545620.0,0.291623,1071.759392,1399186.0,5.911066,2.934231,5.215204,18755.666828,17.983421,16.42778,11.20102,0.164641,297565.4,739551.0,0.114299,0.029196
std,0.407192,34869350.0,0.454513,1466.273187,1094465.0,3.630256,0.956108,3.352429,12292.178511,6.796231,23.001615,5.037004,0.481352,384575.4,7038667.0,0.34727,0.262642
min,0.0,15422.0,0.0,585.0,76627.0,0.0,1.0,1.0,0.0,3.7,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,193149.0,0.0,705.0,862505.0,3.0,2.0,4.0,10478.5,13.4,0.0,8.0,0.0,114351.5,274626.0,0.0,0.0
50%,1.0,326964.0,0.0,724.0,1202263.0,6.0,3.0,4.0,16475.28,16.8,0.0,10.0,0.0,212724.0,469700.0,0.0,0.0
75%,1.0,553080.0,1.0,741.0,1677339.0,10.0,4.0,4.0,24302.33,21.5,30.0,14.0,0.0,371269.5,783904.0,0.0,0.0
max,1.0,100000000.0,1.0,7510.0,165557400.0,10.0,4.0,15.0,435843.28,70.5,176.0,76.0,15.0,32878970.0,1539738000.0,7.0,15.0


In [10]:
df_test.describe()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
count,7658.0,7658.0,7658.0,7658.0,7658.0,7658.0,7658.0,7658.0,7658.0,7658.0,7658.0,7658.0
mean,14412350.0,1079.829982,1387575.0,18679.219512,17.967185,16.370201,11.127318,0.159441,292883.2,658153.7,0.111387,0.029903
std,34738510.0,1480.910741,874080.2,12255.386142,6.708574,22.922055,5.025167,0.492907,384281.5,998618.0,0.342075,0.285506
min,21472.0,585.0,81092.0,0.0,3.8,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,188529.0,706.0,861806.8,10424.635,13.5,0.0,8.0,0.0,110684.5,272783.5,0.0,0.0
50%,325534.0,724.0,1194958.0,16316.535,16.9,0.0,10.0,0.0,211365.5,462605.0,0.0,0.0
75%,547822.0,741.0,1681186.0,24224.81,21.5,30.0,14.0,0.0,369056.0,785108.5,0.0,0.0
max,100000000.0,7510.0,17815350.0,143526.57,52.5,131.0,55.0,10.0,16237440.0,37527420.0,3.0,9.0


In [11]:
array = df_train.values
print(array)
x = array[3:,3:]
y = array[3:,2]
y = y.astype(int)
print(x)
print(y)

[['14dd8831-6af5-400b-83ec-68e61888a048'
  '981165ec-3274-42f5-a3b4-d104041a9ca9' 1 ... 416746.0 1.0 0.0]
 ['4eed4e6a-aa2f-4c91-8651-ce984ee8fb26'
  '5efb2b2b-bf11-4dfd-a572-3761a2694725' 1 ... 750090.0 0.0 0.0]
 ['77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a'
  'e777faab-98ae-45af-9a86-7ce5b33b1011' 1 ... 386958.0 0.0 0.0]
 ...
 ['06eba04f-58fc-424a-b666-ed72aa008900'
  '77f2252a-b7d1-4b07-a746-1202a8304290' 1 ... 509234.0 0.0 0.0]
 ['e1cb4050-eff5-4bdb-a1b0-aabd3f7eaac7'
  '2ced5f10-bd60-4a11-9134-cadce4e7b0a3' 1 ... 537548.0 1.0 0.0]
 ['81ab928b-d1a5-4523-9a3c-271ebb01b4fb'
  '3e45ffda-99fd-4cfc-b8b8-446f4a505f36' 1 ... 738254.0 0.0 0.0]]
[[206602.0 0 7290.0 ... 272448.0 0.0 0.0]
 [217646.0 0 730.0 ... 272052.0 1.0 0.0]
 [548746.0 0 678.0 ... 555038.0 0.0 0.0]
 ...
 [99999999.0 0 732.0 ... 509234.0 0.0 0.0]
 [103136.0 0 742.0 ... 537548.0 1.0 0.0]
 [530332.0 0 746.0 ... 738254.0 0.0 0.0]]
[0 1 1 ... 1 1 1]


In [12]:
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(x, y)
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(x)
print(features[0:5,:])

[2.850e+11 6.092e+02 3.399e+07 1.640e+08 6.615e+00 6.011e+01 2.079e+01
 7.176e+04 1.547e+02 6.729e+02 3.682e+01 2.127e-01 2.628e+06 2.844e+08
 1.018e+01 1.652e+01]
[[206602.0 7290.0 896857.0 272448.0]
 [217646.0 730.0 1184194.0 272052.0]
 [548746.0 678.0 2559110.0 555038.0]
 [215952.0 739.0 1454735.0 1021460.0]
 [99999999.0 728.0 714628.0 289784.0]]


# Highest scores: Current Loan Amount, Credit Score, Annual Income, Maximum Open Credit

In [14]:
df_train_mean = df_train.mean()
df_train_std = df_train.std()
df_train_norm = (df_train - df_train_mean) / df_train_std

TypeError: unsupported operand type(s) for -: 'str' and 'float'

In [None]:
features = []
current_loan_amount = tf.feature_column.numeric_column('Current Loan Amount')
feature_columns.append(current_loan_amount)
credit_score = tf.feature_column.numeric_column('Credit Score')
feature_columns.append(credit_score)
annual_income = tf.feature_column.numeric_column('Annual Income')
feature_columns.append(annual_income)
max_open_credit = tf.feature_column.numeric_column('Maximum Open Credit')
feature_columns.append(max_open_credit)

my_feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# The following variables are the hyperparameters.
learning_rate = 0.3
epochs = 50
batch_size = 3000

# Specify the label
label_name = "median_house_value"

# Establish the model's topography.
my_model = create_model(learning_rate, my_feature_layer)

# Train the model on the normalized training set. We're passing the entire
# normalized training set, but the model will only use the features
# defined by the feature_layer.
epochs, mse = train_model(my_model, df_train_norm, epochs, 
                          label_name, batch_size)
plot_the_loss_curve(epochs, mse)

# After building a model against the training set, test that model
# against the test set.
test_features = {name:np.array(value) for name, value in test_df_norm.items()}
test_label = np.array(test_features.pop(label_name)) # isolate the label
print("\n Evaluate the new model against the test set:")
my_model.evaluate(x = test_features, y = test_label, batch_size=batch_size)