# Loading and preparing the data

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
data = pd.read_pickle('data/clean_lc_data.pkl')
data.head()

Unnamed: 0_level_0,loan_amnt,term,int_rate,installment,sub_grade,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,chargeoff_within_12_mths,delinq_amnt,mort_acc,pub_rec_bankruptcies,tax_liens,total_bal_ex_mort,total_bc_limit,hardship_flag,debt_settlement_flag,loan_status_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1075358,3000,60 months,0.1269,67.79,B5,RENT,80000.0,Source Verified,Fully Paid,n,...,0.0,0.0,3.0,0.0,0.0,54644.0,17850.0,N,N,0
1071570,5375,60 months,0.1269,121.45,B5,RENT,15000.0,Verified,Charged Off,n,...,0.0,0.0,0.0,0.0,0.0,30612.0,21533.0,N,N,1
1069057,10000,36 months,0.1065,325.74,B2,RENT,100000.0,Source Verified,Charged Off,n,...,0.0,0.0,2.0,0.0,0.0,14358.0,17700.0,N,N,1
1069742,9200,36 months,0.0603,280.01,A1,RENT,77385.19,Not Verified,Fully Paid,n,...,0.0,0.0,2.0,0.0,0.0,42247.0,15142.0,N,N,0
1069559,6000,36 months,0.1171,198.46,B3,RENT,76000.0,Not Verified,Charged Off,n,...,0.0,0.0,0.0,0.0,0.0,43192.0,10350.0,N,N,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96683 entries, 1075358 to 129834559
Data columns (total 49 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   96683 non-null  int64  
 1   term                        96683 non-null  object 
 2   int_rate                    96683 non-null  float64
 3   installment                 96683 non-null  float64
 4   sub_grade                   96683 non-null  object 
 5   home_ownership              96683 non-null  object 
 6   annual_inc                  96683 non-null  float64
 7   verification_status         96683 non-null  object 
 8   loan_status                 96683 non-null  object 
 9   pymnt_plan                  96683 non-null  object 
 10  purpose                     96683 non-null  object 
 11  zip_code                    96683 non-null  object 
 12  addr_state                  96683 non-null  object 
 13  dti                  

In [4]:
data.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,open_acc,...,acc_now_delinq,acc_open_past_24mths,chargeoff_within_12_mths,delinq_amnt,mort_acc,pub_rec_bankruptcies,tax_liens,total_bal_ex_mort,total_bc_limit,loan_status_num
count,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,...,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0,96683.0
mean,14462.013746,0.134706,434.24661,75196.58,18.213519,0.264328,701.571579,705.571828,0.639306,11.09979,...,0.002648,4.201256,0.00692,9.814911,1.535389,0.107599,0.027523,47309.32,22484.894025,0.150957
std,9056.817085,0.048997,265.112617,67145.29,17.523942,0.789277,33.951358,33.952411,0.92373,5.287587,...,0.060452,2.898863,0.09693,663.051096,1.875855,0.319343,0.260915,46111.11,21737.858779,0.358009
min,1000.0,0.0531,7.61,40.0,0.0,0.0,660.0,664.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7550.0,0.0993,242.83,45000.0,11.19,0.0,675.0,679.0,0.0,7.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,19912.5,8500.0,0.0
50%,12000.0,0.1299,371.68,64000.0,17.03,0.0,695.0,699.0,0.0,10.0,...,0.0,4.0,0.0,0.0,1.0,0.0,0.0,35225.0,16091.0,0.0
75%,20000.0,0.1602,572.35,90000.0,23.46,0.0,720.0,724.0,1.0,14.0,...,0.0,6.0,0.0,0.0,2.0,0.0,0.0,58945.0,29200.0,0.0
max,40000.0,0.3099,1719.83,7200000.0,999.0,30.0,845.0,850.0,8.0,55.0,...,7.0,46.0,8.0,138474.0,31.0,5.0,13.0,1276247.0,462200.0,1.0


In [5]:
data.drop('loan_status', axis=1, inplace=True)

For the machine learning algorithms to be able to work on the data, the categorical columns have to be dealt with.

In [6]:
print([column for column in data.columns if data[column].dtype == object])

['term', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'zip_code', 'addr_state', 'revol_util', 'initial_list_status', 'application_type', 'hardship_flag', 'debt_settlement_flag']


In [7]:
data['term']=data.term.map({' 36 months': 36, ' 60 months': 60})

In [8]:
data['revol_util']=data['revol_util'].str[:-1].astype(float)/100

In [9]:
dummies = ['sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'zip_code', 'addr_state', 'initial_list_status', 'application_type', 'hardship_flag', 'debt_settlement_flag']
data = pd.get_dummies(data, columns=dummies, drop_first=True)

In [10]:
data.head()

Unnamed: 0_level_0,loan_amnt,term,int_rate,installment,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,...,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,initial_list_status_w,application_type_Joint App,hardship_flag_Y,debt_settlement_flag_Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1075358,3000,60,0.1269,67.79,80000.0,17.94,0.0,695,699,0.0,...,0,0,0,0,0,0,0,0,0,0
1071570,5375,60,0.1269,121.45,15000.0,18.08,0.0,725,729,0.0,...,0,0,0,0,0,0,0,0,0,0
1069057,10000,36,0.1065,325.74,100000.0,7.06,0.0,720,724,2.0,...,0,0,0,0,0,0,0,0,0,0
1069742,9200,36,0.0603,280.01,77385.19,9.86,0.0,755,759,0.0,...,0,0,0,0,0,0,0,0,0,0
1069559,6000,36,0.1171,198.46,76000.0,2.4,0.0,690,694,1.0,...,0,0,0,0,0,0,0,0,0,0


# Preparing the data for the models

In [11]:
X = data.drop(columns=['loan_status_num'])
y = data['loan_status_num']
X_train_extra, X_test_extra, y_train_extra, y_test_extra = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
#X = torch.tensor(X.values, dtype=torch.float32)
#y = torch.tensor(y.values, dtype=torch.float32).reshape(len(y),1)

In [12]:
# Split data into training (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split temp into validation (10%) and temp2 (20%)
X_validation, X_temp2, y_validation, y_temp2 = train_test_split(X_temp, y_temp, test_size=0.67, random_state=42)

# Split temp2 into calibration (10%) and test (10%)
X_calibration, X_test, y_calibration, y_test = train_test_split(X_temp2, y_temp2, test_size=0.5, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test) 

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_validation_tensor = torch.tensor(X_validation_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(len(y_train),1)
y_validation_tensor = torch.tensor(y_validation.values, dtype=torch.float32).reshape(len(y_validation),1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(len(y_test),1)

In [13]:
print(X_train_tensor.shape)
print(y_train_tensor.shape)
print(X_validation_tensor.shape)
print(y_validation_tensor.shape)
print(X_test_tensor.shape)
print(y_test_tensor.shape)

torch.Size([67678, 1013])
torch.Size([67678, 1])
torch.Size([9571, 1013])
torch.Size([9571, 1])
torch.Size([9717, 1013])
torch.Size([9717, 1])


In [14]:
from sklearn.tree import DecisionTreeClassifier, export_text
# Create an instance of DecisionTreeClassifier
dt_model = DecisionTreeClassifier(max_depth=2)
# Fit the model using X_train and y_train
dt_model.fit(X_train_extra, y_train_extra)
# Get the rules of the decision tree
tree_rules = export_text(dt_model, feature_names=list(X_train_extra.columns))
print(tree_rules)
# Use the trained model to make predictions
predictions_extra = dt_model.predict(X_test_extra)

class_report = classification_report(y_test_extra, predictions_extra)
print("Classification Report:\n", class_report)


|--- last_fico_range_high <= 631.50
|   |--- last_pymnt_amnt <= 1502.83
|   |   |--- class: 1
|   |--- last_pymnt_amnt >  1502.83
|   |   |--- class: 0
|--- last_fico_range_high >  631.50
|   |--- total_rec_prncp <= 999.98
|   |   |--- class: 1
|   |--- total_rec_prncp >  999.98
|   |   |--- class: 0

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95     24584
           1       0.68      0.82      0.74      4421

    accuracy                           0.91     29005
   macro avg       0.82      0.87      0.85     29005
weighted avg       0.92      0.91      0.92     29005


As there is a significant class imbalance in the dataset, the weights have to be calculated for the classification.

In [49]:
classlabels = torch.tensor(data['loan_status_num'].values, dtype=torch.long)
class_counts = torch.bincount(classlabels)

# Compute inverse class frequencies
class_weights_original = 1.0 / class_counts.float()

# Normalize class weights
class_weights_original /= class_weights_original.sum()
class_weights = class_weights_original[y_train_tensor.long()]


Preparing evaluation functions to see the models' results.

In [16]:
def evaluate_nn(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

    elif train==False:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Test Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

def plot_learning_evolution(r):
    plt.figure(figsize=(12, 8))

    plt.subplot(2, 2, 1)
    plt.plot(r.history['loss'], label='Loss')
    plt.plot(r.history['val_loss'], label='val_Loss')
    plt.title('Loss evolution during trainig')
    plt.legend()

    plt.subplot(2, 2, 2)
    plt.plot(r.history['AUC'], label='AUC')
    plt.plot(r.history['val_AUC'], label='val_AUC')
    plt.title('AUC score evolution during trainig')
    plt.legend();


# Setting up the baseline

In [17]:
from models.model import LogisticRegression
from torchinfo import summary

In [18]:
#calculating number of features
n_features=len(data.drop(columns=['loan_status_num']).columns)
n_features

1013

In [116]:
lr_model = LogisticRegression(num_features=n_features)
summary(lr_model, input_size=X_train.shape)

Layer (type:depth-idx)                   Output Shape              Param #
LogisticRegression                       [67678, 1]                --
├─Sequential: 1-1                        [67678, 1]                --
│    └─Linear: 2-1                       [67678, 1]                1,014
│    └─Sigmoid: 2-2                      [67678, 1]                --
Total params: 1,014
Trainable params: 1,014
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 68.63
Input size (MB): 274.23
Forward/backward pass size (MB): 0.54
Params size (MB): 0.00
Estimated Total Size (MB): 274.78

In [117]:
LEARNING_RATE = 0.01
EPOCHS = 4000

In [118]:
loss_function = nn.BCELoss(weight=class_weights)

In [119]:
optimizer = optim.SGD(lr_model.parameters(), lr=LEARNING_RATE)

In [23]:
def calculate_accuracy(preds, actuals):

    with torch.no_grad():
        rounded_preds = torch.round(preds)
        num_correct = torch.sum(rounded_preds == actuals)
        accuracy = num_correct/len(preds)

    return accuracy

In [120]:
train_losses = []
#test_losses  = []
train_accs = []
test_accs  = []

for epoch in range(EPOCHS):
    # Forward propagation (predicting train data) #a
    train_preds = lr_model(X_train_tensor)
    train_loss  = loss_function(train_preds, y_train_tensor)

    # Predicting test data #b
    with torch.no_grad():
        test_preds = lr_model(X_validation_tensor)
        #test_loss  = loss_function(test_preds, y_test)

    # Calculate accuracy #c
    train_acc = calculate_accuracy(train_preds, y_train_tensor)
    test_acc  = calculate_accuracy(test_preds, y_validation_tensor)

    # Backward propagation #d
    optimizer.zero_grad()
    train_loss.backward()

    # Gradient descent step #e
    optimizer.step()

    # Store training history #f
    train_losses.append(train_loss.item())
    #test_losses.append(test_loss.item())
    train_accs.append(train_acc.item())
    test_accs.append(test_acc.item())

    # Print training data #g
    if epoch%100==0:
        print(f'Epoch: {epoch} \t|' \
              f' Train loss: {np.round(train_loss.item(),3)} \t|' \
              #f' Test loss: {np.round(test_loss.item(),3)} \t|' \
              f' Train acc: {np.round(train_acc.item(),2)} \t|' \
              f' Test acc: {np.round(test_acc.item(),2)}')

Epoch: 0 	| Train loss: 0.188 	| Train acc: 0.51 	| Test acc: 0.5
Epoch: 100 	| Train loss: 0.158 	| Train acc: 0.61 	| Test acc: 0.61
Epoch: 200 	| Train loss: 0.141 	| Train acc: 0.66 	| Test acc: 0.66
Epoch: 300 	| Train loss: 0.13 	| Train acc: 0.69 	| Test acc: 0.68
Epoch: 400 	| Train loss: 0.122 	| Train acc: 0.71 	| Test acc: 0.7
Epoch: 500 	| Train loss: 0.116 	| Train acc: 0.72 	| Test acc: 0.72
Epoch: 600 	| Train loss: 0.111 	| Train acc: 0.74 	| Test acc: 0.73
Epoch: 700 	| Train loss: 0.107 	| Train acc: 0.75 	| Test acc: 0.75
Epoch: 800 	| Train loss: 0.103 	| Train acc: 0.77 	| Test acc: 0.76
Epoch: 900 	| Train loss: 0.1 	| Train acc: 0.78 	| Test acc: 0.77
Epoch: 1000 	| Train loss: 0.098 	| Train acc: 0.79 	| Test acc: 0.78
Epoch: 1100 	| Train loss: 0.095 	| Train acc: 0.8 	| Test acc: 0.79
Epoch: 1200 	| Train loss: 0.093 	| Train acc: 0.81 	| Test acc: 0.8
Epoch: 1300 	| Train loss: 0.091 	| Train acc: 0.81 	| Test acc: 0.8
Epoch: 1400 	| Train loss: 0.089 	| Trai

In [121]:
evaluate_nn(y_train_tensor.clone().detach(), train_preds.clone().detach().round(), train=True)
evaluate_nn(y_validation_tensor.clone().detach(), test_preds.clone().detach().round(), train=False)



Train Result:
Accuracy Score: 90.43%
_______________________________________________
CLASSIFICATION REPORT:
                    0.0           1.0  accuracy     macro avg  weighted avg
precision      0.992320      0.616705  0.904341      0.804512      0.935854
recall         0.894338      0.960881  0.904341      0.927609      0.904341
f1-score       0.940785      0.751249  0.904341      0.846017      0.912292
support    57504.000000  10174.000000  0.904341  67678.000000  67678.000000
_______________________________________________
Confusion Matrix: 
 [[51428  6076]
 [  398  9776]]

Test Result:
Accuracy Score: 89.54%
_______________________________________________
CLASSIFICATION REPORT:
                   0.0          1.0  accuracy    macro avg  weighted avg
precision     0.990359     0.596970  0.895413     0.793665      0.930802
recall        0.885373     0.951691  0.895413     0.918532      0.895413
f1-score      0.934928     0.733706  0.895413     0.834317      0.904464
support    81

# Implementing semantic loss

In [109]:
from importlib import reload  # Python 3.4+
import models.loss
reload(models.loss)
from models.loss import semantic_loss

In [110]:
sl_model = LogisticRegression(num_features=n_features)
summary(sl_model, input_size=X_train.shape)

Layer (type:depth-idx)                   Output Shape              Param #
LogisticRegression                       [67678, 1]                --
├─Sequential: 1-1                        [67678, 1]                --
│    └─Linear: 2-1                       [67678, 1]                1,014
│    └─Sigmoid: 2-2                      [67678, 1]                --
Total params: 1,014
Trainable params: 1,014
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 68.63
Input size (MB): 274.23
Forward/backward pass size (MB): 0.54
Params size (MB): 0.00
Estimated Total Size (MB): 274.78

In [111]:
optimizer_sl = optim.SGD(sl_model.parameters(), lr=LEARNING_RATE)

In [112]:
last_fico = torch.tensor(X_train['last_fico_range_high'].values, dtype=torch.float32).reshape(len(y_train),1)
last_pymnt = torch.tensor(X_train['last_pymnt_amnt'].values, dtype=torch.float32).reshape(len(y_train),1)
total_rec = torch.tensor(X_train['total_rec_prncp'].values, dtype=torch.float32).reshape(len(y_train),1)

rule = torch.logical_or(
    torch.logical_and(last_fico <= 631.5, last_pymnt <= 1502.83),
    torch.logical_and(last_fico > 631.5, total_rec <= 999.98)
).float()

In [113]:
print(rule)

tensor([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [0.]])


In [114]:
train_losses = []
#test_losses  = []
train_accs = []
test_accs  = []

for epoch in range(EPOCHS):
    # Forward propagation (predicting train data) #a
    train_preds = sl_model(X_train_tensor)
    #train_loss  = loss_function(train_preds, y_train_tensor)
    train_loss = semantic_loss(train_preds, y_train_tensor, rule, class_weights, 0.05)

    # Predicting test data #b
    with torch.no_grad():
        test_preds = sl_model(X_validation_tensor)
        #test_loss  = loss_function(test_preds, y_test)

    # Calculate accuracy #c
    train_acc = calculate_accuracy(train_preds, y_train_tensor)
    test_acc  = calculate_accuracy(test_preds, y_validation_tensor)

    # Backward propagation #d
    optimizer_sl.zero_grad()
    train_loss.backward()

    # Gradient descent step #e
    optimizer_sl.step()

    # Store training history #f
    train_losses.append(train_loss.item())
    #test_losses.append(test_loss.item())
    train_accs.append(train_acc.item())
    test_accs.append(test_acc.item())

    # Print training data #g
    if epoch%100==0:
        print(f'Epoch: {epoch} \t|' \
              f' Train loss: {np.round(train_loss.item(),3)} \t|' \
                  #f' Test loss: {np.round(test_loss.item(),3)} \t|' \
              f' Train acc: {np.round(train_acc.item(),2)} \t|' \
              f' Test acc: {np.round(test_acc.item(),2)}')

Epoch: 0 	| Train loss: -0.205 	| Train acc: 0.49 	| Test acc: 0.49
Epoch: 100 	| Train loss: -0.235 	| Train acc: 0.6 	| Test acc: 0.6
Epoch: 200 	| Train loss: -0.249 	| Train acc: 0.66 	| Test acc: 0.65
Epoch: 300 	| Train loss: -0.257 	| Train acc: 0.7 	| Test acc: 0.69
Epoch: 400 	| Train loss: -0.262 	| Train acc: 0.72 	| Test acc: 0.72
Epoch: 500 	| Train loss: -0.267 	| Train acc: 0.75 	| Test acc: 0.75
Epoch: 600 	| Train loss: -0.27 	| Train acc: 0.77 	| Test acc: 0.77
Epoch: 700 	| Train loss: -0.273 	| Train acc: 0.79 	| Test acc: 0.79
Epoch: 800 	| Train loss: -0.276 	| Train acc: 0.81 	| Test acc: 0.81
Epoch: 900 	| Train loss: -0.278 	| Train acc: 0.83 	| Test acc: 0.82
Epoch: 1000 	| Train loss: -0.28 	| Train acc: 0.84 	| Test acc: 0.84
Epoch: 1100 	| Train loss: -0.282 	| Train acc: 0.85 	| Test acc: 0.85
Epoch: 1200 	| Train loss: -0.284 	| Train acc: 0.87 	| Test acc: 0.86
Epoch: 1300 	| Train loss: -0.286 	| Train acc: 0.88 	| Test acc: 0.87
Epoch: 1400 	| Train lo

In [115]:
evaluate_nn(y_train_tensor.clone().detach(), train_preds.clone().detach().round(), train=True)
evaluate_nn(y_validation_tensor.clone().detach(), test_preds.clone().detach().round(), train=False)



Train Result:
Accuracy Score: 94.76%
_______________________________________________
CLASSIFICATION REPORT:
                    0.0           1.0  accuracy     macro avg  weighted avg
precision      0.987268      0.769400  0.947649      0.878334      0.954516
recall         0.950647      0.930706  0.947649      0.940676      0.947649
f1-score       0.968611      0.842400  0.947649      0.905506      0.949638
support    57504.000000  10174.000000  0.947649  67678.000000  67678.000000
_______________________________________________
Confusion Matrix: 
 [[54666  2838]
 [  705  9469]]

Test Result:
Accuracy Score: 94.13%
_______________________________________________
CLASSIFICATION REPORT:
                   0.0          1.0  accuracy    macro avg  weighted avg
precision     0.983995     0.751846  0.941281     0.867920      0.948849
recall        0.946196     0.913734  0.941281     0.929965      0.941281
f1-score      0.964725     0.824922  0.941281     0.894824      0.943560
support    81

# Comparing the models on limited data

In [86]:
X_train_lim, X_remain, y_train_lim, y_remain = train_test_split(X_train, y_train, test_size=0.9, random_state=42)
X_train_lim_scaled = scaler.fit_transform(X_train_lim)
X_train_tensor_lim = torch.tensor(X_train_lim_scaled, dtype=torch.float32)
y_train_tensor_lim = torch.tensor(y_train_lim.values, dtype=torch.float32).reshape(len(y_train_lim),1)
X_train_tensor_lim.shape
y_train_tensor_lim.shape

torch.Size([6767, 1])

In [87]:
last_fico_lim = torch.tensor(X_train_lim['last_fico_range_high'].values, dtype=torch.float32).reshape(len(y_train_lim),1)
last_pymnt_lim = torch.tensor(X_train_lim['last_pymnt_amnt'].values, dtype=torch.float32).reshape(len(y_train_lim),1)
total_rec_lim = torch.tensor(X_train_lim['total_rec_prncp'].values, dtype=torch.float32).reshape(len(y_train_lim),1)

rule_lim = torch.logical_or(
    torch.logical_and(last_fico_lim <= 631.5, last_pymnt_lim <= 1502.83),
    torch.logical_and(last_fico_lim > 631.5, total_rec_lim <= 999.98)
).float()

In [108]:
small_model = LogisticRegression(num_features=n_features)
optimizer_small = optim.SGD(small_model.parameters(), lr=LEARNING_RATE)
class_weights_lim = class_weights_original[y_train_tensor_lim.long()]
loss_function_lim=nn.BCELoss(weight=class_weights_lim)
train_losses = []
#test_losses  = []
train_accs = []
test_accs  = []

for epoch in range(4000):
    # Forward propagation (predicting train data) #a
    train_preds_lim = small_model(X_train_tensor_lim)
    #train_loss  = loss_function(train_preds, y_train_tensor)
    
    #print(torch.min(train_preds_lim), torch.max(train_preds_lim))
    #train_loss = semantic_loss(train_preds_lim, y_train_tensor_lim, rule_lim, class_weights_lim, 0.05)
    train_loss = loss_function_lim(train_preds_lim, y_train_tensor_lim)

    # Predicting test data #b
    with torch.no_grad():
        test_preds = small_model(X_validation_tensor)
        #test_loss  = loss_function(test_preds, y_test)

    # Calculate accuracy #c
    train_acc = calculate_accuracy(train_preds_lim, y_train_tensor_lim)
    test_acc  = calculate_accuracy(test_preds, y_validation_tensor)

    # Backward propagation #d
    optimizer_small.zero_grad()
    train_loss.backward()

    # Gradient descent step #e
    optimizer_small.step()

    # Store training history #f
    train_losses.append(train_loss.item())
    #test_losses.append(test_loss.item())
    train_accs.append(train_acc.item())
    test_accs.append(test_acc.item())

    # Print training data #g
    if epoch%100==0:
        print(f'Epoch: {epoch} \t|' \
              f' Train loss: {np.round(train_loss.item(),3)} \t|' \
                  #f' Test loss: {np.round(test_loss.item(),3)} \t|' \
              f' Train acc: {np.round(train_acc.item(),2)} \t|' \
              f' Test acc: {np.round(test_acc.item(),2)}')

Epoch: 0 	| Train loss: 0.187 	| Train acc: 0.49 	| Test acc: 0.5
Epoch: 100 	| Train loss: 0.154 	| Train acc: 0.6 	| Test acc: 0.6
Epoch: 200 	| Train loss: 0.136 	| Train acc: 0.66 	| Test acc: 0.65
Epoch: 300 	| Train loss: 0.125 	| Train acc: 0.69 	| Test acc: 0.68
Epoch: 400 	| Train loss: 0.117 	| Train acc: 0.71 	| Test acc: 0.7
Epoch: 500 	| Train loss: 0.11 	| Train acc: 0.73 	| Test acc: 0.71
Epoch: 600 	| Train loss: 0.106 	| Train acc: 0.75 	| Test acc: 0.73
Epoch: 700 	| Train loss: 0.101 	| Train acc: 0.76 	| Test acc: 0.74
Epoch: 800 	| Train loss: 0.098 	| Train acc: 0.78 	| Test acc: 0.75
Epoch: 900 	| Train loss: 0.095 	| Train acc: 0.79 	| Test acc: 0.76
Epoch: 1000 	| Train loss: 0.092 	| Train acc: 0.8 	| Test acc: 0.77
Epoch: 1100 	| Train loss: 0.089 	| Train acc: 0.81 	| Test acc: 0.78
Epoch: 1200 	| Train loss: 0.087 	| Train acc: 0.82 	| Test acc: 0.78
Epoch: 1300 	| Train loss: 0.085 	| Train acc: 0.83 	| Test acc: 0.79
Epoch: 1400 	| Train loss: 0.083 	| Tr