### Model story

See the readme.md please

In [10]:
    # General and ML libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
import os
import sys
import warnings 

    # Libraries required to make the deep learing model using PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

    # Add the absolute pass of the package to the system path
sys.path.append(os.path.abspath(".."))
project_root = os.path.abspath("..")

pd.set_option('display.max_columns', None)
warnings.filter_warnings('ignore')


In [156]:
### 1. Preliminary analysis: Load the data and investigate it to find irrelevant variables, missing data, ...
# Note: we changed the original data file name from SBAcase.11.13.17.csv to SBAcase_data.csv
file_path = os.path.join(project_root, "data/")
df = pd.read_csv(file_path + "SBAcase.11.13.17.csv")
df.head()

Unnamed: 0,Selected,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv,New,RealEstate,Portion,Recession,daysterm,xx,Default
0,0,1004285007,SIMPLEX OFFICE SOLUTIONS,ANAHEIM,CA,92801,CALIFORNIA BANK & TRUST,CA,532420,15074,2001,36,1,1.0,0,0,1,0,Y,N,,15095.0,32812,0,P I F,0,30000,15000,0,0,0.5,0,1080,16175.0,0
1,1,1004535010,DREAM HOME REALTY,TORRANCE,CA,90505,CALIFORNIA BANK & TRUST,CA,531210,15130,2001,56,1,1.0,0,0,1,0,Y,N,,15978.0,30000,0,P I F,0,30000,15000,0,0,0.5,1,1680,17658.0,0
2,0,1005005006,"Winset, Inc. dba Bankers Hill",SAN DIEGO,CA,92103,CALIFORNIA BANK & TRUST,CA,531210,15188,2001,36,10,1.0,0,0,1,0,Y,N,,15218.0,30000,0,P I F,0,30000,15000,0,0,0.5,0,1080,16298.0,0
3,1,1005535001,Shiva Management,SAN DIEGO,CA,92108,CALIFORNIA BANK & TRUST,CA,531312,15719,2003,36,6,1.0,0,0,1,0,Y,N,,15736.0,50000,0,P I F,0,50000,25000,0,0,0.5,0,1080,16816.0,0
4,1,1005996006,"GOLD CROWN HOME LOANS, INC",LOS ANGELES,CA,91345,SBA - EDF ENFORCEMENT ACTION,CO,531390,16840,2006,240,65,1.0,3,65,1,1,0,N,,16903.0,343000,0,P I F,0,343000,343000,0,1,1.0,0,7200,24103.0,0


In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2102 entries, 0 to 2101
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Selected           2102 non-null   int64  
 1   LoanNr_ChkDgt      2102 non-null   int64  
 2   Name               2102 non-null   object 
 3   City               2102 non-null   object 
 4   State              2102 non-null   object 
 5   Zip                2102 non-null   int64  
 6   Bank               2099 non-null   object 
 7   BankState          2099 non-null   object 
 8   NAICS              2102 non-null   int64  
 9   ApprovalDate       2102 non-null   int64  
 10  ApprovalFY         2102 non-null   int64  
 11  Term               2102 non-null   int64  
 12  NoEmp              2102 non-null   int64  
 13  NewExist           2101 non-null   float64
 14  CreateJob          2102 non-null   int64  
 15  RetainedJob        2102 non-null   int64  
 16  FranchiseCode      2102 

In [140]:
# Except ChgOffDate (The date when a loan is declared to be in default), there are not many missing values.
# Let's see whether a missing ChgOffDate always means PIF (Paid in full) in the target (MIS_Status)

df[df.MIS_Status == 'PIF'].ChgOffDate.isna().sum()

np.int64(0)

In [141]:
# Consistency of catgorical variable: These variables must only have two values: Y and N

df.RevLineCr.value_counts(normalize = True)

RevLineCr
Y    0.350952
0    0.347143
N    0.276190
T    0.025714
Name: proportion, dtype: float64

In [142]:
df.LowDoc.value_counts(normalize = True)

LowDoc
N    0.978085
Y    0.019533
S    0.001429
A    0.000476
0    0.000476
Name: proportion, dtype: float64

### Conclusion of the preliminary data analysis:

1. Except ChgOffDate (The date when a loan is declared to be in default), there are not many missing values.
2. Some variables can be removed for the model. they are: "Name", "City", "Bank", "BankState", "NAICS", "ApprovalDate", "ApprovalFY", "Zip", "State"
3. The target variable MIS_Status, and other categorical variables like RevLineCr and LowDoc need to be encoded.
4. Two categorical valriables have inconsistent values. 0 needs to be converted to N and T should be converted to Y
5. We need to scale numeric variables (not the encoded ones ofcourse). Note that will fit the scaler only on the training data and then will scale train, validation and test data separately.
   

In [143]:
### 2. Data Preparation

# 2.1 Impute missing target values

# Fill emply places for ChgOffDate with a value like -1 to signal that there was no default. For more clarity, we'll create a binary 
# feature like Was_Charged_Off = 1 if ChgOffDate is present, else 0 â€” to help the model learn from the presence/absence of that field. 

df.ChgOffDate.fillna(-1, inplace = True)
df['Was_Charged_Off'] = df.ChgOffDate.apply(lambda x: 0 if x == -1 else 1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.ChgOffDate.fillna(-1, inplace = True)


In [144]:
# 2.2 Make other binary (Y and N) categorical variables consistent. We kn02 0 means N but A or S can't be interpreted.

df.RevLineCr = df.RevLineCr.apply(lambda x: "N" if x == "0" else "Y" if x == "T" else x)
df.RevLineCr.value_counts(normalize = True)

RevLineCr
N    0.623333
Y    0.376667
Name: proportion, dtype: float64

In [145]:
# First convert values to string and remove possible leading or trailing spaces
df.LowDoc = df.LowDoc.astype(str).str.strip()
df.LowDoc = df.LowDoc.apply(lambda x: "N" if x == "0" else float('nan') if x in ["A", "S"] else x)
df.LowDoc.value_counts(normalize = True)

LowDoc
N      0.979028
Y      0.019542
nan    0.001430
Name: proportion, dtype: float64

In [146]:
# 2.3 Pruning

# Drop irrelevant 
df.drop(columns=["Name", "City", "Bank", "BankState", "NAICS", "ApprovalDate", 
                 "ApprovalFY", "Zip", "State"], inplace=True, errors='ignore')

# Drop the few observations with missing column values
df.dropna(inplace=True)

In [147]:
# 2.4 Encoding

# Encode target
df["MIS_Status"] = LabelEncoder().fit_transform(df["MIS_Status"])

# Encode categoricals
for col in df.select_dtypes(include="object"):
    df[col] = LabelEncoder().fit_transform(df[col])


In [148]:
# 2.5 Split and normalize: We'll keep part of data for validation and another part for the final testing

# Separate features and target
X = df.drop(columns=["MIS_Status"]).values
y = df["MIS_Status"].values

# Train/val/test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [149]:
# 2.6 Scale using training set only

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [151]:
### 3. Deep learning model and training with PyTorch

# 3.1 Convert the data to tensors and batch them. 
# Note: y_train (and _val, _test) are numpy array of type int64 so we need to explicitly cast them to torch.long as 
# torch might fail to infer the correcttensor type automatically. 

train_ds = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
val_ds = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long))
test_ds = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

# batch the datasets to be fed to the model
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64)
test_dl = DataLoader(test_ds, batch_size=64)


In [152]:
# 3.2 This is our MLP model. We'll make a child class of Torch's nn and will configure it. 
# Let's have 3 layers in total: the input layer of the batch size, a hidden layer, and the binary output layer.


class LoanNet(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(in_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
    def forward(self, x): return self.model(x)

    # Make an instance of the model
model = LoanNet(X.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [153]:
# 3.3 Let's train the model instance 

for epoch in range(10):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    model.eval()
    correct = 0
    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb).argmax(1)
            correct += (preds == yb).sum().item()
    acc = correct / len(val_ds)
    print(f"Epoch {epoch+1}: Loss {total_loss:.3f}, Val Acc {acc:.3f}")


Epoch 1: Loss 13.278, Val Acc 0.959
Epoch 2: Loss 6.570, Val Acc 0.971
Epoch 3: Loss 1.887, Val Acc 0.990
Epoch 4: Loss 0.594, Val Acc 0.990
Epoch 5: Loss 0.319, Val Acc 0.994
Epoch 6: Loss 0.207, Val Acc 1.000
Epoch 7: Loss 0.148, Val Acc 1.000
Epoch 8: Loss 0.107, Val Acc 1.000
Epoch 9: Loss 0.075, Val Acc 1.000
Epoch 10: Loss 0.056, Val Acc 1.000


In [154]:
# 4. Evaluation of the model

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        preds = model(xb).argmax(1)
        all_preds.extend(preds.tolist())
        all_labels.extend(yb.tolist())

print(classification_report(all_labels, all_preds, target_names=["CHGOFF", "PIF"]))


              precision    recall  f1-score   support

      CHGOFF       1.00      0.99      1.00       108
         PIF       1.00      1.00      1.00       206

    accuracy                           1.00       314
   macro avg       1.00      1.00      1.00       314
weighted avg       1.00      1.00      1.00       314



In [158]:
print("-" * 50)

--------------------------------------------------


### Conclusion

The model learned the pattern very well and very accurately predicted on the test data. 