In [1]:
# 📦 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 1: Load with low_memory=False and fix dtypes
import os

file_path = r'K:\new_SBA_Dang.csv'
print("File exists:", os.path.exists(file_path))

df = pd.read_csv(file_path, dtype=str)
df.head()

# Step 2: Convert numeric columns
# 1️⃣ Drop irrelevant or high-cardinality columns
drop_cols = ['Name', 'City', 'BankName', 'Zip', 'State', 'ApprovalDate', 'ChgOffDate', 'DisbursementDate', 'MIS_Status', 'Bank', 'BankState']
df.drop(columns=drop_cols, inplace=True, errors='ignore')
df.head()


File exists: True


Unnamed: 0,LoanNr_ChkDgt,NAICS,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementGross,BalanceGross,ChgOffPrinGr,GrAppv,SBA_Appv
0,1000014003,451120,1997,84,4,2,0,0,1,0,N,Y,"$60,000.00",$0.00,$0.00,"$60,000.00","$48,000.00"
1,1000024006,722410,1997,60,2,2,0,0,1,0,N,Y,"$40,000.00",$0.00,$0.00,"$40,000.00","$32,000.00"
2,1000034009,621210,1997,180,7,1,0,0,1,0,N,N,"$287,000.00",$0.00,$0.00,"$287,000.00","$215,250.00"
3,1000044001,0,1997,60,2,1,0,0,1,0,N,Y,"$35,000.00",$0.00,$0.00,"$35,000.00","$28,000.00"
4,1000054004,0,1997,240,14,1,7,7,1,0,N,N,"$229,000.00",$0.00,$0.00,"$229,000.00","$229,000.00"


In [3]:
df.dropna(inplace=True)
df.columns
df['DisbursementGross'] = df['DisbursementGross'].replace('[\$,]', '', regex=True).astype(float)
df['ChgOffPrinGr'] = df['ChgOffPrinGr'].replace('[\$,]', '', regex=True).astype(float)
df['Default'] = df['ChgOffPrinGr'].astype(float) > 0

df['DisbursementGross'] = df['DisbursementGross'].astype(float)
print(df.columns.tolist())

['LoanNr_ChkDgt', 'NAICS', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'Default']


  df['DisbursementGross'] = df['DisbursementGross'].replace('[\$,]', '', regex=True).astype(float)
  df['ChgOffPrinGr'] = df['ChgOffPrinGr'].replace('[\$,]', '', regex=True).astype(float)


In [4]:
X = df.drop(columns=['Default', 'DisbursementGross'])
y = df['Default']
amount = df['DisbursementGross']

In [5]:
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,DisbursementGross,ChgOffPrinGr,Default,LoanNr_ChkDgt_1000024006,LoanNr_ChkDgt_1000034009,LoanNr_ChkDgt_1000044001,LoanNr_ChkDgt_1000054004,LoanNr_ChkDgt_1000084002,LoanNr_ChkDgt_1000093009,LoanNr_ChkDgt_1000094005,...,"SBA_Appv_$461,250.00","SBA_Appv_$48,000.00","SBA_Appv_$499,998.00","SBA_Appv_$50,000.00","SBA_Appv_$56,000.00","SBA_Appv_$56,250.00","SBA_Appv_$56,800.00","SBA_Appv_$60,000.00","SBA_Appv_$80,000.00","SBA_Appv_$937,500.00"
0,60000.0,0.0,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1,40000.0,0.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,287000.0,0.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,35000.0,0.0,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,229000.0,0.0,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
X = df.drop(columns=['Default', 'DisbursementGross'])
y = df['Default']
amount = df['DisbursementGross']

In [7]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
X_train, X_val, y_train, y_val, amt_train, amt_val = train_test_split(
    X_scaled, y, amount, test_size=0.4, stratify=y, random_state=42
)


In [9]:
# 💰 3. Cost-sensitive Net Profit Function
def calculate_net_profit(y_true, y_pred, disbursement_amount):
    """
    Net profit = 
    + 0.05 * disbursement if correctly classified as Paid
    - 0.25 * disbursement if Default was wrongly classified as Paid
    """
    profit = []
    for yt, yp, amt in zip(y_true, y_pred, disbursement_amount):
        if yp == 0 and yt == 0:      # True Negative: Denied, actually default
            profit.append(0)
        elif yp == 1 and yt == 0:    # False Positive: Approved, actually default
            profit.append(-0.25 * amt)
        elif yp == 1 and yt == 1:    # True Positive: Approved, paid in full
            profit.append(0.05 * amt)
        elif yp == 0 and yt == 1:    # False Negative: Denied, but would have paid
            profit.append(0)
    return np.sum(profit)


In [10]:
# 🔁 4. Model Runner Template
def run_model(model, model_name):
    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]  # Probabilities
    threshold = 0.5  # Later optimize this
    preds = (probs >= threshold).astype(int)

    print(f"\n🧠 Model: {model_name}")
    print(confusion_matrix(y_val, preds))
    print(classification_report(y_val, preds, digits=4))

    net_profit = calculate_net_profit(y_val, preds, amt_val)
    print(f"💵 Net Profit: ${net_profit:,.2f}")

    return model_name, net_profit, f1_score(y_val, preds), roc_auc_score(y_val, probs)


In [11]:
# 🚀 5. Initialize and Run Models

results = []

# kNN
knn = KNeighborsClassifier(n_neighbors=5)
results.append(run_model(knn, "kNN"))

# Decision Tree
dt = DecisionTreeClassifier(max_depth=5)
results.append(run_model(dt, "Decision Tree"))

# Bagging
bag = BaggingClassifier(n_estimators=50)
results.append(run_model(bag, "Bagging"))

# Random Forest
rf = RandomForestClassifier(n_estimators=100)
results.append(run_model(rf, "Random Forest"))

# Boosting
boost = AdaBoostClassifier(n_estimators=100)
results.append(run_model(boost, "Boosting"))

# Logistic Regression
logit = LogisticRegression(solver='liblinear')
results.append(run_model(logit, "Logistic Regression"))

# Ridge
ridge = RidgeClassifier()
#results.append(run_model(ridge, "Ridge"))

#DISCRIMINANT ANALYSIS:
# LDA
lda = LinearDiscriminantAnalysis()
results.append(run_model(lda, "LDA"))

# QDA
qda = QuadraticDiscriminantAnalysis()
results.append(run_model(qda, "QDA"))

# Neural Network
nn = MLPClassifier(hidden_layer_sizes=(20,), activation='relu', solver='adam', max_iter=1000)
results.append(run_model(nn, "Neural Network"))



🧠 Model: kNN
[[19  0]
 [ 1  0]]
              precision    recall  f1-score   support

       False     0.9500    1.0000    0.9744        19
        True     0.0000    0.0000    0.0000         1

    accuracy                         0.9500        20
   macro avg     0.4750    0.5000    0.4872        20
weighted avg     0.9025    0.9500    0.9256        20

💵 Net Profit: $0.00

🧠 Model: Decision Tree
[[19  0]
 [ 0  1]]
              precision    recall  f1-score   support

       False     1.0000    1.0000    1.0000        19
        True     1.0000    1.0000    1.0000         1

    accuracy                         1.0000        20
   macro avg     1.0000    1.0000    1.0000        20
weighted avg     1.0000    1.0000    1.0000        20

💵 Net Profit: $30,000.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



🧠 Model: Bagging
[[19  0]
 [ 1  0]]
              precision    recall  f1-score   support

       False     0.9500    1.0000    0.9744        19
        True     0.0000    0.0000    0.0000         1

    accuracy                         0.9500        20
   macro avg     0.4750    0.5000    0.4872        20
weighted avg     0.9025    0.9500    0.9256        20

💵 Net Profit: $0.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



🧠 Model: Random Forest
[[19  0]
 [ 1  0]]
              precision    recall  f1-score   support

       False     0.9500    1.0000    0.9744        19
        True     0.0000    0.0000    0.0000         1

    accuracy                         0.9500        20
   macro avg     0.4750    0.5000    0.4872        20
weighted avg     0.9025    0.9500    0.9256        20

💵 Net Profit: $0.00

🧠 Model: Boosting
[[19  0]
 [ 0  1]]
              precision    recall  f1-score   support

       False     1.0000    1.0000    1.0000        19
        True     1.0000    1.0000    1.0000         1

    accuracy                         1.0000        20
   macro avg     1.0000    1.0000    1.0000        20
weighted avg     1.0000    1.0000    1.0000        20

💵 Net Profit: $30,000.00

🧠 Model: Logistic Regression
[[ 2 17]
 [ 0  1]]
              precision    recall  f1-score   support

       False     1.0000    0.1053    0.1905        19
        True     0.0556    1.0000    0.1053         1

    acc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



🧠 Model: QDA
[[17  2]
 [ 0  1]]
              precision    recall  f1-score   support

       False     1.0000    0.8947    0.9444        19
        True     0.3333    1.0000    0.5000         1

    accuracy                         0.9000        20
   macro avg     0.6667    0.9474    0.7222        20
weighted avg     0.9667    0.9000    0.9222        20

💵 Net Profit: $-38,350.00

🧠 Model: Neural Network
[[12  7]
 [ 0  1]]
              precision    recall  f1-score   support

       False     1.0000    0.6316    0.7742        19
        True     0.1250    1.0000    0.2222         1

    accuracy                         0.6500        20
   macro avg     0.5625    0.8158    0.4982        20
weighted avg     0.9563    0.6500    0.7466        20

💵 Net Profit: $-618,619.00


In [None]:
# 🏁 6. Final Comparison Table
result_df = pd.DataFrame(results, columns=['Model', 'Net Profit', 'F1 Score', 'AUC'])
result_df.sort_values(by='Net Profit', ascending=False, inplace=True)

print("\n📊 Model Comparison:")
print(result_df) 


📊 Model Comparison:
                 Model  Net Profit  F1 Score       AUC
1        Decision Tree     30000.0  1.000000  1.000000
2              Bagging     30000.0  1.000000  1.000000
4             Boosting     30000.0  1.000000  1.000000
0                  kNN         0.0  0.000000  0.500000
3        Random Forest         0.0  0.000000  1.000000
7                  QDA    -38350.0  0.500000  0.947368
8       Neural Network   -825444.0  0.125000  0.947368
5  Logistic Regression   -863086.5  0.105263  0.894737
6                  LDA   -934336.5  0.095238  0.500000
