In [None]:
# Colab-only step: open the browser file picker and upload the chosen file(s)
# into the runtime's working directory. The recorded output of this cell shows
# loans_full_schema.csv being saved, which is the file the later cells read.
# NOTE(review): `newFile` is not referenced by any cell visible here; it only
# holds whatever files.upload() returned.
from google.colab import files
newFile = files.upload()

Saving loans_full_schema.csv to loans_full_schema.csv


In [None]:
import pandas as pd

# Path of the uploaded loans table, relative to the notebook's working directory.
LOANS_CSV = "loans_full_schema.csv"

# Read the raw table into the notebook-wide frame used by the following cells.
df = pd.read_csv(filepath_or_buffer=LOANS_CSV)

# Last expression of the cell: rich preview of the first five rows.
df.head(n=5)

Unnamed: 0,emp_title,emp_length,state,homeownership,annual_income,verified_income,debt_to_income,annual_income_joint,verification_income_joint,debt_to_income_joint,...,sub_grade,issue_month,loan_status,initial_listing_status,disbursement_method,balance,paid_total,paid_principal,paid_interest,paid_late_fees
0,global config engineer,3.0,NJ,MORTGAGE,90000.0,Verified,18.01,,,,...,C3,Mar-2018,Current,whole,Cash,27015.86,1999.33,984.14,1015.19,0.0
1,warehouse office clerk,10.0,HI,RENT,40000.0,Not Verified,5.04,,,,...,C1,Feb-2018,Current,whole,Cash,4651.37,499.12,348.63,150.49,0.0
2,assembly,3.0,WI,RENT,40000.0,Source Verified,21.15,,,,...,D1,Feb-2018,Current,fractional,Cash,1824.63,281.8,175.37,106.43,0.0
3,customer service,1.0,PA,RENT,30000.0,Not Verified,10.16,,,,...,A3,Jan-2018,Current,whole,Cash,18853.26,3312.89,2746.74,566.15,0.0
4,security supervisor,10.0,CA,RENT,35000.0,Verified,57.96,57000.0,Verified,37.66,...,C3,Mar-2018,Current,whole,Cash,21430.15,2324.65,1569.85,754.8,0.0


In [None]:
# Count missing values per column, then summarise dtypes and non-null counts.
missing_values = df.isnull().sum()
print(missing_values)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" to the output).
df.info()

emp_title                            833
emp_length                           817
state                                  0
homeownership                          0
annual_income                          0
verified_income                        0
debt_to_income                        24
annual_income_joint                 8505
verification_income_joint           8545
debt_to_income_joint                8505
delinq_2y                              0
months_since_last_delinq            5658
earliest_credit_line                   0
inquiries_last_12m                     0
total_credit_lines                     0
open_credit_lines                      0
total_credit_limit                     0
total_credit_utilized                  0
num_collections_last_12m               0
num_historical_failed_to_pay           0
months_since_90d_late               7715
current_accounts_delinq                0
total_collection_amount_ever           0
current_installment_accounts           0
accounts_opened_

In [None]:
# ===================== 1. Import Libraries =====================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
import warnings
warnings.filterwarnings('ignore')

# ===================== 2. Load Dataset =====================
# Reload from disk so this cell starts from the raw table every time it runs.
df = pd.read_csv('loans_full_schema.csv')

# ===================== 3. Drop Columns with High Missingness =====================
# Keep only columns whose missing fraction is strictly below the threshold,
# i.e. columns with 40% or more missing values are dropped.
missing_threshold = 0.4
df = df.loc[:, df.isnull().mean() < missing_threshold]

# ===================== 4. Drop Irrelevant/ID Columns =====================
# errors='ignore' keeps this safe if a column was already removed in step 3.
drop_cols = ['emp_title', 'issue_month', 'sub_grade']
df = df.drop(columns=drop_cols, errors='ignore')

# ===================== 5. Handle Missing Values =====================
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# NOTE(review): both imputers are fitted on the full table before the
# train/test split, so test rows influence the fill values. Left unchanged
# here because later cells may rely on `df` being fully imputed.

# Impute numeric columns with the column mean.
imputer_num = SimpleImputer(strategy='mean')
df[num_cols] = imputer_num.fit_transform(df[num_cols])

# Impute categorical columns with the most frequent value.
imputer_cat = SimpleImputer(strategy='most_frequent')
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

# ===================== 6. Encode Categorical Columns =====================
# One encoder per feature column, kept so the integer codes can be mapped back.
# The target is skipped here and encoded separately in step 7.
feature_encoders = {}
for col in cat_cols:
    if col == 'loan_status':
        continue
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# ===================== 7. Define Features and Target =====================
# `le` is fitted on the original status values exactly once, so le.classes_
# and le.inverse_transform() translate the integer labels back to status
# names. (Previously `le` was reused for every column and then re-fitted on
# the already-encoded target, which reduced le.classes_ to bare integers.)
le = LabelEncoder()
y = le.fit_transform(df['loan_status'])
df['loan_status'] = y  # keep `df` fully numeric, as before

X = df.drop('loan_status', axis=1)
# NOTE(review): columns such as balance / paid_total / paid_principal look
# like post-issuance repayment figures and may leak the label - confirm
# whether they should be features.

# ===================== 8. Train-Test Split and Scaling =====================
# Stratify so every status keeps its share in both splits; the classes are
# highly imbalanced and an unstratified split can leave a rare class out of
# the test set entirely. Stratification needs at least two rows per class,
# so fall back to a plain random split otherwise.
can_stratify = np.bincount(y).min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ===================== 9. Train and Evaluate Models =====================
def fit_and_report(title, model, X_fit, y_fit, X_eval, y_eval):
    """Fit a scikit-learn style classifier and print its test metrics.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place.
    X_fit, y_fit : training features and labels.
    X_eval, y_eval : held-out features and labels.

    Returns
    -------
    The predictions for ``X_eval``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)

    print(f"\n=== {title} ===")
    print("Accuracy:", accuracy_score(y_eval, y_pred))
    # zero_division=0 reports 0.0 for classes that are never predicted, which
    # is the same number as the default but does not rely on a warning.
    print(classification_report(y_eval, y_pred, zero_division=0))
    return y_pred


# ---- Logistic Regression ----
# A higher iteration cap lets the solver converge; it changes nothing when the
# default 100 iterations were already enough.
lr = LogisticRegression(max_iter=1000)
y_pred_lr = fit_and_report(
    "Logistic Regression", lr, X_train_scaled, y_train, X_test_scaled, y_test
)

# ---- SVM Classifier ----
svm = SVC()
y_pred_svm = fit_and_report(
    "Support Vector Machine", svm, X_train_scaled, y_train, X_test_scaled, y_test
)

# ---- Random Forest ----
# Trained on the unscaled features, unlike the other models in this cell.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf = fit_and_report("Random Forest", rf, X_train, y_train, X_test, y_test)

# ---- Artificial Neural Network ----
# The output layer and loss must match the number of target classes. A single
# sigmoid unit with binary cross-entropy is only valid for 0/1 labels; with
# more classes use one softmax unit per class and the sparse categorical loss,
# which takes the integer labels directly. Labels come from LabelEncoder and
# are therefore 0..K-1; counting over the full `y` covers classes that happen
# to be missing from one of the splits.
n_classes = int(np.max(y)) + 1

ann = Sequential()
ann.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
ann.add(Dense(32, activation='relu'))
if n_classes <= 2:
    ann.add(Dense(1, activation='sigmoid'))
    ann_loss = 'binary_crossentropy'
else:
    ann.add(Dense(n_classes, activation='softmax'))
    ann_loss = 'sparse_categorical_crossentropy'

ann.compile(optimizer='adam', loss=ann_loss, metrics=['accuracy'])
ann.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=1)

loss, acc = ann.evaluate(X_test_scaled, y_test)
print("\n=== ANN ===")
print(f"Accuracy: {acc:.4f}")



=== Logistic Regression ===
Accuracy: 0.9785
              precision    recall  f1-score   support

           1       0.98      1.00      0.99      1858
           2       0.95      0.97      0.96       104
           3       0.00      0.00      0.00        18
           4       0.00      0.00      0.00         9
           5       1.00      0.36      0.53        11

    accuracy                           0.98      2000
   macro avg       0.59      0.47      0.50      2000
weighted avg       0.97      0.98      0.97      2000


=== Support Vector Machine ===
Accuracy: 0.972
              precision    recall  f1-score   support

           1       0.97      1.00      0.99      1858
           2       0.92      0.90      0.91       104
           3       0.00      0.00      0.00        18
           4       0.00      0.00      0.00         9
           5       0.00      0.00      0.00        11

    accuracy                           0.97      2000
   macro avg       0.38      0.38    