# Risk Assessment and Loan Approval Modeling

# Dataset
This dataset comprises 20,000 records of personal and financial data, designed to facilitate the development of predictive models for risk assessment. It serves two primary purposes:

Risk Score Regression: To predict a continuous risk score associated with each individual's likelihood of loan default or financial instability.
Binary Classification: To determine the binary outcome of loan approval, indicating whether an applicant is likely to be approved or denied for a loan.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import seaborn as sns

In [2]:
data=pd.read_csv("Loan.csv")

In [3]:
data.drop('ApplicationDate', axis=1, inplace=True)

In [12]:
data.columns

Index(['Age', 'AnnualIncome', 'CreditScore', 'EmploymentStatus',
       'EducationLevel', 'Experience', 'LoanAmount', 'LoanDuration',
       'MaritalStatus', 'NumberOfDependents', 'HomeOwnershipStatus',
       'MonthlyDebtPayments', 'CreditCardUtilizationRate',
       'NumberOfOpenCreditLines', 'NumberOfCreditInquiries',
       'DebtToIncomeRatio', 'BankruptcyHistory', 'LoanPurpose',
       'PreviousLoanDefaults', 'PaymentHistory', 'LengthOfCreditHistory',
       'SavingsAccountBalance', 'CheckingAccountBalance', 'TotalAssets',
       'TotalLiabilities', 'MonthlyIncome', 'UtilityBillsPaymentHistory',
       'JobTenure', 'NetWorth', 'BaseInterestRate', 'InterestRate',
       'MonthlyLoanPayment', 'TotalDebtToIncomeRatio', 'LoanApproved',
       'RiskScore'],
      dtype='object')

In [5]:
data[['BankruptcyHistory', 'TotalDebtToIncomeRatio', 'DebtToIncomeRatio', 'NetWorth', 'AnnualIncome', 'MonthlyIncome','LoanApproved','RiskScore']]

Unnamed: 0,BankruptcyHistory,TotalDebtToIncomeRatio,DebtToIncomeRatio,NetWorth,AnnualIncome,MonthlyIncome,LoanApproved,RiskScore
0,0,0.181077,0.358336,126928,39948,3329.000000,0,49.0
1,0,0.389852,0.330274,43609,39709,3309.083333,0,52.0
2,0,0.462157,0.244729,5205,40724,3393.666667,0,52.0
3,0,0.313098,0.436244,99452,69084,5757.000000,0,54.0
4,0,0.070210,0.078884,227019,103264,8605.333333,1,36.0
...,...,...,...,...,...,...,...,...
19995,0,0.627741,0.468077,55327,30180,2515.000000,0,55.0
19996,0,0.334418,0.317372,64002,49246,4103.833333,0,54.0
19997,0,0.357227,0.023014,103663,48958,4079.833333,0,45.0
19998,0,0.408678,0.534517,10600,41025,3418.750000,0,59.0


In [13]:

# Encoding categorical columns
label_cols = ['EmploymentStatus', 'EducationLevel', 'MaritalStatus','HomeOwnershipStatus','LoanPurpose']
le = LabelEncoder()
for col in label_cols:
    data[col] = le.fit_transform(data[col])


In [14]:
data.columns

Index(['Age', 'AnnualIncome', 'CreditScore', 'EmploymentStatus',
       'EducationLevel', 'Experience', 'LoanAmount', 'LoanDuration',
       'MaritalStatus', 'NumberOfDependents', 'HomeOwnershipStatus',
       'MonthlyDebtPayments', 'CreditCardUtilizationRate',
       'NumberOfOpenCreditLines', 'NumberOfCreditInquiries',
       'DebtToIncomeRatio', 'BankruptcyHistory', 'LoanPurpose',
       'PreviousLoanDefaults', 'PaymentHistory', 'LengthOfCreditHistory',
       'SavingsAccountBalance', 'CheckingAccountBalance', 'TotalAssets',
       'TotalLiabilities', 'MonthlyIncome', 'UtilityBillsPaymentHistory',
       'JobTenure', 'NetWorth', 'BaseInterestRate', 'InterestRate',
       'MonthlyLoanPayment', 'TotalDebtToIncomeRatio', 'LoanApproved',
       'RiskScore'],
      dtype='object')

In [15]:
data.dtypes

Age                             int64
AnnualIncome                    int64
CreditScore                     int64
EmploymentStatus                int64
EducationLevel                  int64
Experience                      int64
LoanAmount                      int64
LoanDuration                    int64
MaritalStatus                   int64
NumberOfDependents              int64
HomeOwnershipStatus             int64
MonthlyDebtPayments             int64
CreditCardUtilizationRate     float64
NumberOfOpenCreditLines         int64
NumberOfCreditInquiries         int64
DebtToIncomeRatio             float64
BankruptcyHistory               int64
LoanPurpose                     int64
PreviousLoanDefaults            int64
PaymentHistory                  int64
LengthOfCreditHistory           int64
SavingsAccountBalance           int64
CheckingAccountBalance          int64
TotalAssets                     int64
TotalLiabilities                int64
MonthlyIncome                 float64
UtilityBills

In [16]:
for i in data.columns:
    if (data[i].dtype=="object"):
        print(i)
 

In [17]:
data['TotalDebtToIncomeRatio']

0        0.181077
1        0.389852
2        0.462157
3        0.313098
4        0.070210
           ...   
19995    0.627741
19996    0.334418
19997    0.357227
19998    0.408678
19999    0.298006
Name: TotalDebtToIncomeRatio, Length: 20000, dtype: float64

In [11]:

import pandas as pd
from sklearn.model_selection import train_test_split
df=data
# Assuming df is your DataFrame
# Drop 'RiskScore' column
df = df.drop(columns=['LoanApproved'])

# Calculate correlation with the 'LoanApproved' column
correlation = df.corr()['RiskScore'].sort_values(ascending=False)

# Set a correlation threshold (absolute value)
correlation_threshold = 0.3  # Change this based on your needs

# Select features with correlation above the threshold
selected_features = correlation[abs(correlation) > correlation_threshold].index.tolist()
selected_features.remove('RiskScore')  # Exclude the target column from features

# Create the feature matrix X and target variable y
X = df[selected_features]
y = df['RiskScore']

# Split the data into train and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Selected Features based on Correlation: ", selected_features)



Selected Features based on Correlation:  ['BankruptcyHistory', 'TotalDebtToIncomeRatio', 'DebtToIncomeRatio', 'NetWorth', 'AnnualIncome', 'MonthlyIncome']


In [18]:
X_train.columns

Index(['BankruptcyHistory', 'TotalDebtToIncomeRatio', 'DebtToIncomeRatio',
       'NetWorth', 'AnnualIncome', 'MonthlyIncome'],
      dtype='object')

In [77]:

from sklearn.ensemble import GradientBoostingRegressor

# Initialize Gradient Boosting Regressor model
gb_reg = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)

# Fit the model on training data
gb_reg.fit(X_train, y_train)

# Predict on test data
y_pred_gb = gb_reg.predict(X_test)

# Evaluate the model
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print(f"Gradient Boosting Regressor MSE: {mse_gb:.4f}, R^2: {r2_gb:.4f}")


Gradient Boosting Regressor MSE: 21.0871, R^2: 0.6604


In [78]:
from sklearn.ensemble import RandomForestRegressor

# Initialize Random Forest Regressor model
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Fit the model on training data
rf_reg.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_reg.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Regressor MSE: {mse_rf:.4f}, R^2: {r2_rf:.4f}")


Random Forest Regressor MSE: 21.3523, R^2: 0.6561


In [79]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize Linear Regression model
lin_reg = LinearRegression()

# Fit the model on training data
lin_reg.fit(X_train, y_train)

# Predict on test data
y_pred_lin_reg = lin_reg.predict(X_test)

# Evaluate the model
mse_lin_reg = mean_squared_error(y_test, y_pred_lin_reg)
r2_lin_reg = r2_score(y_test, y_pred_lin_reg)
print(f"Linear Regression MSE: {mse_lin_reg:.4f}, R^2: {r2_lin_reg:.4f}")


Linear Regression MSE: 26.0737, R^2: 0.5800


In [80]:
lin_reg.predict(X_test[:5])


array([49.33065006, 37.04179097, 48.37562438, 50.45106397, 48.41164485])

In [81]:
X_test[:5]

Unnamed: 0,BankruptcyHistory,TotalDebtToIncomeRatio,DebtToIncomeRatio,NetWorth,AnnualIncome,MonthlyIncome
10650,0,0.090436,0.430751,7304,100761,8396.75
2041,0,0.047832,0.148908,22443,189154,15762.833333
8668,0,0.271436,0.206626,20044,73272,6106.0
1114,0,0.530868,0.184681,1714,57115,4759.583333
13902,0,0.309877,0.184185,55941,61155,5096.25


In [82]:
X_test.columns

Index(['BankruptcyHistory', 'TotalDebtToIncomeRatio', 'DebtToIncomeRatio',
       'NetWorth', 'AnnualIncome', 'MonthlyIncome'],
      dtype='object')

In [83]:
y_test[:5]

10650    41.6
2041     38.4
8668     53.0
1114     50.0
13902    51.0
Name: RiskScore, dtype: float64

In [84]:
X_train

Unnamed: 0,BankruptcyHistory,TotalDebtToIncomeRatio,DebtToIncomeRatio,NetWorth,AnnualIncome,MonthlyIncome
5894,0,0.932622,0.297167,59717,15000,1250.000000
3728,0,0.256035,0.762549,4711,49855,4154.583333
8958,0,0.299648,0.128680,2228,38932,3244.333333
7671,0,0.318753,0.308723,52964,58116,4843.000000
5999,0,0.335275,0.465777,189178,62939,5244.916667
...,...,...,...,...,...,...
11284,0,0.496160,0.179831,525387,37134,3094.500000
11964,1,0.927284,0.165276,7558,15000,1250.000000
5390,0,0.194998,0.391030,654095,48088,4007.333333
860,0,0.458332,0.676942,35612,15265,1272.083333


In [85]:
X["RiskScore"]=data["RiskScore"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["RiskScore"]=data["RiskScore"]


In [86]:
# Target variable y (LoanApproved column)
y = data['LoanApproved']

# Split the target variable into train and test sets (same test size and random state as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now y_train and y_test are created
print("y_train and y_test prepared.")




y_train and y_test prepared.


In [87]:
X_train[:1]

Unnamed: 0,BankruptcyHistory,TotalDebtToIncomeRatio,DebtToIncomeRatio,NetWorth,AnnualIncome,MonthlyIncome,RiskScore
5894,0,0.932622,0.297167,59717,15000,1250.0,49.0


In [88]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize Logistic Regression model
log_reg = LogisticRegression(solver='lbfgs', max_iter=500, random_state=42)

# Fit the model on training data
log_reg.fit(X_train, y_train)

# Predict on test data
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {accuracy_log_reg:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))


Logistic Regression Accuracy: 0.9898
Confusion Matrix:
[[2968   15]
 [  26  991]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2983
           1       0.99      0.97      0.98      1017

    accuracy                           0.99      4000
   macro avg       0.99      0.98      0.99      4000
weighted avg       0.99      0.99      0.99      4000



In [93]:
import pickle
# Save the model to a file
with open('rf_regression_model.pkl', 'wb') as file:
    pickle.dump(rf_reg, file)


In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Fit the model on training data
rf_clf.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_clf.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9922
Confusion Matrix:
[[2971   12]
 [  19  998]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2983
           1       0.99      0.98      0.98      1017

    accuracy                           0.99      4000
   macro avg       0.99      0.99      0.99      4000
weighted avg       0.99      0.99      0.99      4000



In [90]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize Support Vector Classifier model
svc_clf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Fit the model on training data
svc_clf.fit(X_train, y_train)

# Predict on test data
y_pred_svc = svc_clf.predict(X_test)

# Evaluate the model
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print(f"SVC Accuracy: {accuracy_svc:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svc))
print("Classification Report:")
print(classification_report(y_test, y_pred_svc))


SVC Accuracy: 0.8488
Confusion Matrix:
[[2869  114]
 [ 491  526]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      2983
           1       0.82      0.52      0.63      1017

    accuracy                           0.85      4000
   macro avg       0.84      0.74      0.77      4000
weighted avg       0.85      0.85      0.84      4000



In [91]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize KNN model
knn_clf = KNeighborsClassifier(n_neighbors=5, metric='minkowski')

# Fit the model on training data
knn_clf.fit(X_train, y_train)

# Predict on test data
y_pred_knn = knn_clf.predict(X_test)

# Evaluate the model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {accuracy_knn:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))


KNN Accuracy: 0.8293
Confusion Matrix:
[[2710  273]
 [ 410  607]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      2983
           1       0.69      0.60      0.64      1017

    accuracy                           0.83      4000
   macro avg       0.78      0.75      0.76      4000
weighted avg       0.82      0.83      0.83      4000



In [23]:
X_train.columns

Index(['Age', 'AnnualIncome', 'CreditScore', 'EmploymentStatus',
       'EducationLevel', 'Experience', 'LoanAmount', 'LoanDuration',
       'MaritalStatus', 'NumberOfDependents', 'HomeOwnershipStatus',
       'MonthlyDebtPayments', 'CreditCardUtilizationRate',
       'NumberOfOpenCreditLines', 'NumberOfCreditInquiries',
       'DebtToIncomeRatio', 'BankruptcyHistory', 'LoanPurpose',
       'PreviousLoanDefaults', 'PaymentHistory', 'LengthOfCreditHistory',
       'SavingsAccountBalance', 'CheckingAccountBalance', 'TotalAssets',
       'TotalLiabilities', 'MonthlyIncome', 'UtilityBillsPaymentHistory',
       'JobTenure', 'NetWorth', 'BaseInterestRate', 'InterestRate',
       'MonthlyLoanPayment', 'TotalDebtToIncomeRatio'],
      dtype='object')

In [17]:
reg_data=X_train

In [22]:
import pickle

# Save the model to a file
with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(lin_reg, file)

# Later, to load the model back
with open('linear_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


In [29]:
a=X_test[:1].values
a

array([[3.70000000e+01, 4.91460000e+04, 5.66000000e+02, 0.00000000e+00,
        0.00000000e+00, 1.30000000e+01, 5.45810000e+04, 2.40000000e+01,
        3.00000000e+00, 1.00000000e+00, 3.00000000e+00, 1.77500000e+03,
        2.12542462e-01, 2.00000000e+00, 2.00000000e+00, 2.39522804e-01,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 2.60000000e+01,
        2.60000000e+01, 6.41400000e+03, 7.23000000e+02, 1.30217000e+05,
        1.71700000e+04, 4.09550000e+03, 7.00054401e-01, 1.00000000e+01,
        1.13047000e+05, 2.46581000e-01, 2.60258203e-01, 2.94123857e+03,
        1.15156601e+00]])

In [31]:
# Load the saved linear regression model
model_path = r"D:\Loan Approval\linear_regression_model.pkl"
with open(model_path, 'rb') as f:
    model = pickle.load(f)
    risk_score = model.predict(a)
risk_score

array([-1.68409218e+17])