In [9]:
import pandas as pd
import statsmodels.api as sm

file = '../data/mortgage.csv'
loan_df = pd.read_csv(file)
pd.set_option('display.max_columns', None)
print(loan_df.head())



        id  year     issue_d  final_d  emp_length_int home_ownership  \
0  1062177  2011  01/12/2011  1062013             2.0       MORTGAGE   
1  1049352  2011  01/12/2011  1042013             1.0       MORTGAGE   
2  1062976  2011  01/12/2011  1042013            10.0           RENT   
3  1058564  2011  01/12/2011  1122014             6.0           RENT   
4  1061837  2011  01/12/2011  1122014             7.0           RENT   

  income_category  annual_inc  income_cat  loan_amount        term  term_cat  \
0             Low       44400           1        15000   36 months         1   
1             Low      100000           1         6600   36 months         1   
2             Low       45000           1         4000   60 months         2   
3             Low       57600           1         8000   36 months         1   
4             Low       60000           1        15000   36 months         1   

  application_type purpose  purpose_cat interest_payments loan_condition  \
0       IN

In [15]:
# variables
features = ['annual_inc', 'emp_length_int', 'interest_rate', 'dti', 'grade_cat']
target = 'loan_amount'

X = loan_df[features]
y = loan_df[target]

X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:            loan_amount   R-squared:                       0.228
Model:                            OLS   Adj. R-squared:                  0.227
Method:                 Least Squares   F-statistic:                     218.2
Date:                Sun, 23 Jun 2024   Prob (F-statistic):          1.64e-204
Time:                        17:25:05   Log-Likelihood:                -38486.
No. Observations:                3707   AIC:                         7.698e+04
Df Residuals:                    3701   BIC:                         7.702e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const           4422.4237    643.432      6.

#results
1) The model explains about 23% of the variation in loan amounts (R-squared = 0.228). This means it's not very accurate, as many other factors also affect loan amounts.
2) The F-statistic is 218.2, and the associated p-value (Prob > F) is extremely low (1.64e-204), indicating that the overall model is statistically significant and that at least one of the predictors is significantly related to the loan amount.
3) Annual Income (annual_inc): The coefficient is 0.0697, with a very low p-value (<0.0001), indicating that annual income is a significant positive predictor of loan amount.
4) Employment Length (emp_length_int): The coefficient is 142.4458, with a p-value of 0.000. This indicates that for each additional year of employment, the loan amount increases by approximately 142.45 dollars.
5) Interest Rate (interest_rate): The coefficient is 378.9188, with a p-value of 0.000. This suggests that a higher interest rate is associated with a higher loan amount. For each percentage point increase in the interest rate, the loan amount increases by about 378.92 dollars.
6) Debt-to-Income Ratio (dti): The coefficient is -46.5928, with a p-value of 0.003, indicating that higher DTI ratios are associated with lower loan amounts. 
7) Grade Category (grade_cat): The coefficient is -385.4102, with a p-value of 0.200. This suggests that grade category is not a statistically significant predictor of loan amount in this model.

In [18]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# variables
features = ['annual_inc', 'emp_length_int', 'interest_rate', 'dti', 'grade_cat']
target = 'loan_condition'

X = loan_df[features]
y = loan_df[target]

# 
y = y.map({'Good Loan': 1, 'Bad Loan': 0})

# testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Add a constant
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# Fit the logistic regression model
logit_model = sm.Logit(y_train, X_train).fit()


print(logit_model.summary())

# predictions
y_pred_prob = logit_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)


print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Optimization terminated successfully.
         Current function value: 0.335409
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:         loan_condition   No. Observations:                 2965
Model:                          Logit   Df Residuals:                     2959
Method:                           MLE   Df Model:                            5
Date:                Sun, 23 Jun 2024   Pseudo R-squ.:                 0.03955
Time:                        17:27:55   Log-Likelihood:                -994.49
converged:                       True   LL-Null:                       -1035.4
Covariance Type:            nonrobust   LLR p-value:                 3.344e-16
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.2024      0.066     33.601      0.000       2.074       2.331
x1             0.4160      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
