# Lasso Regression on Loan Default Data
## With Implementations in Scikit-Learn and with Linear Algebra


## Import libraries and Data

In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import linear_model

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import time

In [2]:
df = pd.read_csv('../Data/cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,loan_amount,term,income,Credit_Score,Status,loan_limit_cf,loan_limit_ncf,submission_of_application_not_inst,submission_of_application_to_inst,...,Region_North-East,Region_central,Region_south,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74
0,0,116500,360.0,1740.0,758,1,1,0,0,1,...,0,0,1,1,0,0,0,0,0,0
1,1,206500,360.0,4980.0,552,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,2,406500,360.0,9480.0,834,0,1,0,0,1,...,0,0,1,0,1,0,0,0,0,0
3,3,456500,360.0,11880.0,587,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,4,696500,360.0,10440.0,602,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [3]:
# Split target and features
X = df.drop('Credit_Score', axis=1)
y = df['Credit_Score']
display(y)

0         758
1         552
2         834
3         587
4         602
         ... 
135103    659
135104    569
135105    702
135106    737
135107    830
Name: Credit_Score, Length: 135108, dtype: int64

In [4]:
# Training and testing data
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3, 
  random_state=42
)
X_train.head()

Unnamed: 0.1,Unnamed: 0,loan_amount,term,income,Status,loan_limit_cf,loan_limit_ncf,submission_of_application_not_inst,submission_of_application_to_inst,co-applicant_credit_type_CIB,...,Region_North-East,Region_central,Region_south,age_25-34,age_35-44,age_45-54,age_55-64,age_65-74,age_<25,age_>74
34829,34829,626500,360.0,15720.0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
116059,116059,226500,360.0,3780.0,0,1,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0
18200,18200,526500,360.0,10020.0,0,1,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
67940,67940,46500,360.0,780.0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
91562,91562,726500,360.0,11640.0,0,1,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0


## Fitting Scikit-Learn LASSO Regression

In [5]:
start_time = time.time()
mean_squared_errors = []
range =  np.arange(0.01, 2, 0.1)

# Lasso Regression
for alpha in range:
  lasso = linear_model.Lasso(alpha=alpha)
  lasso.fit(X_train, y_train)

  # Predict and calculate MSE
  y_pred = lasso.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  mean_squared_errors.append(mse)

# Plot MSE vs Alpha
fig = go.Figure(
  data=px.scatter(
    x=range,
    y=mean_squared_errors,
    title="Scikit-Learn LASSO Regression MSE vs Alpha Penalty",
    labels = {
      'x': 'Alpha',
      'y': 'MSE'
    }
  )
)

fig.show()

In [6]:
print(
  f"Time taken: {time.time() - start_time} seconds"
)

Time taken: 3.6953959465026855 seconds


### Best Model is about Alpha = 0.40 without overfitting 

In [7]:
alpha = 0.4
lasso = linear_model.Lasso(alpha=alpha)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

# Print predictions from this model
y_pred

array([699.1149367 , 699.63347899, 699.01724506, ..., 699.69943952,
       699.27318391, 700.48033604])

In [8]:
# Sort features by coefficient
coef_df = pd.DataFrame({'feature': X_test.columns, 'coef': lasso.coef_})
coef_df.sort_values(by='coef', ascending=False)

Unnamed: 0,feature,coef
1,loan_amount,0.000005
32,occupancy_type_pr,0.000000
34,loan_type_type1,0.000000
35,loan_type_type2,-0.000000
36,loan_type_type3,-0.000000
...,...,...
28,Credit_Worthiness_l2,-0.000000
29,approv_in_adv_nopre,0.000000
0,Unnamed: 0,-0.000008
3,income,-0.000025


### Linear Algebra Implementation

$beta_{lasso} = (X^TX + \alpha I)^{-1} X^TY$

In [9]:
mean_squared_errors = []
I = np.identity(X_train.shape[1])
range =  np.arange(0.01, 30, 1)
start_time = time.time()

# Lasso Regression
for alpha in range:
  beta_lasso = np.dot(
    np.dot(
      np.linalg.inv(
        np.dot(
          X_train.T,
          X_train
        ) + alpha * I
      ),
      X_train.T
    )
    ,y_train
  )
  
  # Predict and calculate MSE
  y_pred = np.dot(beta_lasso, X_test.T)
  mse = mean_squared_error(y_test, y_pred)
  mean_squared_errors.append(mse)

# Plot MSE vs Alpha
fig = go.Figure(
  data=px.scatter(
    x=range,
    y=mean_squared_errors,
    title="Linear Algebra LASSO Regression MSE vs Alpha Penalty",
    labels = {
      'x': 'Alpha',
      'y': 'MSE'
    }
  )
)

fig.show()

In [10]:
print(
  f"Time taken: {time.time() - start_time} seconds"
)

Time taken: 4.531019926071167 seconds


### Smallest MSE is about Alpha = 17

In [11]:
# Fit Lasso with best alpha
alpha = 17 
I = np.identity(X_train.shape[1])

beta_lasso = np.dot(
  np.dot(
    np.linalg.inv(
      np.dot(
        X_train.T,
        X_train
      ) + alpha * I
    ),
    X_train.T
  )
  ,y_train
)

# Print predictions from this model
np.dot(beta_lasso, X_test.T)

array([699.50060926, 699.08894103, 698.9748446 , ..., 702.82086502,
       699.04121109, 701.92055688])

In [12]:
# Sort features by coefficient
coef_df = pd.DataFrame({'feature': X_test.columns, 'coef': beta_lasso})
coef_df.sort_values(by='coef', ascending=False)

Unnamed: 0,feature,coef
24,business_or_commercial_nob/c,46.362488
16,Security_Type_direct,43.689665
14,construction_type_sb,43.689665
11,Secured_by_home,43.689665
26,open_credit_opc,43.154929
...,...,...
4,Status,0.861535
1,loan_amount,0.000004
3,income,-0.000007
0,Unnamed: 0,-0.000008


### TODO: Remove features with coef close to 0
### TODO: Fit new model