In [55]:
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import ML_user_def_functions as ML_fns
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
import statsmodels
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as cal_vif
from sklearn.model_selection import train_test_split

# Problem of Multicollinearity #
To reduce multicollinearity, one can drop columns whose vif is large and removing columns that are indirectly related to existing columns. But there one can use another method called **shrinkage**. It is also called **regularization**, and there are 1. ridge and 2. lasso techniques.
### Ridge regression
In ordinary regression the quantity that is minimized is the **RSS** value:
\begin{equation}
    RSS = \sum_{i=1}^{n}\left( y_{i} - \beta_{0} - \sum_{j=1}^{p} \beta_{j}x_{ij} \right)^{2}
\end{equation}
In ridge regresion, the estimates of the coefficients $\beta_{j}$ is regularized by adding a **shrinkage term** to the loss function that needs to be minimized, as follows:
\begin{equation}
    ridge = RSS + \lambda\sum_{j}\beta_{j}^{2}
\end{equation}
### Lasso regression
Similar to ridge regression, Lasso regression adds a penalty term:
\begin{equation}
    lasso = RSS + \lambda\sum_{j}\left| \beta_{j} \right|
\end{equation}

In [37]:
# importing the csv data
path_to_data    =    "/home/jwalitnpanchal/college_work/sem10/ML_AI/case_studies/cleaned_individual_death_claims.csv"
df    =    pd.read_csv(path_to_data)
sm_df =    sm.add_constant(df)
#print(sm_df)

choose a subset of the data set, dropping the 2nd and third coloumns

In [4]:
# dropping columns that are not needed
sub_set_sm_df    =    sm_df.drop(["category","year","life_insurer"], axis='columns')
#print(sub_set_sm_df)

In [40]:
# creating a datafram that will save against each coloumn name, the variance inflation factor (VIF)
vif_data    =    pd.DataFrame()
vif_data["columns"]    =    sub_set_sm_df.columns
#print(vif_data)
vif_data["vif"]    =    [cal_vif(sub_set_sm_df,i) for i in range(len(sub_set_sm_df.columns))]
print(vif_data)

                                 columns           vif
0                                  const  6.788540e+01
1                claims_pending_start_no  9.271340e+09
2               claims_pending_start_amt  3.658810e+09
3                    claims_intimated_no  6.928615e+14
4                   claims_intimated_amt  1.787143e+13
5                        total_claims_no  6.928615e+14
6                       total_claims_amt  1.834460e+13
7                         claims_paid_no  2.649800e+09
8                        claims_paid_amt  2.277792e+07
9                   claims_repudiated_no  1.774189e+05
10                 claims_repudiated_amt  1.706098e+04
11                    claims_rejected_no  5.258601e+04
12                   claims_rejected_amt  1.159018e+03
13                   claims_unclaimed_no  1.088278e+05
14                  claims_unclaimed_amt  3.436708e+03
15                 claims_pending_end_no  2.644257e+04
16                claims_pending_end_amt  7.385875e+03
17        

In [47]:
############### removing the coloumns with amt suffix
### selecting columns that end in amt str #############
columns    =    sub_set_sm_df.columns
column_end_amt    =    []
for i in range(2,23,2):
    column_end_amt.append(columns[i])
#print(column_end_amt)
no_amt_sm_df    =    sub_set_sm_df.drop([i for i in column_end_amt], axis="columns") 
#print(no_amt_sm_df)

['claims_pending_start_amt', 'claims_intimated_amt', 'total_claims_amt', 'claims_paid_amt', 'claims_repudiated_amt', 'claims_rejected_amt', 'claims_unclaimed_amt', 'claims_pending_end_amt', 'claims_paid_ratio_amt', 'claims_repudiated_rejected_ratio_amt', 'claims_pending_ratio_amt']


In [42]:
### stacking the colums into one parameters' variable ########
#print(sub_set_sm_df.columns[0:2])
parameter_variable    =    [np.column_stack(no_amt_sm_df[data]) for data in no_amt_sm_df.columns] 
#print(parameter_variable)

[array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.]]), array([[  11,    0,   50,    0,    5,    0,   25,    2,   63,    8,  178,
          16,   15, 3055,   16, 1725,    1,    0, 1330,    1,    4,  912,
           8,    2,   11,    2,   19,   19,    0,    5,    5,    5,    2,
           2,    2,    2,    2,    0,    0,   38,   38,    3,    3,   35,
          35,   95,   95

In [41]:
### now checking the vif on this no amt data set ###############
no_amt_vif    =    pd.DataFrame()
no_amt_vif["columns"]    =    no_amt_sm_df.columns
no_amt_vif["vif"]      =    [cal_vif(no_amt_sm_df,i) for i in range(len(no_amt_sm_df.columns))]
print(no_amt_vif)

                                columns           vif
0                                 const  3.727452e+01
1               claims_pending_start_no  3.727140e+07
2                   claims_intimated_no  2.707304e+12
3                       total_claims_no  2.723677e+12
4                        claims_paid_no  1.814612e+08
5                  claims_repudiated_no  1.162922e+04
6                    claims_rejected_no  3.175224e+03
7                   claims_unclaimed_no  7.554493e+03
8                 claims_pending_end_no  1.805481e+03
9                  claims_paid_ratio_no  1.013725e+00
10  claims_repudiated_rejected_ratio_no  1.067972e+00
11              claims_pending_ratio_no  1.091545e+00


**we are interested in how many clains have been paid as response to remaining predictors**

In [73]:
claims_paid    =     np.asarray(no_amt_sm_df["total_claims_no"].values).reshape(-1,1)
#print(claims_paid)
column_end_amt_no_paid    =    no_amt_sm_df.drop("total_claims_no",axis="columns")
list_of_data    =    [column_end_amt_no_paid[cols] for cols in column_end_amt_no_paid.columns]
predictor_variable    =    np.column_stack(list_of_data) 
#print(predictor_variable)
### split into train and test data sets #############
train_predictors, test_predictors, train_response, test_response    =    train_test_split(predictor_variable, claims_paid,
                                                                                          random_state=43, test_size=0.25)
#print(train_predictors)
##  use the statsmodels to fit an OLS and then regularized
# OLS
model    =    sm.OLS(train_response, train_predictors)
ols_model=    model.fit()
# ridge regression
ridge_model    =    model.fit_regularized(method='elastic_net', alpha=1.0, L1_wt=0.0)
#ridge_model    =    sm.OLS(train_response, train_predictors).fit_regularized(method='elastic_net', alpha=1.0, L1_wt=0.0)

In [74]:
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.891e+13
Date:                Sat, 31 Jan 2026   Prob (F-statistic):               0.00
Time:                        02:59:06   Log-Likelihood:                 27.755
No. Observations:                 111   AIC:                            -33.51
Df Residuals:                     100   BIC:                            -3.706
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0005      0.141     -0.003      0.9

In [76]:
print(ridge_model.params)

[-7.07452183e-03  9.94828417e-01  9.94777555e-01  5.22563434e-03
  5.05849014e-03  4.49867456e-03  5.30860235e-03  5.12433130e-03
 -6.60984709e-03  1.62320915e-05  1.88406333e-05]
