In [1]:
# https://machinelearningmastery.com/lasso-regression-with-python/
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso

In [3]:
# Read the CRR table and discard its first column (presumably a row index
# written by an earlier to_csv call -- TODO confirm against the CSV).
df = pd.read_csv("0416_CRR_df.csv").pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)
df

Unnamed: 0,CONFIRM_YN_NA,CONFIRM_YN_Y,COMPLEX_CS_FG_NA,COMPLEX_CS_FG_Y,AUTHORIZED_NA,AUTHORIZED_Y,BEARER_SHARE_NA,BEARER_SHARE_Y,ISSUE_BEARER_NA,ISSUE_BEARER_Y,...,CMFCUS1_BUSINESS_CODE,RISK_LEVEL,JOB_RISK,CMFCUS25_SP_RATING,CMFCUS25_MOODYS_RATING,CMFCUS25_SRT_SP_RATING,CMFCUS25_SRT_MOODYS_RATING,CMFCUS25_SRT_FITCH_RATING,NA count,SAR_ALERT
0,0,1,0,0,0,0,1,0,0,0,...,2,3,2,0,0,0,0,0,10,0
1,0,1,1,0,0,0,1,0,1,0,...,2,2,1,0,0,0,0,0,12,0
2,1,0,1,0,0,0,1,0,0,0,...,3,3,3,0,0,0,0,0,11,0
3,0,1,0,0,1,0,0,0,0,0,...,1,2,3,0,0,0,0,0,12,0
4,0,1,0,0,0,0,0,0,0,0,...,2,2,2,0,0,0,0,0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80127,0,1,1,0,1,0,0,0,1,0,...,1,2,3,0,0,0,0,0,16,0
80128,0,1,0,0,0,0,1,0,1,0,...,2,3,3,0,0,0,0,0,13,0
80129,0,1,0,0,1,0,0,1,0,0,...,1,3,3,0,0,0,0,0,14,0
80130,1,0,0,0,1,0,1,0,0,0,...,2,2,3,0,0,0,0,0,14,0


In [4]:
# Remove the 'NA count' column; every other column is kept unchanged.
df = df.drop(columns=['NA count'])
df

Unnamed: 0,CONFIRM_YN_NA,CONFIRM_YN_Y,COMPLEX_CS_FG_NA,COMPLEX_CS_FG_Y,AUTHORIZED_NA,AUTHORIZED_Y,BEARER_SHARE_NA,BEARER_SHARE_Y,ISSUE_BEARER_NA,ISSUE_BEARER_Y,...,REL_PEPS_COUNT,CMFCUS1_BUSINESS_CODE,RISK_LEVEL,JOB_RISK,CMFCUS25_SP_RATING,CMFCUS25_MOODYS_RATING,CMFCUS25_SRT_SP_RATING,CMFCUS25_SRT_MOODYS_RATING,CMFCUS25_SRT_FITCH_RATING,SAR_ALERT
0,0,1,0,0,0,0,1,0,0,0,...,0,2,3,2,0,0,0,0,0,0
1,0,1,1,0,0,0,1,0,1,0,...,0,2,2,1,0,0,0,0,0,0
2,1,0,1,0,0,0,1,0,0,0,...,0,3,3,3,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,...,0,1,2,3,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,2,2,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80127,0,1,1,0,1,0,0,0,1,0,...,0,1,2,3,0,0,0,0,0,0
80128,0,1,0,0,0,0,1,0,1,0,...,0,2,3,3,0,0,0,0,0,0
80129,0,1,0,0,1,0,0,1,0,0,...,0,1,3,3,0,0,0,0,0,0
80130,1,0,0,0,1,0,1,0,0,0,...,0,2,2,3,0,0,0,0,0,0


In [5]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of df, each one regressed on all
# the remaining columns. Row i of vif_data corresponds to df.columns[i].
# NOTE(review): df still contains the target column here, so the target both
# receives a VIF of its own and takes part in the other columns' VIFs --
# confirm this is intended.
vif_data = pd.DataFrame(
    {
        "feature": df.columns,
        "VIF": [
            variance_inflation_factor(df.values, col_idx)
            for col_idx in range(df.shape[1])
        ],
    }
)

print(vif_data)

                        feature       VIF
0                 CONFIRM_YN_NA  4.960204
1                  CONFIRM_YN_Y  6.610092
2              COMPLEX_CS_FG_NA  1.629500
3               COMPLEX_CS_FG_Y  1.010269
4                 AUTHORIZED_NA  1.647014
..                          ...       ...
117      CMFCUS25_MOODYS_RATING  1.004275
118      CMFCUS25_SRT_SP_RATING  1.004992
119  CMFCUS25_SRT_MOODYS_RATING  1.002261
120   CMFCUS25_SRT_FITCH_RATING  1.001481
121                   SAR_ALERT  1.272449

[122 rows x 2 columns]


In [6]:
# Columns whose VIF is 10 or more (these are removed in the next step).
df.columns[np.flatnonzero((vif_data["VIF"] >= 10).to_numpy())]

Index(['CMFCUS25_AE_TYPE_N', 'CMFCUS25_CERTI_FLAG_NA', 'CMFCUS25_CERTI_FLAG_Y',
       'CORP_TYPE_4', 'CORP_TYPE_5', 'CORP_TYPE_NA', 'CORP_TYPE_Z',
       'CMFCUS1_BUSINESS_FLAG', 'DP_FG', 'RISK_LEVEL'],
      dtype='object')

In [23]:
# Keep only the columns whose VIF is below 10; vif_data rows line up
# positionally with df's columns.
new_df = df.iloc[:, np.flatnonzero((vif_data["VIF"] < 10).to_numpy())]
new_df

Unnamed: 0,CONFIRM_YN_NA,CONFIRM_YN_Y,COMPLEX_CS_FG_NA,COMPLEX_CS_FG_Y,AUTHORIZED_NA,AUTHORIZED_Y,BEARER_SHARE_NA,BEARER_SHARE_Y,ISSUE_BEARER_NA,ISSUE_BEARER_Y,...,REL_ADVRS_COUNT,REL_PEPS_COUNT,CMFCUS1_BUSINESS_CODE,JOB_RISK,CMFCUS25_SP_RATING,CMFCUS25_MOODYS_RATING,CMFCUS25_SRT_SP_RATING,CMFCUS25_SRT_MOODYS_RATING,CMFCUS25_SRT_FITCH_RATING,SAR_ALERT
0,0,1,0,0,0,0,1,0,0,0,...,0,0,2,2,0,0,0,0,0,0
1,0,1,1,0,0,0,1,0,1,0,...,0,0,2,1,0,0,0,0,0,0
2,1,0,1,0,0,0,1,0,0,0,...,0,0,3,3,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,...,0,0,1,3,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,2,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80127,0,1,1,0,1,0,0,0,1,0,...,0,0,1,3,0,0,0,0,0,0
80128,0,1,0,0,0,0,1,0,1,0,...,0,0,2,3,0,0,0,0,0,0
80129,0,1,0,0,1,0,0,1,0,0,...,0,0,1,3,0,0,0,0,0,0
80130,1,0,0,0,1,0,1,0,0,0,...,0,0,2,3,0,0,0,0,0,0


In [19]:
# Define a function to Calculate information value
# https://gist.github.com/danyashorokh/b2f894c2ab29ba927944493597dca152
def calc_iv(df, feature, target, pr=0):
    """Return the information value (IV) of ``feature`` with respect to ``target``.

    Rows are grouped by the distinct values of ``df[feature]``; a row counts as
    "bad" when ``df[target] == 1`` and as "good" otherwise. Categories that
    contain no bad rows are discarded *before* the good/bad distributions are
    normalised, so the totals used for the distributions only cover the
    categories that are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the feature and the target column.
    feature : label
        Column whose categories are evaluated. Rows where it is NaN are ignored.
    target : label
        Column holding the outcome; the value 1 marks a bad row.
    pr : int, default 0
        When equal to 1, print the per-category table (counts, shares,
        distributions, WoE and IV).

    Returns
    -------
    float
        Sum over the kept categories of ``WoE * (dist_good - dist_bad)``.
        ``0.0`` when no category contains a bad row. The result can be
        non-finite when a kept category contains no good rows, because its
        WoE is ``log(0)``.
    """
    # One pass over the data: group the bad-row indicator by feature value.
    # groupby skips NaN keys and sees every observed category exactly once.
    is_bad = df[target] == 1
    per_value = is_bad.groupby(df[feature])
    data = pd.DataFrame({
        'All': per_value.size(),
        'Bad': per_value.sum().astype(int),
    })
    data = data.rename_axis('Value').reset_index()
    data.insert(0, 'Variable', feature)

    # WoE is undefined for categories without bad rows; copy so the column
    # assignments below act on an independent frame rather than a slice.
    data = data[data['Bad'] > 0].copy()

    if data.empty:
        # Nothing to compare against: the feature carries no information.
        if pr == 1:
            print(data)
        return 0.0

    total_all = data['All'].sum()
    total_bad = data['Bad'].sum()

    data['Share'] = data['All'] / total_all
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (total_all - total_bad)
    data['Distribution Bad'] = data['Bad'] / total_bad
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    iv = (data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])).sum()
    # Same scalar repeated on every row so it shows up in the printed table
    data['IV'] = iv

    data = data.sort_values(by=['Variable', 'Value'], ascending=True)

    if pr == 1:
        print(data)

    return float(iv)

In [27]:
# Spot check: IV of the first screened column against the last column (the target).
calc_iv(df=new_df, feature=new_df.columns[0], target=new_df.columns[-1])

0.0020815510667355095

In [28]:
# Information value of every column except the last one, which is used as the
# target. Row i of IV_df corresponds to new_df.columns[i].
IV_df = pd.DataFrame(
    {
        "feature": new_df.columns[:-1],
        "IV": [
            calc_iv(new_df, column, new_df.columns[-1])
            for column in new_df.columns[:-1]
        ],
    }
)

print(IV_df)

                        feature        IV
0                 CONFIRM_YN_NA  0.002082
1                  CONFIRM_YN_Y  0.000562
2              COMPLEX_CS_FG_NA  0.000019
3               COMPLEX_CS_FG_Y  0.000692
4                 AUTHORIZED_NA  0.000536
..                          ...       ...
106          CMFCUS25_SP_RATING  0.001405
107      CMFCUS25_MOODYS_RATING  0.000000
108      CMFCUS25_SRT_SP_RATING  0.000161
109  CMFCUS25_SRT_MOODYS_RATING  0.002712
110   CMFCUS25_SRT_FITCH_RATING  0.000015

[111 rows x 2 columns]


In [29]:
# Columns whose information value exceeds 0.1 (IV_df rows are positional).
new_df.columns[np.flatnonzero((IV_df["IV"] > 0.1).to_numpy())]

Index(['AMT_RANGE_2.0', 'AMT_RANGE_3.0', 'OBU_ANNUAL_INCOME_2.0',
       'OBU_ANNUAL_INCOME_3.0', 'CMFCUS1_FINANCIAL_ACT', 'EB_FG'],
      dtype='object')

In [None]:
# After screening by VIF and IV, only four kinds of variables remain

In [30]:
# Keep only the columns whose IV exceeds 0.1. Selection is positional because
# IV_df has one row per non-target column of new_df, in the same order.
new_df = new_df.iloc[:, np.flatnonzero((IV_df["IV"] > 0.1).to_numpy())]
new_df

Unnamed: 0,AMT_RANGE_2.0,AMT_RANGE_3.0,OBU_ANNUAL_INCOME_2.0,OBU_ANNUAL_INCOME_3.0,CMFCUS1_FINANCIAL_ACT,EB_FG
0,0,0,0,1,0,0
1,0,0,0,0,1,0
2,0,0,0,1,0,0
3,0,0,0,0,0,1
4,0,0,0,0,1,0
...,...,...,...,...,...,...
80127,0,0,0,0,0,0
80128,0,0,0,0,0,1
80129,0,0,0,0,1,1
80130,0,0,0,0,1,1


In [31]:
# Put the last column of df (the target) back next to the screened features,
# then write the combined table to disk (row index included, as before).
new_df = pd.concat((new_df, df.iloc[:, [-1]]), axis="columns")
new_df.to_csv(path_or_buf="CRR_VIF_IV_screen.csv")

In [None]:
# Predictors are all columns but the last; the last column is the response.
# NOTE(review): this splits the full `df`, not the VIF/IV-screened `new_df` --
# confirm that fitting on the unscreened columns is intended.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
# Lasso regression whose L1 penalty strength (alpha) is tuned by grid search
model = Lasso()
# 10-fold cross-validation, a single repeat, fixed seed so the folds are reproducible
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# Candidate alphas 0.01, 0.02, ..., 0.99. alpha=0 is deliberately left out: it
# reduces Lasso to ordinary least squares, which scikit-learn advises against
# fitting through the Lasso object for numerical reasons.
grid = dict()
grid['alpha'] = np.arange(1, 100) / 100
# Exhaustive search over the grid, scored by (negated) mean absolute error
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X, y)
# best_score_ is the *negated* MAE (scikit-learn maximises scores), so flip the
# sign to report the MAE itself
print('MAE: %.3f' % (-results.best_score_))
print('Config: %s' % results.best_params_)

In [None]:
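# In scikit-learn's Lasso, alpha is the strength of the L1 penalty: alpha=0 is equivalent to OLS and larger values shrink more coefficients to zero.
# The lasso/ridge mixing weight (1=lasso, 0=ridge, in between=elastic net) is ElasticNet's l1_ratio, not alpha.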