In [215]:
import pandas as pd 
import numpy as np 
import plotly.express as px 
import seaborn as sns 
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
import kaleido
import os
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.model_selection import train_test_split,StratifiedKFold
BASE_PATH = '../data/raw'
TRAINING_PATH = os.path.join(BASE_PATH,'cs-training.csv')
TEST_PATH = os.path.join(BASE_PATH,'cs-test.csv')

TARGET = 'SeriousDlqin2yrs'


INTERIM_BASE_PATH = '../data/interim'
TRAINING_INTERIM_PATH = os.path.join(INTERIM_BASE_PATH,'training_interim.csv')
TEST_INTERIM_PATH = os.path.join(INTERIM_BASE_PATH,'test_interim.csv')

In [216]:
data = pd.read_csv(TRAINING_PATH)

In [217]:
median_income = data["MonthlyIncome"].median()

number_of_dependents_mode = data["NumberOfDependents"].mode()[0]
data["MonthlyIncome"] = data["MonthlyIncome"].fillna(
    median_income
)
data["NumberOfDependents"] = data["NumberOfDependents"].fillna(
    number_of_dependents_mode
)

In [208]:
def outlier_treatment(data,col):
    q1,q3 = np.percentile(data[col] ,[25,75])
    iqr = q3-q1
    lower_range = q1 - (1.5 * iqr)
    upper_range = q3 + (1.5 * iqr)
    lower_range,upper_range
    data.drop(data[ (data[col] > upper_range) | (data[col] < lower_range) ].index , inplace=True)
    return data

In [166]:
#data = outlier_treatment(data,'MonthlyIncome')
#data = outlier_treatment(data,'RevolvingUtilizationOfUnsecuredLines')
#data = outlier_treatment(data,'DebtRatio')

In [209]:
data = data[data['RevolvingUtilizationOfUnsecuredLines']<=1]
data = data[data['DebtRatio']<=1]

In [None]:
data['

In [198]:
#log income 
scaler_income = RobustScaler()
data.loc[:,'STDmonthly'] = scaler_income.fit_transform(data['MonthlyIncome'].to_numpy().reshape(-1,1))
#data['LogDebtRatio'] = np.log1p(data['DebtRatio'])
#data['LogRevolvingUtilizationOfUnsecuredLines'] = np.log1p(data['RevolvingUtilizationOfUnsecuredLines'])

In [199]:
scaler_debt = RobustScaler()
data.loc[:,'STDDebtRatio'] = scaler_debt.fit_transform(data['DebtRatio'].to_numpy().reshape(-1,1))

In [200]:
scaler_revolving = RobustScaler()
data.loc[:,'STDRevolving'] = scaler_revolving.fit_transform(data['RevolvingUtilizationOfUnsecuredLines'].to_numpy().reshape(-1,1))

In [188]:
hist_revolving_univariate = px.histogram(data_frame=data,x='RevolvingUtilizationOfUnsecuredLines',nbins=150,
             log_y=False,orientation='v',opacity=0.5,marginal='box',template='ggplot2',
             color=TARGET)
hist_revolving_univariate.update_layout(title='Logged count of RevolvingUtilizationOfUnsecuredLines')
pio.show(hist_revolving_univariate,validate=False,renderer='iframe')

In [173]:
hist_revolving_univariate = px.histogram(data_frame=data,x='MonthlyIncome',nbins=150,
             log_y=False,orientation='v',opacity=0.5,marginal='box',template='ggplot2',
             color=TARGET)
hist_revolving_univariate.update_layout(title='Logged count of Income')
pio.show(hist_revolving_univariate,validate=False,renderer='iframe')

In [174]:
hist_revolving_univariate = px.histogram(data_frame=data,x='DebtRatio',nbins=150,
             log_y=False,orientation='v',opacity=0.5,marginal='box',template='ggplot2',
             color=TARGET)
hist_revolving_univariate.update_layout(title='Logged count of LogDebtRatio')
pio.show(hist_revolving_univariate,validate=False,renderer='iframe')

In [218]:
def log_debt_ratio(data, col="DebtRatio"):
    return np.log1p(data["DebtRatio"])


def log_income(data, col="MonthlyIncome"):
    return np.log1p(data["MonthlyIncome"])


def log_revolvingrate(data, col="RevolvingUtilizationOfUnsecuredLines"):
    return np.log1p(data["RevolvingUtilizationOfUnsecuredLines"])
data["LogDebtRatio"] = data.pipe(log_debt_ratio, col="DebtRatio")

data["LogIncome"] = data.pipe(log_income, col="MonthlyIncome")

data["LogRevolvingUtilizationOfUnsecuredLines"] = data.pipe(
    log_revolvingrate, col="RevolvingUtilizationOfUnsecuredLines"
)

In [219]:
#data = data.drop(['MonthlyIncome','DebtRatio','RevolvingUtilizationOfUnsecuredLines'],axis=1)

In [220]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn import metrics

In [221]:
compute_class_weight(class_weight='balanced',classes=np.unique(data[TARGET]),y=data[TARGET])

array([0.53581379, 7.48055057])

In [222]:
#define cross validation 

fold = StratifiedKFold(5)
def begin_cross_validation(X,Y,model,fold=fold) : 
    scoring_dict = {'auc':[]}
    for train_idx,test_idx in fold.split(X,Y) : 
        train_x,val_x = X.iloc[train_idx],X.iloc[test_idx]
        train_y,val_y = Y.iloc[train_idx],Y.iloc[test_idx]
        #train model 

        model.fit(train_x,train_y)
        y_pred = model.predict_proba(val_x)
        #calculate metrics 
        scoring_dict["auc"].append(metrics.roc_auc_score(val_y, y_pred[:, 1]))


    #print the scoring 
    print(f'''Scoring: AUC :{np.mean(scoring_dict['auc'])}
     ''')
    return np.mean(scoring_dict['auc'])

In [223]:

#split into X and Y 
X = data.drop(TARGET,axis=1)
y = data[TARGET]

X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2)

In [226]:
model_logistic = LogisticRegression(penalty='elasticnet',class_weight='balanced')

In [227]:
begin_cross_validation(X,y,model_logistic)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.