In [88]:
import pandas as pd

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [70]:
raw_df = pd.read_csv('../datasets/cs-training.csv', index_col=0)

In [73]:
df_labels = raw_df['SeriousDlqin2yrs']

# Preprocessing 

In [79]:
STRAT_IMPUTER_NUM = "median"
STRAT_IMPUTER_CAT = "most_frequent"

num_pipeline = Pipeline([
  #  ('indicator', MissingIndicator()),
    ('imputer', SimpleImputer(strategy=STRAT_IMPUTER_NUM)),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
   # ('indicator', MissingIndicator()),
    ('imputer', SimpleImputer(strategy=STRAT_IMPUTER_CAT))
])

In [80]:
cat_features = ['age', 'NumberOfTime30-59DaysPastDueNotWorse',
                                     'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
                                     'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
                                     'NumberOfDependents']

num_features = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'MonthlyIncome']


full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
]) 

In [81]:
df_prepared = pd.DataFrame(full_pipeline.fit_transform(raw_df.copy()),
                           columns=num_features+cat_features)

In [82]:
df_prepared

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,DebtRatio,MonthlyIncome,age,NumberOfTime30-59DaysPastDueNotWorse,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,-0.021150,-0.172833,0.209579,45.0,2.0,13.0,0.0,6.0,0.0,2.0
1,-0.020385,-0.173168,-0.296226,40.0,0.0,4.0,0.0,0.0,0.0,1.0
2,-0.021582,-0.173186,-0.261937,38.0,1.0,2.0,1.0,0.0,0.0,0.0
3,-0.023281,-0.173210,-0.241922,30.0,0.0,5.0,0.0,0.0,0.0,0.0
4,-0.020585,-0.173215,4.435064,49.0,1.0,7.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
149995,-0.024055,-0.173117,-0.335014,74.0,0.0,4.0,0.0,1.0,0.0,0.0
149996,-0.023017,-0.172876,-0.064735,44.0,0.0,4.0,0.0,1.0,0.0,2.0
149997,-0.023232,1.725868,-0.079009,58.0,0.0,18.0,0.0,1.0,0.0,0.0
149998,-0.024218,-0.173228,-0.054495,30.0,0.0,4.0,0.0,0.0,0.0,0.0


# Model

## Logistic Regresion

In [91]:
log_reg = LogisticRegression(max_iter=1000)

In [92]:
scores = cross_val_score(log_reg, df_prepared, df_labels,
                        scoring='roc_auc', cv = 10)

In [96]:
def display_scores(scores):
    print(f"Scores: {scores}")
    print(f"Mean: {scores.mean()}")
    print(f"Std deviation: {scores.std()}")

In [97]:
display_scores(scores)

Scores: [0.68532553 0.69819088 0.69772525 0.69308236 0.69398869 0.70756727
 0.70018785 0.70360889 0.70642167 0.697069  ]
Mean: 0.6983167394288394
Std deviation: 0.0063213511009642415
