In [1]:
import os
from joblib import load
import pandas as pd
import seaborn as sns
from tpot import TPOTClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import IsolationForest
from sklearn.metrics import RocCurveDisplay
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler
)
from sklearn.metrics import classification_report, roc_auc_score
from warnings import filterwarnings


from src.data import (
    load_data,
    create_auto_eda
)

import src.utils as u

# Notebook-wide display configuration.
sns.set()  # apply seaborn's default plotting theme
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings - consider
# narrowing the filter to specific categories/modules.
filterwarnings('ignore')



In [2]:
# Load the working dataset through the project's `src.data.load_data` helper.
# NOTE(review): judging by the rendered preview in this notebook, each row
# carries applicant/credit attributes plus a 'Good'/'Bad' `label` column -
# confirm the schema against load_data().
df = load_data()


Unnamed: 0,account_status,duration_in_month,credit_history,purpose,credit_amount,savings,current_employment_since,installment_rate,status_and_sex,other_debtors,...,property,age,other_installment_plans,housing,n_existing_credits_in_bank,job,n_people_liable,telephone,foreign_worker,label
0,... < 0 DM,6,critical account/ other credits existing (not ...,radio/television,1169,unknown/ no savings account,.. >= 7 years,4,male : single,none,...,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes,Good
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,none,...,real estate,22,none,own,1,skilled employee / official,1,none,yes,Bad
2,no checking account,12,critical account/ other credits existing (not ...,education,2096,... < 100 DM,4 <= ... < 7 years,2,male : single,none,...,real estate,49,none,own,1,unskilled - resident,2,none,yes,Good
3,... < 0 DM,42,existing credits paid back duly till now,furniture/equipment,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,guarantor,...,if not real estate : building society savings ...,45,none,for free,1,skilled employee / official,2,none,yes,Good
4,... < 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,none,...,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes,Bad


In [6]:
# `status_and_sex` packs two attributes into one string, "<sex> : <status>"
# (e.g. "male : single"); unpack them into two separate categorical columns.
# The guard makes the cell safe to re-run: once the source column has been
# dropped, a second execution skips the split instead of raising a KeyError.
if 'status_and_sex' in df.columns:
    # n=1 splits on the first separator only, so the result never has more
    # than two parts even if a status text itself ever contained " : ".
    sex_and_status = df['status_and_sex'].str.split(' : ', n=1, expand=True)
    df[['sex', 'status']] = sex_and_status
    df = df.drop(columns='status_and_sex')

# Fails loudly (KeyError) if neither the packed nor the unpacked columns exist.
df = df.astype({'sex': 'category', 'status': 'category'})

In [8]:
# Columns left out of the feature matrix built from `df`.
columns_to_drop = [
    'credit_history',
    'property',
    'housing',
    'foreign_worker',
    'other_debtors',
    'other_installment_plans',
]

In [9]:
# Feature matrix: every column except the discarded ones and the target.
X = df.drop(columns=[*columns_to_drop, 'label']).copy()
# Target vector, copied so later edits do not touch `df`.
y = df.loc[:, 'label'].copy()

In [10]:
# Partition the feature columns by dtype: category/object columns are treated
# as categorical, everything else as numerical.
categorical_columns = X.select_dtypes(
    include=['category', object],
).columns
numerical_columns = X.select_dtypes(
    exclude=['category', object],
).columns

In [11]:
# The single ordinal feature and its levels, listed from lowest to highest.
# `ordered_categories` holds one inner list per entry of `ordinal_columns`.
ordinal_columns = ['current_employment_since']
ordered_categories = [
    [
        'unemployed',
        '... < 1 year',
        '1 <= ... < 4 years',
        '4 <= ... < 7 years',
        '.. >= 7 years',
    ],
]

# Categorical features that are not ordinal.
one_hot_columns = [
    name for name in categorical_columns if name not in ordinal_columns
]
continuous_columns = ['duration_in_month', 'credit_amount', 'age']

In [12]:
# Encode the categorical features:
#   * ordinal columns   -> ordered codes following `ordered_categories`
#   * one-hot columns   -> dense indicator columns
#   * remaining columns -> passed through unchanged
ordinal_transformer = OrdinalEncoder(categories=ordered_categories)

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the current name
# first and fall back to the legacy one, so the notebook runs on either side
# of the rename (an unknown keyword raises TypeError at construction).
try:
    one_hot_transformer = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_transformer = OneHotEncoder(sparse=False)

transformer = ColumnTransformer(
    [('ordinal', ordinal_transformer, ordinal_columns),
     ('onehot', one_hot_transformer, one_hot_columns)],
    remainder='passthrough',
)

# Encoded feature array; column names are available afterwards from
# transformer.get_feature_names_out().
temp = transformer.fit_transform(X)

In [13]:
# Rebuild a labelled DataFrame from the encoded array. Reusing X's index keeps
# the features aligned label-for-label with `y` (which still carries the
# original index); without it the new frame would silently get a fresh
# 0..n-1 RangeIndex.
X = pd.DataFrame(
    temp,
    columns=transformer.get_feature_names_out(),
    index=X.index,
)

# Binary target with 'Bad' as the positive class (1). The explicit cast pins
# an integer dtype instead of relying on `replace` to downcast the object
# column, an implicit behaviour that pandas 2.2 deprecates.
y = y.replace({'Good': 0, 'Bad': 1}).astype('int64')

In [14]:
# Baseline isolation forest: contamination left at the library default,
# seeded so the fit is reproducible. Fitted on the raw array of X.
outlier_detector = IsolationForest(random_state=42)
outlier_detector.fit(X.to_numpy())

In [16]:
# Second isolation forest with an explicit contamination of 0.01 (same seed).
# This is the detector used to flag rows for removal.
outlier_detector1 = IsolationForest(random_state=42, contamination=0.01)
outlier_detector1.fit(X.to_numpy())

In [19]:
# IsolationForest.predict labels outliers as -1; build a boolean mask of the
# flagged rows and keep only the remaining rows in both X and y.
predicted_flags = outlier_detector1.predict(X.values)
outliers = predicted_flags == -1
keep_rows = ~outliers
X = X[keep_rows]
y = y[keep_rows]

In [20]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts
