# Main imports

In [19]:
import os
import pandas as pd 
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
from tqdm import tqdm
from collections import defaultdict


# Raise pandas' display limits so that up to 200 columns and 5000 rows are
# rendered before the output is truncated.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 5000)

In [20]:
# Install mljar-supervised into the kernel's environment via the %pip magic.
# NOTE(review): no version is pinned here, so a fresh run may pick up a
# different release than the one that produced the saved outputs.
%pip install mljar-supervised

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Load DataFrame

In [21]:
# Relative path of the CSV file holding the credit data.
path_to_df = "credit.csv"

# Parse the CSV into the DataFrame that the following cells work on.
credit_df = pd.read_csv(filepath_or_buffer=path_to_df)

# Display the number of rows per `credit_history` category.
credit_df["credit_history"].value_counts()

credit_history
good         530
critical     293
poor          88
very good     49
perfect       40
Name: count, dtype: int64

# Preprocessing step

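## Credit History

Replace each `credit_history` category with a numeric code on a power-of-ten scale, from `perfect` (1) up to `critical` (10000).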

In [22]:
# Numeric code assigned to every credit-history label; each step from
# 'perfect' to 'critical' multiplies the code by ten.
credit_dict = {
    "critical": 10_000,
    "poor": 1_000,
    "good": 100,
    "very good": 10,
    "perfect": 1,
}


def _encode_credit_history(label):
    """Return the numeric code for `label`, or `label` unchanged if it has none.

    Falling back to the input keeps values that are not keys of
    `credit_dict` (including already-encoded numbers) untouched, so
    re-running this cell does not alter the column again.
    """
    return credit_dict.get(label, label)


# Overwrite the column with its element-wise encoded version.
credit_df["credit_history"] = credit_df["credit_history"].map(_encode_credit_history)

In [23]:
# Re-count the column after encoding to confirm every label was replaced by its code.
credit_df["credit_history"].value_counts()


credit_history
100      530
10000    293
1000      88
10        49
1         40
Name: count, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML

# Name of the label column and the seed used for the hold-out split.
TARGET_COLUMN = "default"
SPLIT_SEED = 42

# Select the features by name instead of by position (`columns[:-1]`): the
# positional form silently feeds the target into the model whenever
# `default` is not the last column of the frame.
features = credit_df.drop(columns=TARGET_COLUMN)
target = credit_df[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    # Fixed seed: the same rows land in train/test on every Restart & Run All,
    # so the reported metrics are comparable between runs.
    random_state=SPLIT_SEED,
    # Keep the class proportions of the target identical in both partitions.
    stratify=target,
)

# Fit mljar's AutoML with its default settings on the training partition.
automl = AutoML()
automl.fit(X_train, y_train)

# Predicted labels for the held-out rows.
predictions = automl.predict(X_test)

AutoML directory: AutoML_4
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models


AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
1_Baseline logloss 0.594754 trained in 0.67 seconds
2_DecisionTree logloss 0.751196 trained in 7.21 seconds
3_Linear logloss 0.542454 trained in 4.65 seconds
* Step default_algorithms will try to check up to 3 models
4_Default_Xgboost logloss 0.518423 trained in 2.19 seconds
5_Default_NeuralNetwork logloss 0.545083 trained in 2.24 seconds
6_Default_RandomForest logloss 0.551099 trained in 3.01 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.510099 trained in 2.09 seconds
AutoML fit time: 30.86 seconds
AutoML best model: Ensemble


In [25]:
# Class balance of the target column (number of rows per label).
credit_df.loc[:, "default"].value_counts()

default
no     700
yes    300
Name: count, dtype: int64