In [2]:
import pandas as pd
import seaborn as sns
from ipynb2md import components

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report

components.init_notebook()

In [3]:
X = sns.load_dataset('penguins')
y = X.pop('species')
X.sample(5)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
193,Dream,46.2,17.5,187.0,3650.0,Female
225,Biscoe,46.5,13.5,210.0,4550.0,Female
7,Torgersen,39.2,19.6,195.0,4675.0,Male
186,Dream,49.7,18.6,195.0,3600.0,Male
267,Biscoe,50.5,15.9,225.0,5400.0,Male


# Modeling

In [6]:
preprocess = ColumnTransformer([
    ('num', SimpleImputer(fill_value=-1), ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']),
    ('cat', OneHotEncoder(drop='first'), ['island', 'sex'])
])
pipe = make_pipeline(preprocess, DecisionTreeClassifier())
print(pipe)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(fill_value=-1),
                                                  ['bill_length_mm',
                                                   'bill_depth_mm',
                                                   'flipper_length_mm',
                                                   'body_mass_g']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first'),
                                                  ['island', 'sex'])])),
                ('decisiontreeclassifier', DecisionTreeClassifier())])


In [15]:
tab = components.Tab()

kfold = KFold(n_splits=5, shuffle=True)
for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    test_X, test_y = X.iloc[test_index], y.iloc[test_index]

    pipe.fit(train_X, train_y)
    pred_train_y = pipe.predict(train_X)
    pred_test_y = pipe.predict(test_X)
    
    tab.add(
        f'Split {i}', 
        '<h3>Train</h3>' + pd.DataFrame(
            classification_report(train_y, pred_train_y, output_dict=True)
        ).round(2).to_html() + '<br><h3>Test</h3>' + pd.DataFrame(
            classification_report(test_y, pred_test_y, output_dict=True)
        ).round(2).to_html()
    )

tab

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,118.0,58.0,99.0,1.0,275.0,275.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,0.91,1.0,0.99,0.97,0.99
recall,0.97,1.0,1.0,0.99,0.99,0.99
f1-score,0.99,0.95,1.0,0.99,0.98,0.99
support,34.0,10.0,25.0,0.99,69.0,69.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,131.0,52.0,92.0,1.0,275.0,275.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,21.0,16.0,32.0,1.0,69.0,69.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,120.0,54.0,101.0,1.0,275.0,275.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,0.88,0.92,0.94,0.93,0.95
recall,0.88,1.0,1.0,0.94,0.96,0.94
f1-score,0.93,0.93,0.96,0.94,0.94,0.94
support,32.0,14.0,23.0,0.94,69.0,69.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,123.0,52.0,100.0,1.0,275.0,275.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,0.97,1.0,0.96,0.97,0.98,0.97
recall,1.0,0.94,0.96,0.97,0.97,0.97
f1-score,0.98,0.97,0.96,0.97,0.97,0.97
support,29.0,16.0,24.0,0.97,69.0,69.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,116.0,56.0,104.0,1.0,276.0,276.0

Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,0.97,1.0,0.95,0.97,0.97,0.97
recall,0.97,0.92,1.0,0.97,0.96,0.97
f1-score,0.97,0.96,0.98,0.97,0.97,0.97
support,36.0,12.0,20.0,0.97,68.0,68.0
