In [30]:
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report

In [31]:
X = sns.load_dataset('penguins')
y = X.pop('species')
X.sample(5)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
333,Biscoe,51.5,16.3,230.0,5500.0,Male
9,Torgersen,42.0,20.2,190.0,4250.0,
168,Dream,50.3,20.0,197.0,3300.0,Male
86,Dream,36.3,19.5,190.0,3800.0,Male
308,Biscoe,47.5,14.0,212.0,4875.0,Female


# Modeling

In [32]:
preprocess = ColumnTransformer([
    ('num', SimpleImputer(fill_value=-1), ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']),
    ('cat', OneHotEncoder(drop='first'), ['island', 'sex'])
])
pipe = make_pipeline(preprocess, DecisionTreeClassifier())
pipe

In [42]:
for i, (train_index, test_index) in enumerate(KFold(shuffle=True).split(X, y)):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    test_X, test_y = X.iloc[test_index], y.iloc[test_index]

    pipe.fit(train_X, train_y)
    pred_train_y = pipe.predict(train_X)
    pred_test_y = pipe.predict(test_X)

    print(f'Fit and test on {i} split')
    print()
    print('Train')
    display(pd.DataFrame(
        classification_report(train_y, pred_train_y, output_dict=True)
    ).round(2))
    
    print('Test')
    display(pd.DataFrame(
        classification_report(test_y, pred_test_y, output_dict=True)
    ).round(2))

Fit and test on 0 split

Train


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,118.0,54.0,103.0,1.0,275.0,275.0


Test


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,0.87,1.0,0.95,0.91,0.94,0.92
recall,0.97,0.64,1.0,0.91,0.87,0.91
f1-score,0.92,0.78,0.98,0.91,0.89,0.91
support,34.0,14.0,21.0,0.91,69.0,69.0


Fit and test on 1 split

Train


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,118.0,59.0,98.0,1.0,275.0,275.0


Test


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,0.94,0.82,1.0,0.94,0.92,0.95
recall,0.94,1.0,0.92,0.94,0.95,0.94
f1-score,0.94,0.9,0.96,0.94,0.93,0.94
support,34.0,9.0,26.0,0.94,69.0,69.0


Fit and test on 2 split

Train


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,125.0,53.0,97.0,1.0,275.0,275.0


Test


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,0.88,0.96,0.96,0.95,0.96
recall,0.89,1.0,1.0,0.96,0.96,0.96
f1-score,0.94,0.94,0.98,0.96,0.95,0.96
support,27.0,15.0,27.0,0.96,69.0,69.0


Fit and test on 3 split

Train


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,125.0,52.0,98.0,1.0,275.0,275.0


Test


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,27.0,16.0,26.0,1.0,69.0,69.0


Fit and test on 4 split

Train


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,122.0,54.0,100.0,1.0,276.0,276.0


Test


Unnamed: 0,Adelie,Chinstrap,Gentoo,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,30.0,14.0,24.0,1.0,68.0,68.0
