# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from joblib import dump

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from  sklearn.pipeline  import  Pipeline
from  sklearn  import  set_config
# Show estimators as HTML diagrams when they are displayed as a cell output.
set_config(display='diagram')

In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Custom functions

In [4]:
def dataset_parameters(df,target):
    """Group the feature columns of ``df`` by the kind of data they hold.

    Columns are classified with the ``pandas.api.types`` predicates instead of
    comparing dtype names, so every datetime resolution/timezone, every
    numeric width (int32, float32, nullable integers, ...) and pandas string
    dtypes are recognised.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set holding both the feature columns and the target column.
    target : str
        Name of the target column. It is excluded from all feature lists.

    Returns
    -------
    tuple
        ``(categorical_features, numerical_features, date_features,
        all_features, target)``. Each feature list keeps the column order of
        ``df``; ``all_features`` is numerical + categorical + date. Columns of
        any other dtype (e.g. timedelta) appear in none of the lists.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    dtype_checks = pd.api.types
    df_features = df.drop(columns=[target])

    categorical_features = []
    numerical_features = []
    date_features = []
    for column, dtype in df_features.dtypes.items():
        if dtype_checks.is_datetime64_any_dtype(dtype):
            date_features.append(column)
        elif (
            # bool must be tested before the numeric check below, because
            # pandas considers bool a numeric dtype.
            dtype_checks.is_bool_dtype(dtype)
            or dtype_checks.is_object_dtype(dtype)
            or dtype_checks.is_string_dtype(dtype)
            or isinstance(dtype, dtype_checks.CategoricalDtype)
        ):
            categorical_features.append(column)
        elif dtype_checks.is_numeric_dtype(dtype):
            numerical_features.append(column)

    all_features = numerical_features + categorical_features + date_features
    return categorical_features, numerical_features, date_features, all_features, target

# Load and prepare the data

In [5]:
# Read the pickled penguins data set from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('penguins_df.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [6]:
# Display the loaded data as the cell output to inspect its columns and values.
df

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
0,Torgersen,39.1,18.7,181.0,3750.0,MALE,Adelie
1,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,Adelie
2,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,Adelie
3,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,Adelie
4,Torgersen,39.3,20.6,190.0,3650.0,MALE,Adelie
...,...,...,...,...,...,...,...
328,Biscoe,47.2,13.7,214.0,4925.0,FEMALE,Gentoo
329,Biscoe,46.8,14.3,215.0,4850.0,FEMALE,Gentoo
330,Biscoe,50.4,15.7,222.0,5750.0,MALE,Gentoo
331,Biscoe,45.2,14.8,212.0,5200.0,FEMALE,Gentoo


In [7]:
# Group the feature columns by dtype, using 'species' as the prediction target.
categorical_features, numerical_features, date_features, all_features, target = dataset_parameters(df,'species')

In [8]:
# Feature matrix: every column listed in all_features (the target is excluded).
X = df[all_features]

In [9]:
# Target vector: the column named by `target`.
y = df[target]

In [10]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13) 

# Set up the pipeline, then train and test it with the best hyperparameters

In [11]:
# One-hot encode the categorical columns. handle_unknown='ignore' makes a
# category that was absent from the training split encode as all zeros at
# prediction time instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('one' , OneHotEncoder(handle_unknown='ignore'))
])

In [12]:
# Scale the numerical columns with RobustScaler (single-step pipeline).
numerical_transformer = Pipeline(
    steps=[('scale', RobustScaler())]
)

In [13]:
# Route each group of columns to its own transformer. Columns that are not
# listed in either group are dropped, since ColumnTransformer's `remainder`
# is left at its default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [14]:
# Full model: preprocessing followed by AdaBoost over logistic-regression
# learners. The weak learner is passed positionally because its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (the old
# name was removed in 1.4); the positional form works on both.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', AdaBoostClassifier(LogisticRegression()))
])

In [15]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train,  y_train)

In [16]:
# Mean accuracy of the fitted pipeline on the held-out test split.
pipeline.score(X_test,y_test)

1.0

# Save pipeline to disk

In [17]:
# Persist the fitted pipeline (preprocessing + model) to 'pipeline.joblib' in the working directory.
dump(pipeline, 'pipeline.joblib')

['pipeline.joblib']