In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.compose import *
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import *

In [2]:
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
df.shape

(891, 15)

In [4]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [5]:
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [6]:
x,y=df.drop(["survived",'pclass','deck','embark_town','alive','adult_male'],axis=1),df['survived']
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)
x.select_dtypes(include="number").columns

Index(['age', 'sibsp', 'parch', 'fare'], dtype='object')

In [7]:
x.columns

Index(['sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who',
       'alone'],
      dtype='object')

In [8]:
##col selectors
numerical_col=X_train.select_dtypes(include='number').columns
nominal_col=['sex','embarked','alone']
ordinal_col=['class','who']
ord_class_order=['Third',"Second","First"]
ord_who_order=['man','woman','child']
orinal_order=[ord_class_order,ord_who_order]
numerical_col

Index(['age', 'sibsp', 'parch', 'fare'], dtype='object')

In [9]:
trf1=ColumnTransformer(transformers=[
    ('missing_numerical',SimpleImputer(strategy='mean'),numerical_col),
    ('missing_categorical',SimpleImputer(strategy='most_frequent'),make_column_selector(dtype_include='object'))
],remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

trf2=ColumnTransformer(transformers=[
    ('Ohe',OneHotEncoder(handle_unknown='ignore',sparse_output=False),nominal_col),
    ('Ode',OrdinalEncoder(categories=orinal_order),ordinal_col)
],remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

trf3=ColumnTransformer(transformers=[
    ('scaler',MinMaxScaler(),['age','fare'])
])

pipe=Pipeline(steps=[
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('classifier',DecisionTreeClassifier())
])


In [10]:
pipe.fit(X_train,y_train)

0,1,2
,steps,"[('trf1', ...), ('trf2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('missing_numerical', ...), ('missing_categorical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('Ohe', ...), ('Ode', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Third', 'Second', ...], ['man', 'woman', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,transformers,"[('scaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [11]:
print(classification_report(y_test,pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.69      0.80      0.74       157
           1       0.64      0.49      0.55       111

    accuracy                           0.67       268
   macro avg       0.66      0.64      0.65       268
weighted avg       0.67      0.67      0.66       268

