In [1]:
import pandas as pd
import numpy as np

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier



In [42]:
df=pd.read_csv('train.csv')

In [43]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [44]:
x_train,x_test,y_train,y_test=train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [45]:
x_train


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [46]:
#imputation
trf1=ColumnTransformer(
    [('impute_age',SimpleImputer(),[2]),
     ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6]),
     ],remainder='passthrough'
)

In [79]:
#  ohe
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [80]:
x_transformed = trf2.fit_transform(x_train)
x_transformed[:5]


array([[ 0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  1.    ,
        45.5   ,  0.    ,  0.    , 28.5   ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  2.    ,
        23.    ,  0.    ,  0.    , 13.    ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  3.    ,
        32.    ,  0.    ,  0.    ,  7.925 ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  3.    ,
        26.    ,  1.    ,  0.    ,  7.8542],
       [ 1.    ,  0.    ,  0.    ,  0.    ,  1.    ,  0.    ,  3.    ,
         6.    ,  4.    ,  2.    , 31.275 ]])

In [69]:
# scaling
trf3=ColumnTransformer(
    [('scale',MinMaxScaler(),slice(0,10)),
     ],remainder='passthrough'
)

In [70]:
#  Feature selection
trf4=SelectKBest(score_func=chi2,k=8)

In [71]:
#training MOdel
trf5=DecisionTreeClassifier()

In [81]:
numeric_cols = [2]          # Age
categorical_cols = [1, 6]   # Sex, Embarked



In [82]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',
         Pipeline([
             ('imputer', SimpleImputer()),
             ('scaler', MinMaxScaler())
         ]),
         numeric_cols),

        ('cat',
         Pipeline([
             ('imputer', SimpleImputer(strategy='most_frequent')),
             ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
         ]),
         categorical_cols)
    ]
)


In [83]:
pipe = Pipeline([
    ('prep', preprocessor),
    ('select', SelectKBest(score_func=chi2, k=8)),
    ('model', DecisionTreeClassifier())
])


#Create Pilelinging


In [77]:
# pipeling object
pipe=Pipeline(
    [
     ('trf1',trf1),
     ('trf2',trf2),
     ('trf3',trf3),
     ('trf4',trf4),
     ('trf5',trf5)
    ])

alternative

In [73]:
pipe2=make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [84]:
pipe.fit(x_train,y_train)



#Exploring pipe


In [86]:
pipe.named_steps

{'prep': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaler', MinMaxScaler())]),
                                  [2]),
                                 ('cat',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('ohe',
                                                   OneHotEncoder(handle_unknown='ignore',
                                                                 sparse_output=False))]),
                                  [1, 6])]),
 'select': SelectKBest(k=8, score_func=<function chi2 at 0x7a2b446cc0e0>),
 'model': DecisionTreeClassifier()}

In [89]:
pipe.named_steps['prep'].transformers_

[('num',
  Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler())]),
  [2]),
 ('cat',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                  ('ohe',
                   OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
  [1, 6]),
 ('remainder', 'drop', [0, 3, 4, 5])]

In [90]:
pipe.named_steps['prep'].transformers_[0][1]

In [97]:
pipe.named_steps['prep'].transformers_[1][1]

In [100]:
y_pred=pipe.predict(x_test)

In [102]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7206703910614525

# Cross Validation


In [107]:
#  Fro cross validation cv-5 mean ve cross validat for 5 times and take its mean and assume it as our final value
#  whole procees form Train_Test_Splite is reapeted that many times
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,x_train,y_train,cv=5,scoring='accuracy').mean()



np.float64(0.7556682753865853)

#GridSearch using Pipeline

In [108]:
#  its like tuning of a parameter we check for different depth of a parameter (tuning of a hyper parameter)
# to imrove performance  (max_depth) this value will make changes inn performance of parameter it may be positive or negative also
# its like tuning nobe

In [111]:
# gridsearchcv
params = {
    'model__max_depth':[1,2,3,4,5,None]
}

In [112]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)



In [113]:
grid.best_score_

np.float64(0.7934994582881908)

In [114]:
grid.best_params_

{'model__max_depth': 3}

#Exporting the pipeline

In [115]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

# on other file

In [116]:
import pickle
import numpy as np

In [117]:
pipe = pickle.load(open('pipe.pkl','rb'))


In [118]:
# Assume user input
test_input2 = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)
pipe.predict(test_input2)



array([0])

we will not have to make change on this side of the code even if we change the logic from back we can manage by imoprting that file