In [5]:
import pandas as pd
import numpy as np

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [7]:
df = pd.read_csv('train.csv')

In [8]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## **PLAN**

Impute missing values in Age and Embarked columns using column transformer -> send the O/P to one hot encoder to encode sex and embarked -> send the O/P to scale -> Feature Selection (will take best 5 features only) -> Create a Decision Tree

In [9]:
df.drop(columns = ['PassengerId','Name', 'Ticket','Cabin'], inplace = True)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['Survived']),df['Survived'], test_size=0.2, random_state = 42)

In [11]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [12]:
y_train.sample(5)

345    1
62     0
475    0
370    1
483    1
Name: Survived, dtype: int64

### **Part 1 - Imputation Transformer - to imput missing values in age and embarked**

In [15]:
trf1 = ColumnTransformer([('impute_age', SimpleImputer(),[2]),      # here [2] is the index value of column Age in the dataframe because if you write name of Column it wont be recognized because after coming out of imputer function it will be returned as a numpy array and not dataframe and in numpy array there is no concept of column name
                          ('impute_embarked', SimpleImputer(strategy= 'most_frequent'),[6])   # Similarly [6] is the index of column embarked in our original dataframe 

], remainder = 'passthrough')

### **Part 2 - One Hot Encoding - to encode Sex and Embarked columns**

In [18]:
trf2 = ColumnTransformer([
    ('oho_sex_embarked', OneHotEncoder(sparse = False, handle_unknown = 'ignore'), [1,6])   # here [1,6] means 1 and 6th column and not 'from 1 to 6' column
], remainder = 'passthrough')

### **Part 3 - Scaling (Used Minmax scaler) on all the columns**

In [19]:
trf3 = ColumnTransformer([('scale',MinMaxScaler(),slice(0,10))])  # slice(0,10) means applying scaling on all the columns (including the ones formed after encoding)

### **Part 4 - Feature Selection**

In [22]:
trf4 = SelectKBest(score_func = chi2, k = 8)  #k = 8 means we are taking 8 most important columns

### **Creating Decision Tree Classifier**

In [25]:
trf5 = DecisionTreeClassifier()

### **Creating a pipeline using all the above parts**

In [26]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2', trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

Some Tutorials use make_pipeline instead of pipeline() - in pipeline() you need to mention name of all the pipelines like 'trf1', 'trf2' etc as written in above cell
Same applies for ColumnTransformer() and make_ColumnTransformer() 

### **Training through pipeline (that performing all the steps from part 1 to part 5**

In [27]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('trf1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('impute_age', SimpleImputer(),
                                                  [2]),
                                                 ('impute_embarked',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [6])])),
                ('trf2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('oho_sex_embarked',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  [1, 6])])),
                ('trf3',
                 ColumnTransformer(transformers=[('scale', MinMaxScaler(),
                                                  slice(0, 10, None))])),


### **Exploring pipeline**

In [29]:
"""Write this peice of code and then write the same code as written in the cell just above this cell and you will see the difference in visualization the same code shows.
All thanks to the code written in this block which is forcing the same code(Written in above cell) to show a better visualization"""

from sklearn import set_config
set_config(display = 'diagram')

In [30]:
# Same code written in block above shows better visuslization of the entire pipeline
pipe.fit(X_train, y_train)

In [32]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('oho_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x7f50b6c5c320>),
 'trf5': DecisionTreeClassifier()}

In [33]:
pipe.named_steps['trf1']

In [35]:
pipe.named_steps['trf1'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

### **Predictio on test data**

In [36]:
y_pred = pipe.predict(X_test)

In [37]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

### **Cross Validation using pipeline**

In [38]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv = 5 , scoring = 'accuracy').mean()

0.6391214419383433

### **Grid Search Using pipeline** - Hyperparameter tuning

In [40]:
params = {
    'trf5__max_depth':[1,2,3,4,5,None]  # here make sure you mention trf5__max_depth  i.e. name of the pipeline in which Model was created (here trf5) followed by two underscores
}

In [41]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv=5,scoring = 'accuracy')
grid.fit(X_train,y_train)

In [42]:
grid.best_score_

0.6391214419383433

In [43]:
grid.best_params_

{'trf5__max_depth': 2}

### **Exporting the pipeline**

In [44]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))