In [199]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
# from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

#### 

#### create dataframe by reading the data set

In [272]:
df = pd.read_csv("titanic.csv", sep="\t", index_col='PassengerId')
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Train test split

In [128]:
features= [c for c in df.columns.values if c in ['Sex','Age','Cabin', 'SibSp']]

In [129]:
df.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [130]:
df.select_dtypes(include=["object"]).columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [131]:
df[df.select_dtypes(include=["object"]).columns] = df[df.select_dtypes(include=["object"]).columns].astype('category')

In [132]:
df.select_dtypes(include=["object"]).columns

Index([], dtype='object')

In [133]:
df[features].dtypes

Sex      category
Age       float64
SibSp       int64
Cabin    category
dtype: object

In [134]:
target = "Survived"
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=2)

##### something worth learning, how to convert all 'object' data type columns to 'category'

In [None]:
# df = pd.concat([
#         df.select_dtypes([], ['object']),
#         df.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
#         ], axis=1).reindex_axis(df.columns, axis=1)

### Preprocessing

#### starts the index from 0

In [66]:
class TransformIndex(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X=None, y=None):
        return self
    def transform(self, X, y=None):
        X.index = np.arange(X.shape[0])
        return X

#### we noticed few NaNs in "Cabin" column

In [16]:
df["Cabin"].isna().sum()

125

#### Actually we have lots of (=125) NaNs
#### We would like to replace NaN by looking at the "Sex" column, It is replaced by the cabin which was frequent for a particular sex in that row

In [67]:
class ReplaceNansCabin(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X=None, y=None):
        return self
    def transform(self, X, y=None):
        most_common = X.groupby('Sex')['Cabin'].apply(lambda x: x.value_counts().index[0]).reset_index()
        X.loc[(X['Cabin'].isnull()) & (X.Sex == 'female'), 'Cabin'] = most_common[most_common['Sex']=='female']['Cabin'][0]
        X.loc[(X['Cabin'].isnull()) & (X.Sex == 'male'), 'Cabin'] = most_common[most_common['Sex']=='male']['Cabin'][1]
        return X

#### We also replace NaNs in "Age" column by replacing it with the mean age of that particular sex

In [68]:
class ReplaceNansAge(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X=None, y=None):
        return self
    def transform(self, X, y=None):
        X['Age'] = X["Age"].fillna(X.groupby("Sex")["Age"].transform('mean'))
        return X

In [51]:
df["SibSp"].isna().sum()

0

#### No NaNs in SibSp yay!

#### creates pipeline
<a id='pipeline'></a>

In [200]:
pipeline = Pipeline([
                ('transformsIndex', TransformIndex()),
                ('replaceNaNsInCabin', ReplaceNansCabin()),
                ('replaceNaNsInAge', ReplaceNansAge()),
                ('feat', feature_selection.SelectKBest(k=2)),
                ('classifier', RandomForestClassifier(random_state = 42))
            ])

In [201]:
pipeline.named_steps.keys()

dict_keys(['transformsIndex', 'replaceNaNsInCabin', 'replaceNaNsInAge', 'feat', 'classifier'])

In [202]:
pipeline.fit(X_train, y_train)


ValueError: could not convert string to float: 'male'

#### Apparently scikit learn works with numeric features and we need to convert strings to numerical values first
[link for details](https://datascience.stackexchange.com/questions/5226/strings-as-features-in-decision-tree-random-forest)

In [157]:
class DummyEncoder(TransformerMixin):
    def fit(self, X, y=None):
        self.columns_ = X.columns
        self.cat_cols_ = X.select_dtypes(include=["category"]).columns
        self.non_cat_cols_ = X.columns.drop(self.cat_cols_)
        self.cat_map_ = {col: X[col].cat for col in self.cat_cols_}
        
        left = len(self.non_cat_cols_)
        self.cat_blocks_ = {}
        for col in self.cat_cols_:
            right = left + len(X[col].cat.categories)
            self.cat_blocks_[col] = slice(left, right)
            left = right
        return self
    def transform(self, X, y=None):
        # transforms dataframe to numpy array
        return np.asarray(pd.get_dummies(X))
    def inverse_transform(self, trn, y=None):
        # transforms numpy array to dataframe
        numeric = pd.DataFrame(trn[:,:len(self.non_cat_cols_)], columns=self.non_cat_cols_)
        series = []
        for col, slice_ in self.cat_blocks_.items():
            codes = trn[:,slice_].argmax(1)
            cat = pd.Categorical.from_codes(codes,
                                           self.cat_map_[col].categories,
                                            ordered=self.cat_map_[col].ordered
                                           )
            series.append(pd.Series(cat, name=col))
        return pd.concat([numeric] + series, axis=1)[self.columns_]

[source video](https://www.youtube.com/watch?v=KLPtEBokqQ0) [notebook](https://github.com/TomAugspurger/mtg/blob/master/notes.ipynb) of DummyEncoder

#### few useful things to learn

category (numeric/categorical)code 

In [263]:
df["Sex"].astype('category').cat.codes

PassengerId
1      1
2      0
3      0
4      0
5      1
6      1
7      1
8      1
9      0
10     0
11     0
12     0
13     1
14     1
15     0
16     0
17     1
18     1
19     0
20     0
21     1
22     1
23     0
24     1
25     0
26     0
27     1
28     1
29     0
30     1
      ..
127    1
128    1
129    0
130    1
131    1
132    1
133    0
134    0
135    1
136    1
137    0
138    1
139    1
140    1
141    0
142    0
143    0
144    1
145    1
146    1
147    1
148    0
149    1
150    1
151    1
152    0
153    1
154    1
155    1
156    1
Length: 156, dtype: int8

labels assigned by astype('category').cat.codes

In [262]:
dict(enumerate(df["Sex"].astype('category').cat.categories))

{0: 'female', 1: 'male'}

if category code is known

In [264]:
df.Sex.astype('category').cat.categories[0]

'female'

if categories are ordered

In [265]:
df.Sex.astype('category').cat.ordered

False

#### adds dummy encoder to [above pipeline](#pipeline)

In [231]:
pipeline = Pipeline([
                ('transformsIndex', TransformIndex()),
                ('replaceNaNsInCabin', ReplaceNansCabin()),
                ('replaceNaNsInAge', ReplaceNansAge()),
                ('encodeStringCols', DummyEncoder()),
                ('feat', feature_selection.SelectKBest(k=2)),
                ('classifier', RandomForestClassifier(random_state = 42))
            ])

In [232]:
pipeline.named_steps.keys()

dict_keys(['transformsIndex', 'replaceNaNsInCabin', 'replaceNaNsInAge', 'encodeStringCols', 'feat', 'classifier'])

In [233]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'transformsIndex', 'replaceNaNsInCabin', 'replaceNaNsInAge', 'encodeStringCols', 'feat', 'classifier', 'feat__k', 'feat__score_func', 'classifier__bootstrap', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__min_impurity_decrease', 'classifier__min_impurity_split', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start'])

In [234]:
pipeline.get_params('classifier__class_weight')

{'memory': None,
 'steps': [('transformsIndex', <__main__.TransformIndex at 0x7f69b3a90110>),
  ('replaceNaNsInCabin', <__main__.ReplaceNansCabin at 0x7f69b3a90d90>),
  ('replaceNaNsInAge', <__main__.ReplaceNansAge at 0x7f69b3a900d0>),
  ('encodeStringCols', <__main__.DummyEncoder at 0x7f69b63a3190>),
  ('feat',
   SelectKBest(k=2, score_func=<function f_classif at 0x7f69b2e60710>)),
  ('classifier',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                          max_depth=None, max_features='auto', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators='warn',
                          n_jobs=None, oob_score=False, random_state=42, verbose=0,
                          warm_start=False))],
 'verbose': False,
 'transformsIndex': <__main__.TransformIndex at 0x7f69b3a9011

In [235]:
pipeline.fit(X_train, y_train)

  f = msb / msw


Pipeline(memory=None,
         steps=[('transformsIndex',
                 <__main__.TransformIndex object at 0x7f69b3a90110>),
                ('replaceNaNsInCabin',
                 <__main__.ReplaceNansCabin object at 0x7f69b3a90d90>),
                ('replaceNaNsInAge',
                 <__main__.ReplaceNansAge object at 0x7f69b3a900d0>),
                ('encodeStringCols',
                 <__main__.DummyEncoder object at 0x7f69b63a3190>),
                ('feat',
                 SelectKBest(k=2,
                             score_func=<function f...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                

In [236]:
y_preds = pipeline.predict(X_test)
np.mean(y_preds == y_test)

0.7884615384615384

#### best features

In [237]:
best_features = pipeline.named_steps['feat']
pd.get_dummies(df[features]).columns[best_features.get_support()]

Index(['Sex_female', 'Sex_male'], dtype='object')

In [238]:
param_grid = {'classifier__max_depth': [50, 70],
                    'classifier__min_samples_leaf': [1,2]
                  }
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid)
grid_search.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  f = msb / msw
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  f = msb / msw
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pand

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('transformsIndex',
                                        <__main__.TransformIndex object at 0x7f69b3a90110>),
                                       ('replaceNaNsInCabin',
                                        <__main__.ReplaceNansCabin object at 0x7f69b3a90d90>),
                                       ('replaceNaNsInAge',
                                        <__main__.ReplaceNansAge object at 0x7f69b3a900d0>),
                                       ('encodeStringCols',
                                        <__main__.DummyEncoder objec...
                                                               min_samples_leaf=1,
                                                               min_samples_split=2,
                                                               min_weight_fraction_leaf=0.0,
                                            

#### best values for hyperparameters

In [239]:
grid_search.best_params_

{'classifier__max_depth': 50, 'classifier__min_samples_leaf': 1}

#### refitting using best parameters

In [240]:
grid_search.refit

True

In [241]:
final_pipeline = grid_search.best_estimator_
final_classifier = final_pipeline.named_steps['classifier']
final_classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [242]:
y_pred = grid_search.predict(X_test)

In [243]:
y_pred_prob = grid_search.predict_proba(X_test)

In [244]:
np.mean(y_pred == y_test)

0.7884615384615384

#### cross validation results

In [275]:
grid_search.cv_results_

{'mean_fit_time': array([0.98143339, 0.96861108, 0.99487996, 1.03760338]),
 'std_fit_time': array([0.00188417, 0.00079482, 0.02869424, 0.01552525]),
 'mean_score_time': array([0.96996546, 0.97038627, 1.00769273, 1.02550666]),
 'std_score_time': array([0.00260902, 0.00035787, 0.02073367, 0.01233885]),
 'param_classifier__max_depth': masked_array(data=[50, 50, 70, 70],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__min_samples_leaf': masked_array(data=[1, 2, 1, 2],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__max_depth': 50, 'classifier__min_samples_leaf': 1},
  {'classifier__max_depth': 50, 'classifier__min_samples_leaf': 2},
  {'classifier__max_depth': 70, 'classifier__min_samples_leaf': 1},
  {'classifier__max_depth': 70, 'classifier__min_samples_leaf': 2}],
 'split0_test_score': array([0.62857143, 0.62857143, 0.62857143, 0.6285714

#### output probabilities for each class

In [245]:
df_pred = pd.DataFrame(data=y_pred_prob, columns = grid_search.best_estimator_.named_steps['classifier'].classes_)
# df_pred = pd.concat([, preds], axis=1)
df_pred.set_index(y_test.index, inplace = True)
df_pred.head()

Unnamed: 0_level_0,0,1
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
13,0.857378,0.142622
4,0.242255,0.757745
99,0.242255,0.757745
7,0.857378,0.142622
143,0.242255,0.757745


#### Useful Links:
 - [deep dive into sklearn pipelines](https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines)
 - [Kevin Goetsch | Deploying Machine Learning using sklearn pipelines](https://www.youtube.com/watch?v=URdnFlZnlaE&t=8s)

In [246]:
final_pipeline

Pipeline(memory=None,
         steps=[('transformsIndex',
                 <__main__.TransformIndex object at 0x7f69b3af5490>),
                ('replaceNaNsInCabin',
                 <__main__.ReplaceNansCabin object at 0x7f69b3af58d0>),
                ('replaceNaNsInAge',
                 <__main__.ReplaceNansAge object at 0x7f69b3af59d0>),
                ('encodeStringCols',
                 <__main__.DummyEncoder object at 0x7f69b2deb510>),
                ('feat',
                 SelectKBest(k=2,
                             score_func=<function f...
                ('classifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=50,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
   

In [248]:
best_features = final_pipeline.named_steps['feat']
pd.get_dummies(df[features]).columns[best_features.get_support()]

Index(['Sex_female', 'Sex_male'], dtype='object')

another way of accomplishing above

In [253]:
select_indices = final_pipeline.named_steps['feat'].transform(
    np.arange(len(pd.get_dummies(df[features]).columns)).reshape(1, -1)
)
feature_names = pd.get_dummies(df[features]).columns[select_indices]

In [254]:
feature_names

Index([['Sex_female', 'Sex_male']], dtype='object')