# Titanic Machine Learning Solution using Pipelines

1. Collecting Data
2. Data Exploration
3. Feature Engineering
4. Model building
5. Testing

In [1]:
import pandas as pd

train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataframeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names]

In [7]:
train_X = train.drop('Survived', axis=1)
train_y = train['Survived']
train_X.head()
print(train_y)

0      0
1      1
2      1
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     1
11     1
12     0
13     0
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     1
23     1
24     0
25     1
26     0
27     0
28     1
29     0
      ..
861    0
862    1
863    0
864    0
865    1
866    1
867    0
868    0
869    1
870    0
871    1
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    1
881    0
882    0
883    0
884    0
885    0
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [8]:
Age_test = DataframeSelector(["Age", "Parch"])
Age_test.get_params()

{'attribute_names': ['Age', 'Parch']}

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

imputer = Imputer(strategy = "mean")
num_pipeline = Pipeline([
    ('selector', DataframeSelector(["Age", "Parch","SibSp","Fare"])),
    ('imputer',imputer)
])

In [10]:
num_pipeline.fit_transform(train_X)

array([[ 22.        ,   0.        ,   1.        ,   7.25      ],
       [ 38.        ,   0.        ,   1.        ,  71.2833    ],
       [ 26.        ,   0.        ,   0.        ,   7.925     ],
       ..., 
       [ 29.69911765,   2.        ,   1.        ,  23.45      ],
       [ 26.        ,   0.        ,   0.        ,  30.        ],
       [ 32.        ,   0.        ,   0.        ,   7.75      ]])

In [11]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
encoder = OneHotEncoder(sparse=False)

In [18]:
encoder.fit_transform(train[['Sex', 'Embarked']])

ValueError: could not convert string to float: Q

In [23]:
selector = DataframeSelector(['Embarked']) 
TrainEmbarked = selector.fit_transform(train)
encoder.fit_transform(TrainEmbarked)

ValueError: could not convert string to float: Q

In [24]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)

C:\Users\Elijah Toppo\AppData\Local\Enthought\Canopy\edm\envs\User\python.exe
2.7.13 |Enthought, Inc. (x86_64)| (default, Mar  2 2017, 16:05:12) [MSC v.1500 64 bit (AMD64)]
sys.version_info(major=2, minor=7, micro=13, releaselevel='final', serial=0)


In [25]:
import sklearn

print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.19.1.


In [49]:
XX = train.Pclass
enc = LabelBinarizer()
enc.fit(XX)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [68]:
cat_pipeline = Pipeline([
        ("select_cat", DataframeSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", LabelEncoder()),
    ])

In [65]:
cat_pipeline = Pipeline([
        ("select_cat", DataframeSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder()),
    ])

In [None]:
from sklearn.preprocessing import OneHotEncodereH