# Titanic Machine Learning Solution using
# Pipelines to get incrementally better

1. Collecting Data
2. Data Exploration
3. Feature Engineering
4. Model building
5. Testing

In [13]:
# standard
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

# Scikit Learn tools
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

# Machine Learning
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [3]:
df_train = pd.read_csv('input/train.csv')
df_test = pd.read_csv('input/test.csv')
combine = [df_train, df_test]
df_train.name = 'Train'
df_test.name = 'Test'

In [11]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=14)
target = df_train.Survived

In [14]:
def get_survived(estimator, kfolds, data, target):
    scores = []
    train = data.copy()
    for i, (train_index, test_index) in enumerate(kfolds.split(target)):
        training = train.iloc[train_index,:]
        valid = train.iloc[test_index,:]
        tr_label = target.iloc[train_index]
        val_label = target.iloc[test_index]
        estimator.fit(training, tr_label)
        pred = estimator.predict(valid)
        score = accuracy_score(y_pred=pred, y_true=val_label)
        scores.append(score)
    return round(np.mean(scores),3)

def get_coef(clsf, features):
    imp = clsf.steps[1][1].coef_.tolist()
    results = pd.DataFrame({'Features':features,'Score':imp})
    results = results.sort_values(by='Score', ascending=False)
    return results

## 1. Baseline
As baseline we will predict all women survived.

In [19]:
scores = []

for i, (train_index, test_index) in enumerate(kfolds.split(target)):
    df_train['prediction'] = 0
    print("Fold {} in progress".format(i))
    result = df_train['Survived'].iloc[test_index]
    df_train['prediction'].loc[df_train.Sex == 'female'] = 1
    pred = df_train['prediction'].iloc[test_index]
    score = accuracy_score(y_pred=pred, y_true=result)
    scores.append(score)
    print("Score : {}".format(round(score,3)))
    print("-"*40)
    
print("Baseline: {} +- {}".format(round(np.mean(scores),3), round(np.std(scores),3)))
    
    

Fold 0 in progress
Score : 0.8
----------------------------------------
Fold 1 in progress
Score : 0.775
----------------------------------------
Fold 2 in progress
Score : 0.865
----------------------------------------
Fold 3 in progress
Score : 0.708
----------------------------------------
Fold 4 in progress
Score : 0.798
----------------------------------------
Fold 5 in progress
Score : 0.764
----------------------------------------
Fold 6 in progress
Score : 0.876
----------------------------------------
Fold 7 in progress
Score : 0.719
----------------------------------------
Fold 8 in progress
Score : 0.742
----------------------------------------
Fold 9 in progress
Score : 0.82
----------------------------------------
Baseline: 0.787 +- 0.054


In [91]:
train_X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [58]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy = "mean")
num_pipeline = Pipeline([
    ('selector', DataframeSelector(["Age", "Parch","SibSp","Fare"])),
    ('imputer',imputer)
])

In [59]:
num_pipeline.fit_transform(train_X)

array([[22.        ,  0.        ,  1.        ,  7.25      ],
       [38.        ,  0.        ,  1.        , 71.2833    ],
       [26.        ,  0.        ,  0.        ,  7.925     ],
       ...,
       [29.69911765,  2.        ,  1.        , 23.45      ],
       [26.        ,  0.        ,  0.        , 30.        ],
       [32.        ,  0.        ,  0.        ,  7.75      ]])

In [60]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [61]:
from sklearn.preprocessing import OneHotEncoder

In [62]:
cat_pipeline = Pipeline([
        ("select_cat", DataframeSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

In [63]:
cat_pipeline.fit_transform(train_X)

array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [64]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [66]:
X_train = preprocess_pipeline.fit_transform(train_X)
X_test = preprocess_pipeline.fit_transform(test_X)

In [68]:
from sklearn.svm import SVC

svm_clf = SVC(gamma='scale')
svm_clf.fit(X_train, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [77]:
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svm_clf, X_train, train_y, cv=10)
svm_scores.mean()

0.7095062989445012

In [81]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=100)
rnd_clf.fit(X_train, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [82]:
rnd_scores = cross_val_score(rnd_clf, X_train, train_y, cv=10)
rnd_scores.mean()

0.8115821132675066

In [84]:
pred = rnd_clf.predict(X_test)

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [110]:
submission = pd.DataFrame({
    'PassengerId':uid,
    'Survived':pred
})
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [111]:
submission.to_csv('submission_pipeline.csv',index=False)