This is the code for the kernel I submitted to Kaggle.

### 1. Import all of the libraries we will use

In [1]:
import math
import numpy as np
import pandas as pd
from scipy.stats import randint

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler

### 2. Load the training and testing data

In [2]:
# Read both Kaggle CSVs; the first column of each file becomes the index.
train = pd.read_csv("data/train.csv", index_col=0)
test_X = pd.read_csv("data/test.csv", index_col=0)

# Split the labelled data into the target column and the feature columns.
# Both are explicit copies, so they are independent of `train`.
train_y = train["Survived"].copy()
train_X = train.drop("Survived", axis="columns").copy()

### 3. Make dataset transformers
Since we are trying to work on our scikit-learn skills, we make a number of transformers to do our feature generation. These could be reused in other projects (though, to be honest, some of them are too specific to be useful outside this project). I would (and will) generalize them in the future if their ideas are useful in other projects.

In [3]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """
    Picks the configured column(s) out of a DataFrame.

    ``attribute_names`` is passed unchanged to ``DataFrame.loc`` as the
    column indexer: a list of labels selects a DataFrame, while a single
    label selects that column as a Series.
    """
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing is learned from the data; present for the pipeline API.
        return self

    def transform(self, X):
        wanted = self.attribute_names
        return X.loc[:, wanted]
    
    
class CategoryFactorizer(BaseEstimator, TransformerMixin):
    """
    Takes a single column (a pandas Series, usually of strings)
    and converts it into integer category codes.

    The label -> code mapping is learned in ``fit`` (labels are numbered
    in order of first appearance) and reused by ``transform``, so a given
    label always receives the same code regardless of which dataset is
    being transformed or how its rows are ordered. Missing values, and
    labels that were not seen during ``fit``, are encoded as -1.
    """
    def fit(self, X, y=None):
        """Remember the distinct labels of X in order of first appearance."""
        # factorize() leaves missing values out of the uniques it returns.
        self.categories_ = pd.Index(X.factorize()[1])
        return self

    def transform(self, X):
        """Return an (n_samples, 1) integer array of category codes for X."""
        categories = getattr(self, "categories_", None)
        if categories is None:
            # Legacy behaviour for an unfitted instance: number the labels
            # of X itself in order of first appearance.
            return X.factorize()[0].reshape(-1, 1)
        # get_indexer() yields -1 for every value absent from `categories`,
        # which covers both missing values and unseen labels.
        encoded = categories.get_indexer(X).reshape(-1, 1)
        return encoded
    
    
class GenerateTitles(BaseEstimator, TransformerMixin):
    """
    Adds a Title column to the Titanic DataFrame using the Name column.

    The title is the first run of letters in the name that is preceded by
    a space and followed by a period. Rare titles are folded into broader
    groups via ``TITLE_MAPPING``; titles not listed there are kept as-is.
    """

    # Rare or variant title -> group it is merged into.
    TITLE_MAPPING = {
        "Mlle": "Miss", "Mme": "Mrs", "Ms": "Miss", "Capt": "Military",
        "Col": "Military", "Major": "Military",  "Countess": "Nobility",
        "Don": "Nobility", "Dr": "Medical", "Jonkheer": "Nobility",
        "Lady": "Nobility", "Rev": "Religous", "Sir": "Nobility",
    }

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Return X with a Title column; the input frame is not modified."""
        # Check if the titles have already been added
        if "Title" in X.columns:
            return X
        # Work on a copy so the caller's DataFrame does not silently gain
        # a column as a side effect of transforming it.
        X = X.copy()
        # Raw string: "\." must reach the regex engine as an escaped period.
        titles = X["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
        # No mapped-to group is itself a key, so a single replace pass is
        # equivalent to substituting the titles one at a time.
        X["Title"] = titles.replace(self.TITLE_MAPPING)
        return X
    
    
class SumColumns(BaseEstimator, TransformerMixin):
    """
    Creates a new column for the sum of
    the values in a given set of columns.

    Parameters
    ----------
    in_cols : list of column labels whose values are added together row-wise.
    out_col : label of the column that receives the sum.
    """
    def __init__(self, in_cols, out_col):
        self.in_cols = in_cols
        self.out_col = out_col

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Return a copy of X with ``out_col`` added; X itself is untouched."""
        # Copy first so the caller's DataFrame is not mutated as a side
        # effect of running the transformer.
        X = X.copy()
        X[self.out_col] = X.loc[:, self.in_cols].sum(axis=1)
        return X

### 4. Define pipelines for feature generation
Using our custom transformers and several of scikit-learn's built-in transformers, we define functions that build pipelines, each of which generates specific features. Some of the pipelines are generic enough to generate features of a certain family (turning string columns into one hot vectors) while others are specific to a column (generating one hot vectors for the titles).

In [4]:
def family_size_pipeline(to_one_hots=False):
    """
    Build a pipeline producing the Family_Size feature (SibSp + Parch).

    When ``to_one_hots`` is truthy, a OneHotEncoder step is appended so the
    family size is emitted as a one hot vector instead of a single number.
    """
    one_hot_steps = [('one_hot', OneHotEncoder())] if to_one_hots else []
    return Pipeline([
        ('summer', SumColumns(["SibSp", "Parch"], "Family_Size")),
        ('selector', DataFrameSelector(["Family_Size"])),
    ] + one_hot_steps)


def numerics_pipeline(attribute_names, scaling=False):
    """
    Build a pipeline for numeric columns.

    The columns in ``attribute_names`` are selected and their missing
    values are filled with the column median. When ``scaling`` is truthy,
    a StandardScaler step is added after the imputer.
    """
    steps = [('selector', DataFrameSelector(attribute_names))]
    steps.append(('imputer', Imputer(strategy="median")))
    if scaling:
        steps.append(('std_scaler', StandardScaler()))
    return Pipeline(steps)


def ordinal_category_pipeline(attribute_name):
    """
    Build a pipeline that selects ``attribute_name`` and one hot encodes
    the selected values with OneHotEncoder.
    """
    select_step = ('selector', DataFrameSelector(attribute_name))
    encode_step = ('one_hot', OneHotEncoder())
    return Pipeline([select_step, encode_step])


def str_category_pipeline(attribute_name):
    """
    Build a pipeline that turns a string column into one hot vectors.

    The column is selected, converted to integer codes, has any -1 codes
    replaced by the most frequent code, and is then one hot encoded.
    """
    # -1 is what factorizing produces for a missing value (needed for
    # Embarked), so the imputer treats -1 as its "missing" marker.
    fill_missing = Imputer(missing_values=-1, strategy="most_frequent")
    steps = [
        ('selector', DataFrameSelector(attribute_name)),
        ('factorize', CategoryFactorizer()),
        ('imputer', fill_missing),
        ('one_hot', OneHotEncoder()),
    ]
    return Pipeline(steps)


def title_pipeline():
    """
    Build the pipeline that turns passenger names into one hot title vectors.

    Steps: derive the Title column from Name, select it, convert it to
    integer codes, replace any -1 code with the most frequent title code,
    and one hot encode the result.
    """
    return Pipeline([
        ('titler', GenerateTitles()),
        ('selector', DataFrameSelector("Title")),
        ('factorize', CategoryFactorizer()),
        # A name without a recognisable title factorizes to -1, which must
        # not reach the one hot encoder; handle it the same way
        # str_category_pipeline handles missing values.
        ('imputer', Imputer(missing_values=-1, strategy="most_frequent")),
        ('one_hot', OneHotEncoder()),
    ])

### 5. Define the full final pipeline
We use scikit-learn's FeatureUnion to combine the outputs of our various pipelines, side by side, into a single feature matrix to feed into our model.

In [5]:
# Every branch receives the same input frame; FeatureUnion places their
# outputs side by side in the order the branches are listed here.
final_pipeline = FeatureUnion([
    # Family size as a single number (no one hot encoding).
    ("family_size_pipeline", family_size_pipeline()),
    # Median-imputed and standardised numeric columns.
    ("num_scale_pipeline", numerics_pipeline(["Fare", "Age"], scaling=True)),
    # Passenger class as a one hot vector.
    ("ord_to_one_hot", ordinal_category_pipeline(["Pclass"])),
    # String categories as one hot vectors.
    ("sex_pipeline", str_category_pipeline("Sex")),
    ("embarked_pipeline", str_category_pipeline("Embarked")),
    # Title extracted from the passenger name, as a one hot vector.
    ("title_generation", title_pipeline()),
])

### 6. Train our model
We will use a random forest classifier as it seems to be the default model people are using right now (and it had the best performance for this set of features). We perform a randomized search with cross validation over two of the hyperparameters for our model. We then select the best model as specified by the default scoring method (accuracy for random forest classifiers).

In [6]:
# Fit the feature pipeline on the training frame and encode it in one go.
train_X_prep = final_pipeline.fit_transform(train_X)

# Distributions the randomized search samples each hyperparameter from.
params = dict(
    max_depth=randint(150, 350),
    min_samples_split=randint(10, 25),
)

rnd_clf = RandomForestClassifier(n_estimators=500)
forest_search = RandomizedSearchCV(
    rnd_clf,
    param_distributions=params,
    n_iter=400,  # number of sampled hyperparameter combinations
    n_jobs=6,    # parallel workers
)
forest_search.fit(train_X_prep, train_y)

# Keep the best estimator found by the search.
final_model = forest_search.best_estimator_

### 7. Prepare our submission
We transform the test data using our feature pipeline and then use our best model to predict survival. The predictions are then written out to a csv file in the format Kaggle expects.

In [9]:
# Encode the test set with the pipeline that was already fitted on the
# training data. Calling fit_transform here would re-learn the imputation
# values, the scaling statistics and the encoders from the test set, so the
# resulting features would no longer line up with what the model was
# trained on.
test_X_prep = final_pipeline.transform(test_X)
predictions = final_model.predict(test_X_prep)

# One row per test passenger, indexed like the test frame, with a single
# Survived column holding the prediction.
submission_df = pd.DataFrame(
    data=predictions,
    index=test_X.index,
    columns=['Survived']
)
submission_df.to_csv('submission.csv')