# Titanic: Machine Learning from Disaster - v4
## Kaggle Competition - Christian Bramwell

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Functions and Classes

In [2]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    
    '''Create a DataFrame selector to choose between numerical or categorical features.'''
    
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [3]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std. Deviation:", scores.std())

In [4]:
def classification_Reports(y_test, predictions):
    
    print("Classification Report")
    print("")
    print(classification_report(y_test,predictions))
    
    print("Confusion Matrix")
    print("")
    print(confusion_matrix(y_test,predictions))

In [5]:
# Categorical Encoder is in Scikit-Learn -v 0.20.0

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.
    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).
    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):
        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:
        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature is
        present during transform (default is to raise). When this is parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.
    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).
    Examples
    --------
    Given a dataset with three features and two samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])
    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder assumes`` that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using one-hot encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

### Import Data

In [6]:
labeled_data = pd.read_csv("train.csv")
unlabeled_data = pd.read_csv("test.csv")
submit_passengerID = unlabeled_data["PassengerId"]

all_data = [labeled_data, unlabeled_data]

### Analyze the Data

In [7]:
print("Available features:\n", labeled_data.columns.values)
print("")
print("Numerical features:\n", list(labeled_data.select_dtypes(exclude="object").columns))
print("")
print("Categorical features:\n", list(labeled_data.select_dtypes(include="object").columns))

Available features:
 ['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']

Numerical features:
 ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Categorical features:
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [8]:
labeled_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
labeled_data.info()
print("_"*40)
unlabeled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null

In [10]:
labeled_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
labeled_data.describe(include=["O"])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Asplund, Master. Edvin Rojj Felix",male,347082,G6,S
freq,1,577,7,4,644


In [12]:
labeled_data[["Pclass", "Survived"]].groupby(by="Pclass", as_index=False).mean().sort_values(
    by="Survived", ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [13]:
labeled_data[["Sex", "Survived"]].groupby(by="Sex", as_index=False).mean().sort_values(
    by="Survived", ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [14]:
labeled_data[["SibSp", "Survived"]].groupby(by="SibSp", as_index=False).mean().sort_values(
    by="Survived", ascending=False)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [15]:
labeled_data[["Parch", "Survived"]].groupby(by="Parch", as_index=False).mean().sort_values(
    by="Survived", ascending=False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


In [16]:
pd.pivot_table(labeled_data, values="Survived", index="Embarked", columns="Sex")

Sex,female,male
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.876712,0.305263
Q,0.75,0.073171
S,0.689655,0.174603


### Data Cleaning and Preparation

1. Correlating - Quickly correlate features with the predicted label. We can match these quick correlations with the                    modeled correlations later in the project.
2. Completing - Complete any values of features that you plan on keeping in your model.
3. Correcting - Drop any features that are not useful to your model and bin any large outliers.
4. Creating - Feature engineer to create more features.
5. Converting - Convert categorical features to one-hot vectors and dates to dateformat.

In [17]:
print("Before dropping features:", labeled_data.shape, all_data[0].shape, unlabeled_data.shape, \
      all_data[1].shape)

labeled_data = labeled_data.drop(["Ticket", "Cabin"], axis=1)
unlabeled_data = unlabeled_data.drop(["Ticket", "Cabin"], axis=1)
all_data = [labeled_data, unlabeled_data]

print("After dropping features:", labeled_data.shape, all_data[0].shape, unlabeled_data.shape, \
      all_data[1].shape)

Before dropping features: (891, 12) (891, 12) (418, 11) (418, 11)
After dropping features: (891, 10) (891, 10) (418, 9) (418, 9)


In [18]:
for dataset in all_data:
    dataset["Title"] = dataset.Name.str.extract(" ([A-Za-z]+)\.", expand=False)
    
pd.crosstab(labeled_data["Title"], labeled_data["Sex"])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [19]:
for dataset in all_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
labeled_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [20]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in all_data:
    dataset["Title"] = dataset["Title"].map(title_mapping)
    dataset["Title"] = dataset["Title"].fillna(0)
    
labeled_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


In [21]:
labeled_data = labeled_data.drop(["PassengerId", "Name"], axis=1)
unlabeled_data = unlabeled_data.drop(["Name"], axis=1)
all_data = [labeled_data, unlabeled_data]
labeled_data.shape, all_data[0].shape, unlabeled_data.shape, all_data[1].shape

((891, 9), (891, 9), (418, 9), (418, 9))

In [22]:
for dataset in all_data:
    dataset["Sex"] = dataset["Sex"].map({"female": 1, "male": 0}).astype(int)
    
labeled_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,3
2,1,3,1,26.0,0,0,7.925,S,2
3,1,1,1,35.0,1,0,53.1,S,3
4,0,3,0,35.0,0,0,8.05,S,1


In [23]:
guess_ages = np.zeros((2,3))
guess_ages

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [24]:
for dataset in all_data:
    for i in range(0,2):
        for j in range(0,3):
            guess_df = dataset[(dataset["Sex"] == i) & \
                              (dataset["Pclass"] == j+1)]["Age"].dropna()
            
            age_guess = guess_df.median()
            
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0,2):
        for j in range(0,3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

labeled_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22,1,0,7.25,S,1
1,1,1,1,38,1,0,71.2833,C,3
2,1,3,1,26,0,0,7.925,S,2
3,1,1,1,35,1,0,53.1,S,3
4,0,3,0,35,0,0,8.05,S,1


In [25]:
labeled_data["Ageband"] = pd.cut(labeled_data["Age"], 5)
labeled_data[["Ageband", "Survived"]].groupby(by="Ageband", as_index=False).mean()

Unnamed: 0,Ageband,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [26]:
for dataset in all_data:
    dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1

labeled_data[["FamilySize", "Survived"]].groupby(by="FamilySize").mean().sort_values("Survived", ascending=False)

Unnamed: 0_level_0,Survived
FamilySize,Unnamed: 1_level_1
4,0.724138
3,0.578431
2,0.552795
7,0.333333
1,0.303538
5,0.2
6,0.136364
8,0.0
11,0.0


In [27]:
labeled_data = labeled_data.drop(["SibSp", "Parch", "Ageband"], axis=1)
unlabeled_data = unlabeled_data.drop(["SibSp", "Parch"], axis=1)
all_data = [labeled_data, unlabeled_data]

labeled_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,0,3,0,22,7.25,S,1,2
1,1,1,1,38,71.2833,C,3,2
2,1,3,1,26,7.925,S,2,1
3,1,1,1,35,53.1,S,3,2
4,0,3,0,35,8.05,S,1,1


In [28]:
for dataset in all_data:
    dataset["Age*Pclass"] = dataset["Age"] * dataset["Pclass"]

labeled_data[["Age*Pclass", "Age", "Pclass"]].head(10)

Unnamed: 0,Age*Pclass,Age,Pclass
0,66,22,3
1,38,38,1
2,78,26,3
3,35,35,1
4,105,35,3
5,75,25,3
6,54,54,1
7,6,2,3
8,81,27,3
9,28,14,2


In [29]:
fare_median_Pclass3 = unlabeled_data[unlabeled_data["Pclass"] == 3]["Fare"].median()

In [30]:
unlabeled_data.loc[unlabeled_data["Fare"].isnull(), "Fare"] = fare_median_Pclass3

In [31]:
freq_port = labeled_data.Embarked.dropna().mode()[0]
freq_port

'S'

In [32]:
for dataset in all_data:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    
labeled_data[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [33]:
labeled_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,Age*Pclass
0,0,3,0,22,7.25,S,1,2,66
1,1,1,1,38,71.2833,C,3,2,38
2,1,3,1,26,7.925,S,2,1,78
3,1,1,1,35,53.1,S,3,2,35
4,0,3,0,35,8.05,S,1,1,105


In [34]:
unlabeled_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,Age*Pclass
0,892,3,0,34,7.8292,Q,1,1,102
1,893,3,1,47,7.0,S,3,2,141
2,894,2,0,62,9.6875,Q,1,1,124
3,895,3,0,27,8.6625,S,1,1,81
4,896,3,1,22,12.2875,S,3,3,66


In [35]:
X = labeled_data.drop("Survived", axis=1)
y = labeled_data["Survived"]
unlabeled_data = unlabeled_data.drop("PassengerId", axis=1)
X.shape, y.shape, unlabeled_data.shape

((891, 8), (891,), (418, 8))

In [36]:
def cleaning_Pipeline(data):
    
    # Seperate numerical and cateogrial data
    num_data = list(data.select_dtypes(exclude='object'))
    cat_data = list(data.select_dtypes(include='object'))
    
    # Run numerical pipeline
    num_pipeline = Pipeline([
        ("selector", DataFrameSelector(num_data)),
        ("std_scaler", StandardScaler())
    ])
    
    # Run categorical pipeline
    cat_pipeline = Pipeline([
        ("selector", DataFrameSelector(cat_data)),
        ("cat_encoder", CategoricalEncoder(encoding="onehot-dense"))
    ])
    
    # Combine pipelines
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
    ])
    
    data_cleaned = full_pipeline.fit_transform(data)
    data_cleaned_df = pd.DataFrame(data_cleaned, columns=(num_data + ["C", "Q", "S"]))
    
    return data_cleaned_df

In [37]:
X_clean = cleaning_Pipeline(X)
unlabeled_data_clean = cleaning_Pipeline(unlabeled_data)

In [38]:
X_clean.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Title,FamilySize,Age*Pclass,C,Q,S
0,0.827377,-0.737695,-0.531048,-0.502445,-0.70755,0.05916,0.111569,0.0,0.0,1.0
1,-1.566107,1.355574,0.670256,0.786845,1.235215,0.05916,-0.781985,1.0,0.0,0.0
2,0.827377,1.355574,-0.230722,-0.488854,0.263832,-0.560975,0.49452,0.0,0.0,1.0
3,-1.566107,1.355574,0.445012,0.42073,1.235215,0.05916,-0.877722,0.0,0.0,1.0
4,0.827377,-0.737695,0.445012,-0.486337,-0.70755,-0.560975,1.356161,0.0,0.0,1.0


In [39]:
unlabeled_data_clean.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Title,FamilySize,Age*Pclass,C,Q,S
0,0.873482,-0.755929,0.362522,-0.497071,-0.73802,-0.553443,1.575103,0.0,1.0,0.0
1,0.873482,1.322876,1.359458,-0.511934,1.252253,0.105643,3.053787,0.0,0.0,1.0
2,-0.315819,-0.755929,2.509769,-0.463762,-0.73802,-0.553443,2.409232,0.0,1.0,0.0
3,0.873482,-0.755929,-0.174289,-0.482135,-0.73802,-0.553443,0.778889,0.0,0.0,1.0
4,0.873482,1.322876,-0.557726,-0.417159,1.252253,0.764728,0.210165,0.0,0.0,1.0


### Split X in Train, Validate, and Test Data

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.10, random_state=42)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

print("Train Data:", X_train.shape, y_train.shape)
print("Validation Data:", X_validation.shape, y_validation.shape)
print("Test Data:", X_test.shape, y_test.shape)

Train Data: (680, 10) (680,)
Validation Data: (121, 10) (121,)
Test Data: (90, 10) (90,)


### Select and Train Model

- Logistic Regression
- KNN or k-Nearest Neighbors
- Support Vector Machines
- Naive Bayes classifier
- Decision Tree
- Random Forrest
- Perceptron
- Artificial neural network
- RVM or Relevance Vector Machine

**Logistic Regression**

In [41]:
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
log_scores = cross_val_score(log_clf, X_validation, y_validation, scoring="average_precision", cv=5)
display_scores(log_scores)

Scores: [ 0.94882353  0.65845821  0.76415344  0.86228956  0.84363877]
Mean: 0.81547270159
Std. Deviation: 0.0980355107193


In [43]:
log_pred = log_clf.predict(X_test)
classification_Reports(y_test, log_pred)

Classification Report

             precision    recall  f1-score   support

          0       0.88      0.81      0.85        54
          1       0.75      0.83      0.79        36

avg / total       0.83      0.82      0.82        90

Confusion Matrix

[[44 10]
 [ 6 30]]


In [44]:
coeff_df = pd.DataFrame(labeled_data.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(log_clf.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Sex,1.040515
4,Embarked,0.481787
7,Age*Pclass,0.145421
3,Fare,0.092192
6,FamilySize,-0.13768
2,Age,-0.310449
5,Title,-0.486328
0,Pclass,-0.755369


**Decision Tree**

In [45]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [46]:
tree_scores = cross_val_score(tree_clf, X_validation, y_validation, scoring="average_precision", cv=5)
display_scores(tree_scores)

Scores: [ 0.85        0.86111111  0.625       0.81818182  0.63425926]
Mean: 0.75771043771
Std. Deviation: 0.105563349854


In [47]:
tree_pred = tree_clf.predict(X_test)
classification_Reports(y_test, tree_pred)

Classification Report

             precision    recall  f1-score   support

          0       0.84      0.76      0.80        54
          1       0.68      0.78      0.73        36

avg / total       0.78      0.77      0.77        90

Confusion Matrix

[[41 13]
 [ 8 28]]


**Random Forest**

In [48]:
forest_clf = RandomForestClassifier(n_estimators=100)
forest_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [49]:
forest_scores = cross_val_score(forest_clf, X_validation, y_validation, scoring="average_precision", cv=5)
display_scores(forest_scores)

Scores: [ 0.95581197  0.80307422  0.90432099  0.97979798  0.90929533]
Mean: 0.910460097605
Std. Deviation: 0.0607211755658


In [50]:
forest_pred = forest_clf.predict(X_test)
classification_Reports(y_test, forest_pred)

Classification Report

             precision    recall  f1-score   support

          0       0.91      0.89      0.90        54
          1       0.84      0.86      0.85        36

avg / total       0.88      0.88      0.88        90

Confusion Matrix

[[48  6]
 [ 5 31]]


**K-Nearest Neighbors**

In [51]:
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [52]:
knn_scores = cross_val_score(knn_clf, X_validation, y_validation, scoring="average_precision", cv=5)
display_scores(knn_scores)

Scores: [ 0.89113636  0.64351852  0.87654321  0.95555556  0.63194444]
Mean: 0.799739618406
Std. Deviation: 0.134974823894


In [53]:
knn_pred = knn_clf.predict(X_test)
classification_Reports(y_test, knn_pred)

Classification Report

             precision    recall  f1-score   support

          0       0.90      0.83      0.87        54
          1       0.78      0.86      0.82        36

avg / total       0.85      0.84      0.85        90

Confusion Matrix

[[45  9]
 [ 5 31]]


**Support Vector Machine**

In [54]:
svc_clf = SVC()
svc_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [55]:
svc_scores = cross_val_score(svc_clf, X_validation, y_validation, scoring="average_precision", cv=5)
display_scores(svc_scores)

Scores: [ 0.94347826  0.78940626  0.76900353  0.95356341  0.85770402]
Mean: 0.862631097235
Std. Deviation: 0.0761015428868


In [56]:
svc_pred = svc_clf.predict(X_test)
classification_Reports(y_test, svc_pred)

Classification Report

             precision    recall  f1-score   support

          0       0.89      0.87      0.88        54
          1       0.81      0.83      0.82        36

avg / total       0.86      0.86      0.86        90

Confusion Matrix

[[47  7]
 [ 6 30]]


### Model Evaluation

In [57]:
models = pd.DataFrame({
    "Model": ["Logistic Regression", "Decision Tree", "Random Forest", "K-Nearest Neighbors",
             "Support Vector Machine"],
    "Score": [log_scores.mean(), tree_scores.mean(), forest_scores.mean(), knn_scores.mean(),
              svc_scores.mean()],
    "Std. Deviation": [log_scores.std(), tree_scores.std(), forest_scores.std(), knn_scores.std(),
              svc_scores.std()]})
models.sort_values(by="Score", ascending=False)

Unnamed: 0,Model,Score,Std. Deviation
2,Random Forest,0.91046,0.060721
4,Support Vector Machine,0.862631,0.076102
0,Logistic Regression,0.815473,0.098036
3,K-Nearest Neighbors,0.79974,0.134975
1,Decision Tree,0.75771,0.105563


### Gridsearch Random Forest Hyperparameters

**Random Forest with Hyperparameters**

In [58]:
forest_param_grid = [
    {"n_estimators": [1, 10, 50, 75, 100, 150], "max_features": [1, 2, 4, 6, 8, 10]},
    {"bootstrap": [False], "n_estimators": [1, 10, 50, 75, 100, 150], "max_features": [1, 2, 4, 6, 8, 10]}
]

forest_clf_hyp = RandomForestClassifier()

forest_grid_search = GridSearchCV(forest_clf_hyp, forest_param_grid, cv=5, scoring="average_precision")
forest_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [1, 10, 50, 75, 100, 150], 'max_features': [1, 2, 4, 6, 8, 10]}, {'bootstrap': [False], 'n_estimators': [1, 10, 50, 75, 100, 150], 'max_features': [1, 2, 4, 6, 8, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='average_precision', verbose=0)

In [59]:
forest_grid_search.best_params_

{'max_features': 6, 'n_estimators': 150}

In [60]:
forest_grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [61]:
forest_grid_search.best_score_

0.818933471147267

In [62]:
forest_hyp_pred = forest_grid_search.predict(X_validation)

In [63]:
classification_Reports(y_validation, forest_hyp_pred)

Classification Report

             precision    recall  f1-score   support

          0       0.84      0.91      0.87        75
          1       0.82      0.72      0.77        46

avg / total       0.83      0.83      0.83       121

Confusion Matrix

[[68  7]
 [13 33]]


**Support Vector Machine with Hyperparameters**

In [64]:
svc_param_grid = [
    {"C": [0.001, 0.01, 0.1, 1, 10], "kernel": ['linear', 'poly', 'rbf']}
]

svc_clf_hyp = SVC()

svc_grid_search = GridSearchCV(svc_clf_hyp, svc_param_grid, cv=5, scoring="average_precision")
svc_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='average_precision', verbose=0)

In [65]:
svc_grid_search.best_params_

{'C': 1, 'kernel': 'rbf'}

In [66]:
svc_grid_search.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [67]:
svc_grid_search.best_score_

0.8325027536005255

In [68]:
svc_hyp_pred = svc_grid_search.predict(X_validation)

In [69]:
classification_Reports(y_validation, svc_hyp_pred)

Classification Report

             precision    recall  f1-score   support

          0       0.88      0.93      0.90        75
          1       0.88      0.78      0.83        46

avg / total       0.88      0.88      0.87       121

Confusion Matrix

[[70  5]
 [10 36]]


### Predict Unlabeled Data for Submission

In [70]:
final_model = svc_grid_search.best_estimator_

In [71]:
submit_pred = final_model.predict(unlabeled_data_clean)

In [72]:
submit_data = pd.DataFrame({"PassengerId": submit_passengerID, "Survived": submit_pred})

In [75]:
submit_data.to_csv("submission_file.csv", index=False)