In [None]:
import pandas as pd
import numpy as np

# Introduction and EDA

First, read the train data and look at the features

In [None]:
# Load the training set. `train_data_use` is an untouched copy kept for the
# preprocessing pipeline, because the exploratory cells below add columns
# to `train_data`.
train_data = pd.read_csv("../input/train.csv")
train_data_use = train_data.copy()
print(train_data.shape)
train_data.head(20)

As seen above, most variables are categorical at their core, even though many of them are expressed numerically. The only purely numerical variables/features are Fare and Age.

In [None]:
# Summary statistics of the two numerical features.
train_data[['Age','Fare']].describe()

In [None]:
# Column dtypes and non-null counts (used to spot missing values).
train_data.info()

It can be seen that there are missing values in Age, Cabin and Embarked

## Name

Here we parse the names in search of titles such as Miss., Mr. and others. After that we group them to see whether a full one-hot encoding is worthwhile or whether it is better to rearrange them into fewer categories.

In [None]:
import re
# Pattern for a title: a capitalised word that follows a whitespace character
# and is immediately followed by a period (the period itself is not captured),
# e.g. the "Mr" in "Braund, Mr. Owen Harris".
reg_exp = r'(?<=(\s))[A-Z][a-z]*(?=\.)'
reg_comp = re.compile(reg_exp)

In [None]:
def title_extracter(string):
    """Return the title (e.g. 'Mr', 'Miss') contained in a passenger name.

    The title is the first match of the module-level pattern ``reg_comp``:
    a capitalised word preceded by whitespace and followed by a period.

    Parameters
    ----------
    string : str
        Full passenger name.

    Returns
    -------
    str or None
        The title without the trailing period, or ``None`` when ``string`` is
        not a string (e.g. NaN for a missing name) or contains no title.
    """
    if not isinstance(string, str):
        # Nothing to parse; the regex search would raise TypeError.
        return None
    match = reg_comp.search(string)
    # search() yields None when nothing matches; calling .group on it would
    # raise AttributeError.
    return match.group(0) if match else None

In [None]:
# Extract the title from every passenger name.
train_data['Title'] = train_data['Name'].apply(title_extracter)

In [None]:
# Quick look at the extracted titles.
train_data['Title'].head(5)

In [None]:

# For each title: (sum of Survived / number of rows, number of rows).
train_data.groupby('Title').apply(lambda x: (x['Survived'].sum()/x['Survived'].count(),x['Survived'].count()))


Given these results we have decided to divide the names into three categories: high class men, men and women

In [None]:
def name_grouper(name):
    """Map a title to its group code.

    0 -> 'Dr', 'Major', 'Master', 'Sir', 'Col'
    1 -> 'Capt', 'Don', 'Jonkheer', 'Mr', 'Rev'
    2 -> any other value
    """
    groups = (
        (0, ('Dr', 'Major', 'Master', 'Sir', 'Col')),
        (1, ('Capt', 'Don', 'Jonkheer', 'Mr', 'Rev')),
    )
    for code, titles in groups:
        if name in titles:
            return code
    # Everything not listed above falls into the last group.
    return 2

In [None]:
# Encode each title as its group code.
train_data['Title_enc'] = train_data['Title'].apply(name_grouper)

In [None]:
# Quick look at the encoded title groups.
train_data['Title_enc'].head(5)

## Siblings or Spouse

Which values does SibSp take?


In [None]:
# Distinct SibSp values present in the training data.
print(np.unique(train_data['SibSp']))

Let's see if all the values are representative towards surviving or not.

In [None]:
# For each SibSp value: (sum of Survived / number of rows, number of rows).
train_data.groupby('SibSp').apply(lambda x: (x['Survived'].sum()/x['Survived'].count(), x['Survived'].count()))

We are going to use 3 categories, based on the percentage of survivors and the number of passengers in each group: 0, [1,2], and the rest.

In [None]:
def sibsp_grouper(number):
    """Map a SibSp count to its group code.

    0 -> 3, 4, 5 or 8
    1 -> 1 or 2
    2 -> any other value
    """
    if number in (1, 2):
        return 1
    return 0 if number in (3, 4, 5, 8) else 2

In [None]:
# Encode each SibSp count as its group code.
train_data['SibSp_enc'] = train_data['SibSp'].apply(sibsp_grouper)

## Parch

We follow the same strategy for Parch.

In [None]:
# Distinct Parch values present in the training data.
print(np.unique(train_data['Parch']))

In [None]:
# For each Parch value: (sum of Survived / number of rows, number of rows).
train_data.groupby('Parch').apply(lambda x: (x['Survived'].sum()/x['Survived'].count(), x['Survived'].count()))

We group people together in different groups

In [None]:
def parch_grouper(number):
    """Map a Parch count to its group code.

    0 -> 4, 5 or 6
    1 -> 1, 2 or 3
    2 -> any other value
    """
    code = 2  # default group for values not listed below
    if number in (1, 2, 3):
        code = 1
    elif number in (4, 5, 6):
        code = 0
    return code

In [None]:
# Encode each Parch count as its group code.
train_data['Parch_enc'] = train_data['Parch'].apply(parch_grouper)

## Preprocessing

Let's build a full processing pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
class column_selector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of a DataFrame.

    Inherits from ``BaseEstimator`` so that it exposes ``get_params`` /
    ``set_params`` and can be cloned when used inside pipelines that are
    cross-validated or grid-searched.

    Parameters
    ----------
    attributes : list
        Column labels used to index the input, e.g. ``['Age', 'Fare']``.
    """

    def __init__(self, attributes):
        self.attributes = attributes

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.attributes]``."""
        return X[self.attributes]

In [None]:
class data_transformations(BaseEstimator, TransformerMixin):
    """Add the engineered columns used by the categorical pipeline.

    ``transform`` derives ``Title`` from ``Name`` and the grouped encodings
    ``Title_enc``, ``SibSp_enc`` and ``Parch_enc`` from ``Title``, ``SibSp``
    and ``Parch`` respectively.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the four engineered columns added."""
        # Work on a copy so the caller's DataFrame is not modified as a
        # side effect of transforming it.
        X = X.copy()
        X['Title'] = X['Name'].apply(title_extracter)
        X['Title_enc'] = X['Title'].apply(name_grouper)
        X['SibSp_enc'] = X['SibSp'].apply(sibsp_grouper)
        X['Parch_enc'] = X['Parch'].apply(parch_grouper)
        return X

The following imputer is taken from https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn

In [None]:
class DataFrameImputer(BaseEstimator, TransformerMixin):
    """Impute missing values in a DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the mean of the column.

    After ``fit`` the learned fill values are stored in ``self.fill``,
    a Series indexed by column label.
    """

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X`` and return ``self``."""
        fill_values = []
        for c in X:
            if X[c].dtype == np.dtype('O'):
                counts = X[c].value_counts()
                # value_counts() ignores NaN, so a column with no observed
                # value has no "most frequent" entry; keep NaN as its fill
                # value instead of raising IndexError.
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(X[c].mean())

        self.fill = pd.Series(fill_values, index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the learned values."""
        return X.fillna(self.fill)

In [None]:
# `sklearn.preprocessing.Imputer` is deprecated (and removed in later
# scikit-learn releases); `SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer

num_attributes = ['Age','Fare']
strategy = 'median'

# Numerical branch: select the columns, fill missing values with the column
# median, then standardise.
num_pipeline = Pipeline([
    ('selector', column_selector(num_attributes)),
    ('imputer', SimpleImputer(strategy=strategy)),
    ('scaler', StandardScaler())
])

In [None]:
# Sanity check: run the numerical pipeline on the training frame.
num_pipeline.fit_transform(train_data)

And now the categorical pipeline

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
cat_attributes = ['Pclass', 'Sex','SibSp_enc', 'Parch_enc', 'Embarked', 'Title_enc']

# Categorical branch: select the columns, fill missing values, then one-hot
# encode. `handle_unknown='ignore'` encodes a category that was not seen
# during fit as all zeros instead of raising when the fitted pipeline is
# later applied to new data.
cat_pipeline = Pipeline([
    ('selector', column_selector(cat_attributes)),
    ('imputer', DataFrameImputer()),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Sanity check: run the categorical pipeline on the training frame.
cat_pipeline.fit_transform(train_data)

In [None]:
# Pipeline holding only the feature-engineering step, which adds the
# Title, Title_enc, SibSp_enc and Parch_enc columns.
glob_pipe = Pipeline([
    ('adder', data_transformations())
])

In [None]:
# Fit before transforming: the step learns nothing, but calling transform on
# a Pipeline that was never fitted is not supported by recent scikit-learn.
trans_data = glob_pipe.fit_transform(train_data_use)

In [None]:
from sklearn.pipeline import FeatureUnion

And finally the full pipeline

In [None]:
# Combine the outputs of the numerical and categorical pipelines into a
# single feature matrix.
total_pipeline = FeatureUnion([
    ('num_pipe', num_pipeline),
    ('cat_pipe', cat_pipeline)
])

In [None]:
# Fit the preprocessing on the training data and build the model matrix.
train_trans = total_pipeline.fit_transform(trans_data)

# Models

We are going to try different models: logistic regression, random forest and support vector classifier

In [None]:
# Target vector.
y_train = train_data['Survived']

In [None]:
from sklearn.model_selection import cross_val_score

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# 5-fold cross-validated accuracy of a logistic regression.
l_r = LogisticRegression(solver='lbfgs')
scores = cross_val_score(
    l_r, train_trans, y_train,
    scoring='accuracy', cv=5, n_jobs=-1, verbose=3,
)
print(scores.mean(), scores.std())

##  Random Forests

In [None]:
from sklearn.ensemble import  RandomForestClassifier

In [None]:
# 5-fold cross-validated accuracy of a random forest. The forest is seeded
# so the reported scores are reproducible between runs.
r_f = RandomForestClassifier(random_state=42)
scores = cross_val_score(r_f, train_trans, y_train, cv=5, n_jobs=-1, verbose=3, scoring="accuracy")
print(np.mean(scores), np.std(scores))

## SVM classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# 5-fold cross-validated accuracy of a support vector classifier.
# probability=True is needed later for soft voting; its probability
# calibration is randomised, so seed it for reproducible ensemble results.
svc = SVC(gamma='scale', probability=True, random_state=42)
scores = cross_val_score(svc, train_trans, y_train, cv=5, n_jobs=-1, verbose=3, scoring="accuracy")
print(np.mean(scores), np.std(scores))

# Creating a mixed classifier

To combine the classifiers into an ensemble we use a voting classifier. Here we use soft voting, which means the class probabilities given by each classifier are averaged, weighted by the importance assigned to each classifier. We could instead have used hard voting, which assigns each sample the most frequent class among the classifiers' predictions.

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble; weights follow the estimator order (l_r=2, r_f=1, svc=2).
v_c = VotingClassifier(estimators=[('l_r', l_r), ('r_f', r_f),  ('svc',svc)], voting='soft', weights=[2,1,2]) 

In [None]:
# Fit each base model on its own on the full training set.
l_r.fit(train_trans, y_train)
r_f.fit(train_trans, y_train)
svc.fit(train_trans, y_train)

In [None]:
# Fit the voting ensemble on the full training set.
v_c.fit(train_trans, y_train)

In [None]:
# 5-fold cross-validated accuracy of the voting ensemble.
scores = cross_val_score(
    v_c, train_trans, y_train,
    scoring="accuracy", cv=5, n_jobs=-1, verbose=3,
)
print(scores.mean(), scores.std())

# Grid Search on Voting Classifier

And finally, a little bit of tuning with a grid search.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Fresh, seeded base estimators for the grid search. liblinear is used
# because the grid below tries both the l1 and l2 penalties.
l_r = LogisticRegression(solver='liblinear', random_state=42)
r_f = RandomForestClassifier(random_state=42)
# Same gamma setting as the SVC evaluated earlier, and seeded like the other
# estimators so the search is reproducible.
svc = SVC(gamma='scale', probability=True, random_state=42)
v_c = VotingClassifier(estimators=[('l_r', l_r), ('r_f', r_f),  ('svc',svc)], voting='soft', weights=[2,1,2]) 

In [None]:
# Search space for the ensemble; keys are "<estimator name>__<parameter>".
# The upper bound for max_features is the number of columns of the feature
# matrix, read from its shape instead of densifying the whole matrix.
param_grid = [
    {'l_r__C':np.arange(1,10,1), 'l_r__penalty':['l1','l2'], 'r_f__n_estimators':np.arange(1,30,5),
    'r_f__max_features': np.arange(2,train_trans.shape[1],2), 'svc__kernel':['rbf','poly','sigmoid'],
    'svc__C':np.arange(1,10,1)}
]

In [None]:
# 5-fold grid search over the ensemble, scored by accuracy; refit=True
# retrains the best configuration on the full data passed to fit.
grid_vot = GridSearchCV(v_c, param_grid=param_grid, cv=5,scoring='accuracy',n_jobs=-1, refit=True, verbose=3)

In [None]:
# Run the search (this is the expensive cell of the notebook).
grid_vot.fit(train_trans, y_train)

In [None]:
# Best parameter combination found by the search.
grid_vot.best_params_

In [None]:
# Keep the refitted best ensemble for prediction and display it.
best_v_c = grid_vot.best_estimator_
grid_vot.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
grid_vot.best_score_

In [None]:
from sklearn.externals import joblib

In [None]:
# Persist the tuned ensemble to disk.
name = "best_model.sav"
joblib.dump(best_v_c, name)

In [None]:
# Reload the persisted ensemble from disk.
best_v_c = joblib.load(name)

# Results on test set

In [None]:
# Load the test set.
test_data = pd.read_csv("../input/test.csv")

In [None]:
# Add the engineered columns to the test set.
trans_test = glob_pipe.transform(test_data)

In [None]:
# Reuse the preprocessing that was fitted on the training data. Refitting on
# the test set would change the imputation values, the scaling and possibly
# the one-hot columns, so the features would no longer match what the model
# was trained on.
final_test = total_pipeline.transform(trans_test)

In [None]:
# Predict the target for the test set with the tuned ensemble.
y_test_pred = best_v_c.predict(final_test)

In [None]:
# Assemble the submission table: one prediction per PassengerId.
pred_file = pd.DataFrame({'PassengerId':test_data['PassengerId'], 'Survived':y_test_pred})
pred_file.head(10)

In [None]:
# Write the submission file without the DataFrame index column.
pred_file.to_csv('submission.csv', index=False)