<h1 align="center">Titanic Dataset Classification Tutorial</h1>


---



## **(1) Preprocessor Function & Setup**

> ### A more advanced example demonstrating the flexibility of a new *Column Transformer* approach.

In [5]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global random number generator.
np.random.seed(0)

# Titanic dataset CSV, fetched from GitHub at ref 091d371.
titanic_url = (
    'https://raw.githubusercontent.com/amueller/'
    'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv'
)
data = pd.read_csv(titanic_url)

# Features used to train the classifier:
#   Numeric
#     - age: float
#     - fare: float
#   Categorical
#     - embarked: categories encoded as strings {'C', 'S', 'Q'}
#     - sex: categories encoded as strings {'female', 'male'}
#     - pclass: ordinal integers {1, 2, 3}

print(data.shape)

data.head()

(1309, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Target: the 'survived' column, relabelled from 0/1 to readable class names.
y = data['survived'].map({0: 'died', 1: 'survived'})

# Drop the target and the variables we have decided not to use for our analysis.
X = data.drop(columns=['survived', 'sibsp', 'parch', 'ticket', 'name', 'cabin',
                       'boat', 'body', 'home.dest'])

# Hold out 20% of the observations for testing and train on the other 80%.
# random_state is passed explicitly so that re-running this cell on its own
# reproduces the same split, instead of depending on how much of NumPy's
# global random state has been consumed since it was seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)



## Objective: Preprocess the columns into data that can be used to fit an ML model

Common preprocessing steps include:

1. Standardizing numeric variables using z-score or min-max transformations
2. Imputing missing values
3. One-hot encoding categorical variables


In [8]:
# Preprocess data using sklearn's Column Transformer approach

# Numeric columns: fill missing values with the column median, then standardize.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the modal value, then one-hot
# encode. Categories not seen during fitting are ignored at transform time.
# If the model ends up with perfect multicollinearity, switch the encoder to
# OneHotEncoder(drop='first', handle_unknown='error') to drop a category.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Final preprocessor: route each group of columns to its own pipeline and
# fit the whole thing on the training data.
preprocess = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]).fit(X_train)

In [10]:
# Apply the preprocessor (fitted on X_train) to the held-out test features.
preprocess.transform(X_test)

array([[ 0.66511788, -0.50535342,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.68870978, -0.24898038,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.98366557, -0.13159525,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.02802251, -0.40549389,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.29052517, -0.40549389,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.13125133, -0.50233662,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [None]:
# Write function to transform data with preprocessor

def preprocessor(data):
    """Transform ``data`` with the module-level ``preprocess`` object.

    Args:
        data: Input passed straight to ``preprocess.transform``.

    Returns:
        The result of ``preprocess.transform(data)``.
    """
    return preprocess.transform(data)

In [None]:
# Shape of the raw (untransformed) training features, then the frame itself.
print(X_train.shape)
X_train

(1047, 5)


Unnamed: 0,pclass,sex,age,fare,embarked
1118,3,male,25.0000,7.9250,S
44,1,female,41.0000,134.5000,C
1072,3,male,,7.7333,Q
1130,3,female,18.0000,7.7750,S
574,2,male,29.0000,21.0000,S
...,...,...,...,...,...
763,3,female,0.1667,20.5750,S
835,3,male,,8.0500,S
1216,3,female,,7.7333,Q
559,2,female,20.0000,36.7500,S


In [None]:
# Notice categorical feature columns have been one-hot encoded
# Transform once and reuse the result for both the shape and the display.
X_train_transformed = preprocess.transform(X_train)
print(X_train_transformed.shape)
X_train_transformed

(1047, 10)


array([[-0.37016209, -0.50478215,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.90402864,  1.97155505,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.13125133, -0.5085326 ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.13125133, -0.5085326 ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.7683467 ,  0.05915559,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.18729636, -0.35658342,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

## **(2) Build Your Model Using `sklearn`**

In [None]:
# Shapes of the train/test features followed by the train/test labels.
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

(1047, 5) (262, 5) (1047,) (262,)


In [None]:
# Penalized Logit...

# Candidate values for C: 100 points log-spaced between 1e1 and 1e10,
# all with an L2 penalty.
# NOTE(review): the recorded run picked C=10.0, the smallest value in this
# grid - consider extending the grid below 1e1.
hyperparameters = {'C': np.logspace(1, 10, 100), 'penalty': ['l2']}

logit = LogisticRegression()
# 10-fold cross-validated grid search over the candidates above.
# NOTE(review): the preprocessor was fitted on all of X_train before the
# folds are made; wrapping preprocess + model in one Pipeline would refit
# it per fold - confirm whether that matters for this tutorial.
logit_cv = GridSearchCV(logit, hyperparameters, cv=10)
logit_cv.fit(preprocess.transform(X_train), y_train)

# best_params_ is a dict, so print it as-is. The previous label contained an
# unformatted '{:.3f}' placeholder that was printed literally.
print("Best Parameters:", logit_cv.best_params_)

Best Parameters {:.3f}:  {'C': 10.0, 'penalty': 'l2'}


In [None]:
# Display the estimator selected by the grid search.
logit_cv.best_estimator_

LogisticRegression(C=10.0)

In [None]:
# Build the final model from the hyperparameters chosen by the grid search
# rather than hard-coding them, so this cell stays in sync with the search.
model = LogisticRegression(**logit_cv.best_params_)

# Preprocess the training features once and reuse them for fit and score.
X_train_preprocessed = preprocessor(X_train)

model.fit(X_train_preprocessed, y_train)  # Fitting to the training set.

model.score(X_train_preprocessed, y_train)  # Fit score, 0-1 scale.

0.7793696275071633

In [None]:
# Score the fitted model on the held-out test set (same 0-1 scale as above).
model.score(preprocessor(X_test),y_test)


0.7900763358778626