# Titanic ML

## Loading libraries

In [None]:
import numpy as np
import random
import pandas as pd
import tensorflow as tf
import shutil
from google.datalab.ml import TensorBoard
print(tf.__version__)

## Loading the data

In [None]:
# Read the passenger data from a CSV file in the working directory
df_input = pd.read_csv('titanic_data.csv')
# Show the first rows as a quick look at the columns
df_input.head()

* survival: Survival (0 = No; 1 = Yes)
* class: Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* name: Name
* sex: Sex
* age: Age
* sibsp: Number of Siblings/Spouses Aboard
* parch: Number of Parents/Children Aboard
* ticket: Ticket Number
* fare: Passenger Fare
* cabin: Cabin
* embarked: Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

## Exploratory Data Analysis (EDA)

In [None]:
# Total number of rows
print(len(df_input))

# How many of them survived? (Survived is 1 for survivors, 0 otherwise)
print(df_input.Survived.sum())

# How many of them did not survive?
print((df_input.Survived == 0).sum())

# Survival rate: fraction of passengers with Survived == 1
print(df_input.Survived.mean())

In [None]:
# Survival rate (mean of Survived) by gender
df_input.groupby(['Sex'])['Survived'].mean()

In [None]:
# Passenger count, number of survivors and survival rate by gender
df_input.groupby(['Sex']).agg({'PassengerId': 'count', 'Survived':['sum', 'mean']})

In [None]:
# Passenger count, number of survivors and survival rate by gender and class
df_input.groupby(['Sex', 'Pclass']).agg({'PassengerId': 'count', 'Survived':['sum', 'mean']})

In [None]:
# Histogram of the Age column, split into 20 bins
df_input.Age.plot.hist(bins = 20)

## Tensorflow

**How is a model trained?**

    - Weights
    - Loss function

### Splitting the dataset

In [None]:
# Fix the NumPy seed so the random split below is reproducible
np.random.seed(1)

# Splitting into train and test: one uniform random number per row;
# True (value < 0.8) marks a training row, False an evaluation row
msk = np.random.rand(len(df_input)) < 0.8

# Peek at mask elements 1..9
print(msk[1:10])

# Use the vectorised NumPy reductions instead of the builtin sum(), which
# iterates over the array element by element in Python
print("Total True values: " + str(msk.sum()))
print("Total number of rows: " + str(len(msk)))
print("Rate of True values: " + str(msk.mean()))

In [None]:
# Train
df_train = df_input[msk]

# Evaluation
df_evaluation = df_input[~msk]

print("Train sample size is " + str(len(df_train)) + " and Evaluation sample size is: " + str(len(df_evaluation)))
print("Together: " + str(len(df_train) + len(df_evaluation))) 

### Sanity check

In [None]:
# The survival rate should be similar in both splits
print(df_train['Survived'].mean())
print(df_evaluation['Survived'].mean())

### Train and eval input functions to read from Pandas Dataframe

In [163]:
def make_train_input_fn(df, num_epochs, columns):
    """Build a shuffled training input function from a pandas DataFrame.

    Args:
        df: DataFrame holding the label and the feature columns.
        num_epochs: value forwarded as ``num_epochs`` to ``pandas_input_fn``.
        columns: column names to feed; ``columns[0]`` is used as the label.
            Note that the label column is also part of ``x``.

    Returns:
        The input function created by ``tf.estimator.inputs.pandas_input_fn``.
    """
    feature_frame = df[columns]
    label_series = df[columns[0]]
    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=label_series,
        batch_size=40,
        num_epochs=num_epochs,
        shuffle=True,
        queue_capacity=100
    )

In [164]:
def make_eval_input_fn(df, columns):
    """Build an unshuffled, single-pass evaluation input function.

    Args:
        df: DataFrame holding the label and the feature columns.
        columns: column names to feed; ``columns[0]`` is used as the label.
            Note that the label column is also part of ``x``.

    Returns:
        The input function created by ``tf.estimator.inputs.pandas_input_fn``.
    """
    # When we evaluate the model, we do it in just one epoch. This is
    # important, so state it explicitly instead of relying on the default.
    return tf.estimator.inputs.pandas_input_fn(
        x = df[columns],
        y = df[columns[0]],
        batch_size = 40,
        num_epochs = 1,
        shuffle = False,
        queue_capacity = 100
    )

In [165]:
def make_prediction_input_fn(df):
    """Build an unshuffled, label-free input function for prediction.

    Args:
        df: DataFrame whose columns are all passed as features.

    Returns:
        The input function created by ``tf.estimator.inputs.pandas_input_fn``.
    """
    input_fn = tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=None,
        batch_size=128,
        shuffle=False,
        queue_capacity=100
    )
    return input_fn

### Linear Classifier with tf.Estimator framework 

In [166]:
# Directory used as the estimator's model_dir; it is wiped before each training run
OUTDIR = './titanic_trained'

# Name of the label column in the DataFrame
target = 'Survived'

In [167]:
# First model: gender is the only feature
features = ['Sex']
# Label first, then the features (the input functions treat cols[0] as the label)
cols = [target] + features

# TensorFlow feature columns: Sex is a categorical column with a fixed vocabulary
features_tensorflow = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        key=features[0],
        vocabulary_list=['female', 'male']
    )
]

### Training process

In [None]:
# Log at INFO level
tf.logging.set_verbosity(tf.logging.INFO)

# Start fresh each time: delete the previous model directory if it exists
shutil.rmtree(OUTDIR, ignore_errors=True)

# Linear classifier over the feature columns defined above
model = tf.estimator.LinearClassifier(
    feature_columns=features_tensorflow,
    model_dir=OUTDIR
)

# Train on the training split, passing num_epochs = 10 to the input function
model.train(
    input_fn=make_train_input_fn(df=df_train, num_epochs=10, columns=cols)
)

### Reporting loss

**Why do we need an evaluation dataset?**
    
    - The model can memorise the training data, in which case it does not scale (generalise) to unseen data.

In [77]:
def print_crossentropy(model, df, columns):
    """Evaluate ``model`` on ``df`` and print the reported average loss.

    Args:
        model: estimator exposing ``evaluate(input_fn=...)``.
        df: DataFrame to evaluate on.
        columns: column names passed to ``make_eval_input_fn``.
    """
    eval_input_fn = make_eval_input_fn(df=df, columns=columns)
    results = model.evaluate(input_fn=eval_input_fn)
    print('Cross entropy on dataset = {}'.format(results['average_loss']))

In [None]:
# Report the loss on the held-out evaluation split
print_crossentropy(model=model, df=df_evaluation, columns=cols)

### Improving loss function: Adding more features

In [None]:
# Shift Pclass down by one and store it in a new column
df_train.loc[:, 'Pclass_mod'] = df_train['Pclass'].sub(1)
df_train.head()

In [None]:
# Same shift for the evaluation split so both frames carry Pclass_mod
df_evaluation.loc[:, 'Pclass_mod'] = df_evaluation['Pclass'].sub(1)
df_evaluation.head()

In [None]:
# Number of rows with a missing Age before imputation
print("Missing values for Age: " + str(df_train['Age'].isnull().sum()))

# Replace missing ages with the mean age of the training split.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# df_train.Age is chained assignment on an intermediate Series and is not
# guaranteed to update df_train.
df_train.loc[:, 'Age'] = df_train['Age'].fillna(df_train['Age'].mean())

print("Missing values for Age after replacing: " + str(df_train['Age'].isnull().sum()))

In [112]:
# Fill missing evaluation ages with the TRAINING mean, so no evaluation
# statistics are used. The result is assigned back because
# fillna(inplace=True) on df_evaluation.Age is chained assignment and is not
# guaranteed to update df_evaluation.
df_evaluation.loc[:, 'Age'] = df_evaluation['Age'].fillna(df_train['Age'].mean())

In [113]:
# Second model: gender, zero-based passenger class and age
features = ['Sex', 'Pclass_mod', 'Age']
# Label first, then the features (the input functions treat cols[0] as the label)
cols = [target] + features

features_tensorflow = [
    # Sex: categorical with a fixed vocabulary
    tf.feature_column.categorical_column_with_vocabulary_list(
        key=features[0],
        vocabulary_list=['female', 'male']
    ),

    # Pclass_mod: integer identity column with three buckets
    tf.feature_column.categorical_column_with_identity(
        key=features[1],
        num_buckets=3
    ),

    # Age: plain numeric column
    tf.feature_column.numeric_column(
        key=features[2]
    )
]

In [None]:
# Log at INFO level
tf.logging.set_verbosity(tf.logging.INFO)

# Start fresh each time: delete the previous model directory if it exists
shutil.rmtree(OUTDIR, ignore_errors=True)

# Re-create the classifier with the extended feature columns
model = tf.estimator.LinearClassifier(
    feature_columns=features_tensorflow,
    model_dir=OUTDIR
)

# Train on the training split, passing num_epochs = 10 to the input function
model.train(
    input_fn=make_train_input_fn(df=df_train, num_epochs=10, columns=cols)
)

In [None]:
# Loss on the evaluation split with the extended feature set
print_crossentropy(model=model, df=df_evaluation, columns=cols)

### Improving loss function: Feature engineering

In [None]:
# Build the combined class+gender label (e.g. '1female') on both splits
for frame in (df_train, df_evaluation):
    frame.loc[:, 'classSex'] = frame['Pclass'].astype(str) + frame['Sex']
df_train.head()

In [122]:
# Third model: the engineered class+gender feature plus age
features = ['classSex', 'Age']
# Label first, then the features (the input functions treat cols[0] as the label)
cols = [target] + features

features_tensorflow = [
    # classSex: categorical over the six class/gender combinations
    tf.feature_column.categorical_column_with_vocabulary_list(
        key=features[0],
        vocabulary_list=['1female', '2female', '3female', '1male', '2male', '3male']
    ),

    # Age: plain numeric column
    tf.feature_column.numeric_column(
        key=features[1]
    )
]

In [None]:
# Log at INFO level
tf.logging.set_verbosity(tf.logging.INFO)

# Start fresh each time: delete the previous model directory if it exists
shutil.rmtree(OUTDIR, ignore_errors=True)

# Re-create the classifier with the engineered feature columns
model = tf.estimator.LinearClassifier(
    feature_columns=features_tensorflow,
    model_dir=OUTDIR
)

# Train on the training split, passing num_epochs = 10 to the input function
model.train(
    input_fn=make_train_input_fn(df=df_train, num_epochs=10, columns=cols)
)

In [None]:
# Loss on the evaluation split with the engineered feature
print_crossentropy(model=model, df=df_evaluation, columns=cols)

## More metrics
    - Why is reporting only the loss function a bad idea? The business does not care about it, since the loss value carries no business meaning.

### Accuracy

**How is it calculated?**

$\frac{TP+TN}{Total}$

In [127]:
# Accuracy as reported by the estimator's evaluate()
def print_accuracy(model, df, columns):
    """Evaluate ``model`` on ``df`` and print the reported accuracy.

    Args:
        model: estimator exposing ``evaluate(input_fn=...)``.
        df: DataFrame to evaluate on.
        columns: column names passed to ``make_eval_input_fn``.
    """
    eval_input_fn = make_eval_input_fn(df=df, columns=columns)
    results = model.evaluate(input_fn=eval_input_fn)
    print('Accuracy on dataset = {}'.format(results['accuracy']))


In [None]:
# Accuracy on the evaluation split
print_accuracy(model=model, df=df_evaluation, columns=cols)

**What's wrong with this metric?**
    - It is threshold dependent.
    - It is misleading on imbalanced datasets.


### Precision

**How is it calculated?**

$\frac{TP}{\mbox{Predicted positive}}$

In [129]:
# Precision as reported by the estimator's evaluate()
def print_precision(model, df, columns):
    """Evaluate ``model`` on ``df`` and print the reported precision.

    Args:
        model: estimator exposing ``evaluate(input_fn=...)``.
        df: DataFrame to evaluate on.
        columns: column names passed to ``make_eval_input_fn``.
    """
    eval_input_fn = make_eval_input_fn(df=df, columns=columns)
    results = model.evaluate(input_fn=eval_input_fn)
    print('Precision on dataset = {}'.format(results['precision']))

In [None]:
# Precision on the evaluation split
print_precision(model=model, df=df_evaluation, columns=cols)

**What's wrong with this metric?**
- It is threshold dependent.
- Model can underpredict positive class

### Recall

**How is it calculated?**

$\frac{TP}{P}$

In [132]:
# Recall as reported by the estimator's evaluate()
def print_recall(model, df, columns):
    """Evaluate ``model`` on ``df`` and print the reported recall.

    Args:
        model: estimator exposing ``evaluate(input_fn=...)``.
        df: DataFrame to evaluate on.
        columns: column names passed to ``make_eval_input_fn``.
    """
    eval_input_fn = make_eval_input_fn(df=df, columns=columns)
    results = model.evaluate(input_fn=eval_input_fn)
    print('Recall on dataset = {}'.format(results['recall']))

In [None]:
# Recall on the evaluation split
print_recall(model=model, df=df_evaluation, columns=cols)

**What's wrong with this metric?**
- It is threshold dependent.
- Model can overpredict positive class

### AUC

*The AUC is the probability the model will score a randomly chosen positive class higher than a randomly chosen negative class.*

In [136]:
# AUC as reported by the estimator's evaluate()
def print_auc(model, df, columns):
    """Evaluate ``model`` on ``df`` and print the reported AUC.

    Args:
        model: estimator exposing ``evaluate(input_fn=...)``.
        df: DataFrame to evaluate on.
        columns: column names passed to ``make_eval_input_fn``.
    """
    eval_input_fn = make_eval_input_fn(df=df, columns=columns)
    results = model.evaluate(input_fn=eval_input_fn)
    print('AUC on dataset = {}'.format(results['auc']))

In [None]:
# AUC on the evaluation split
print_auc(model=model, df=df_evaluation, columns=cols)

## Tensorboard

In [None]:
# Start TensorBoard on OUTDIR, the directory used as the estimator's model_dir
TensorBoard().start(OUTDIR)

In [None]:
# Stop every TensorBoard instance reported by TensorBoard.list()
pids_df = TensorBoard.list()
# Only read the 'pid' column when the listing is non-empty
active_pids = [] if pids_df.empty else pids_df['pid']
for pid in active_pids:
    TensorBoard().stop(pid)
    print('Stopped TensorBoard with pid {}'.format(pid))