# Titanic Survivors Demo + Exercise
* adapted from: https://blog.socialcops.com/engineering/machine-learning-python/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# IPython magic: draw matplotlib figures inline in the notebook output
%matplotlib inline
# Default size (width, height) applied to every figure created after this line
plt.rcParams['figure.figsize'] = (15.0, 8.0)

In [None]:
# Load the 'titanic3' sheet of the spreadsheet, treating the string 'NA' as missing.
# (A CSV copy of the data can be used instead if xlrd is not installed.)
data = pd.read_excel(
    'data/titanic3.xls',
    sheet_name='titanic3',
    index_col=None,
    na_values=['NA'],
)

## Legend
* pclass = Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* survived = Survival (0 = No; 1 = Yes)
* sibsp = number of Siblings/Spouses Aboard
* parch = number of Parents/Children Aboard
* ticket = ticket Number
* embarked (from...C = Cherbourg; Q = Queenstown; S = Southampton)
* boat = Lifeboat ID
* body = ID Number
* home.dest = Home/Destination

In [None]:
# Display the first few rows to get a feel for the columns
data.head()

## Feature selection: We only want to keep features which are predictive of the target
* these columns are unlikely to be meaningful and have several missing values so we'll drop them: boat, ticket, cabin, body

In [None]:
# Remove the columns we decided not to use as features
data = data.drop(['boat', 'ticket', 'cabin', 'body'], axis='columns')

## We can get a quick summary of how many people survived

In [None]:
# Number of rows for each distinct value of the 'survived' column
data['survived'].value_counts()

## Because we encode survival as 1 (and non-survival as 0), the mean of the column is the fraction of passengers who survived

In [None]:
# Mean of the 0/1 'survived' column
data['survived'].mean()

## If we group by passenger class and compute the mean, what does this reveal?

In [None]:
# Average every numeric column within each passenger class.
# numeric_only=True is required because the frame still holds text columns
# (e.g. name, sex): pandas >= 2.0 raises a TypeError on them instead of
# silently leaving them out as older versions did.
data.groupby('pclass').mean(numeric_only=True)

## .groupby() essentially splits the dataframe into a bunch of smaller dataframes, one for each combination of values of the grouping variables...
* we can see this by looking at the __`groups`__ attribute
* each grouping contains the rows of the dataframe
* so in the example below, rows 0, 2, 4, 6, 8, etc. are the females in 1st class

In [None]:
from pprint import pprint  # the standard library's "pretty printer"

# Print the mapping from each (pclass, sex) pair to the row labels in that group
pprint(data.groupby(by=['pclass', 'sex']).groups)

## If we group by class and sex and compute the mean, what does this tell us about the "Women" part of "Women and children first?"

In [None]:
# Average every numeric column for each (passenger class, sex) combination.
# numeric_only=True skips the remaining text columns (e.g. name), which
# pandas >= 2.0 would otherwise reject with a TypeError.
class_sex_grouping = data.groupby(['pclass', 'sex']).mean(numeric_only=True)
class_sex_grouping

## Let's render it as a bar chart to make it clear...

In [None]:
# Bar chart of the mean 'survived' value per (pclass, sex) group;
# the trailing semicolon suppresses the notebook's text output for the plot object
class_sex_grouping['survived'].plot(kind='bar');

## If we group by age, we can investigate the "Children" part of "Women and children first"

In [None]:
# Bin ages into ten-year intervals using the edges 0, 10, ..., 80
group_by_age = pd.cut(data.age, np.arange(0, 90, 10))
# observed=False spells out the long-standing default for categorical groupers
# (keep every bin), whose implicit use is deprecated in recent pandas.
# numeric_only=True skips the text columns, which pandas >= 2.0 would
# otherwise reject with a TypeError.
age_grouping = data.groupby(group_by_age, observed=False).mean(numeric_only=True)
age_grouping['survived'].plot.bar();

In [None]:
# Number of passengers that fall into each age bin
group_by_age.value_counts()

## A lot of missing data...

In [None]:
# Non-missing value count per column; columns with lower counts have gaps
data.count()

## We can fill in the missing age values with averages
* Is this a good strategy for this data?
* What could we exploit to make a better go of it? (Hint: Look at the class breakdowns above)

In [None]:
# Fill gaps in each numeric column with that column's mean.
# numeric_only=True is needed because the frame still contains text columns:
# pandas >= 2.0 raises a TypeError when asked for their mean, whereas older
# versions silently skipped them. Text columns are left untouched here.
data = data.fillna(data.mean(numeric_only=True))

## Now we are basically missing home/destination data and a few embarked results

In [None]:
# Re-check the non-missing count per column after filling with means
data.count()

## With only two results missing embarked data, we can probably just drop those samples

In [None]:
# Show the rows whose 'embarked' value is missing
data.loc[data['embarked'].isna()]

## There are a substantial number of missing home/destination values
* We don't know if that is an important feature, but we'd lose a lot of data if we throw those samples away, so let's fill it in with placeholders for now

In [None]:
# Replace missing home/destination entries with the placeholder string "NA"
data["home.dest"] = data["home.dest"].fillna(value="NA")

## Now with only the two embarked rows missing, we are in good shape, so let's drop those and then move on

In [None]:
# Non-missing count per column before dropping the incomplete rows
data.count()

In [None]:
# Discard every row that still has at least one missing value
data = data.dropna(axis=0, how='any')

In [None]:
# Every column should now report the same non-missing count
data.count()

## We turn categorical data (__`sex`__ and __`embarked`__) into numeric values using a scikit-learn __`LabelEncoder`__
* or we can use the __`.get_dummies()`__ function, but __`LabelEncoder`__ will be cleaner here

In [None]:
from sklearn import datasets, svm, model_selection, tree, preprocessing, metrics

In [None]:
def preprocess_titanic_df(df):
    """Return a copy of *df* prepared for a scikit-learn estimator.

    The ``sex`` and ``embarked`` columns are replaced by integer codes
    produced by a ``LabelEncoder``, and the free-text ``name`` and
    ``home.dest`` columns are dropped. The dataframe passed in is left
    unmodified.
    """
    processed_df = df.copy()  # work on a copy so the caller's dataframe is untouched

    # Encode each categorical column with its own encoder so one column's
    # fitted classes are never overwritten by the next. LabelEncoder numbers
    # the sorted labels, so for sex: female = 0, male = 1.
    # Bracket indexing is used because it always targets the column, whereas
    # attribute-style assignment can be shadowed by DataFrame attributes.
    for column in ('sex', 'embarked'):
        encoder = preprocessing.LabelEncoder()
        processed_df[column] = encoder.fit_transform(processed_df[column])

    return processed_df.drop(columns=['name', 'home.dest'])

## Preprocess our data, then verify that everything is suitable for an ML activity

In [None]:
# Build the encoded copy of the cleaned data and display its first rows
processed_df = preprocess_titanic_df(data)
processed_df.head()

## Now you take over...
* Split the data into training and test data
* Use a __`DecisionTreeClassifier`__ to predict whether each passenger survived
* Score your classifier
* Visualize the tree, like we did in the previous demo