In [1]:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv(
  filepath_or_buffer='https://raw.githubusercontent.com/aaronmcdaid/P2---Data-Analytics-With-Python/master/Berlin/WS%202019/5.%20Datasets/titanic_dataset/train.csv',
)

In [0]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
# Just a quick overview of the dataset (ideally we'd do a lot more here, but for simplicity's sake)
df.describe(include='all').loc['count']

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
Name: count, dtype: object

In [0]:
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891.0,891,889
unique,,,,891,2,,,,681.0,,148,3
top,,,,"Sage, Miss. Stella Anna",male,,,,1601.0,,unknown,S
freq,,,,1,577,,,,7.0,,687,644
mean,446.0,0.383838,2.308642,,,29.361582,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,13.019697,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,22.0,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,31.0,,


In [0]:
# Seems like both Age, Cabin and Embarked have missing data
# We'll just fill the nulls with what we call "uninformative priors"
# i.e. we have no clue, so we want those numbers to  not bias the model

# For a numerical feature, that is the median

df.Age = df.Age.fillna(
    df.Age.median()
)

# For a categorical, that is the mode (most common category)
df.Embarked = df.Embarked.fillna(
    df.Embarked.mode()
)

# Finally, for the cabin we have no clue at all so we call it unknown so it doesn't bug us
# We won't use this column for much anyway

df.Cabin = df.Cabin.fillna("unknown")

# Welcome to Machine Learning!

Today we will go over a basic (basic!) Machine Learning pipeline from dataset to prediction.

We have the (in)famous Titanic dataset, and with that, we want you to try to predict who made it home from the wreckage, basically, who `Survived == True`

## How this class will be structured

We will start from the very basic, and then start building a pipeline incrementally. There's only one rule: After the first time we run the model, every cell must produce a prediction.

Why? Because this is how we (PM's, the annoying people in your team) will push you to work. Deliver a working (not perfect) model every time, and grow incrementally.

Here's what we'll do:

1. Short theoretical introduction (trust me, you'll need it)
1. Create, train and validate our first model step by step
1. Evaluate the results, how do we feel about them?
1. Enhancement: Feature engineering! and re-run




## Create, train and validate our first machine learning model

### First, we need a model to train

Models have different applications, and as you grow into the field you'll understand them better.

For this example, we will use a very generic and powerful classifier: the random forest.

For any model you usually have to define a few parameters. Since they will be constant throughout most of the training process, we call them **Hyperparameters**.



In [0]:

model = RandomForestClassifier(
    n_estimators=100, # Hyperparameter 1
    max_depth=2,      # Hyperparameter 2
    random_state=0    
)

In [0]:
model # As you can see, there's many more hyperparameters that were defined for us
      # While it helps to understand what all of those do, you can get very far with predefined ones

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

Do you see any reference to df anywhere in that code?

### Second, we need data to train it on

Scikit-Learn has a standard interface for all models

```
model.fit(X, y)
```

Will take a generic model and some training data and return a trained model. A trained model can do predicitions that will (maybe) make sense.

X is a dataset of **FEATURES**, the data that we will use to predict stuff

y is a single column of **TARGETS**, the correct answers we want the model to learn how to generate

Sice this interface is predefined and we cannot really change it, we will need to take X and y out of our dataset

In [0]:
# For our first iteration, we will try to predict survival based on four variables:
predictors = [
    'SibSp',
    'Parch',
]

# One last thing (technicality), let's drop all null rows for now

X = df[predictors]

y = df['Survived']


In [0]:
X.head()

Unnamed: 0,SibSp,Parch
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0


In [0]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

### Now to the magic, let's train the model

In [0]:
trained_model = model.fit(X,y)

In [0]:
trained_model # As we can see, nothign seems to have changed

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [0]:
# But if we give it a dataset with some input values for the predictors, it can predict  if they survived or not!
trained_model.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,

In [0]:
# And that is our prediction... uhm, okay?

### How do we know this worked?

We know who survived (we have that in the variable called `y`, remember?) so we can just check how many hits it had

In [0]:
y_predict = trained_model.predict(X)

In [0]:
(y_predict == y).value_counts()

True     596
False    295
Name: Survived, dtype: int64

In [0]:
596/(596+295)

0.6689113355780022

Do you trust this to be a good metric for whether the model is predicting well?

Remember we trained it to return those exact values (y) for that exact input (X), so if the model was big enough it could theoretically match everything perfectly.

See:

In [0]:
model_huge = RandomForestClassifier(n_estimators=10000, max_depth=200)
(model_huge.fit(X, y).predict(X) == y).value_counts()


True     604
False    287
Name: Survived, dtype: int64

If we keep making it bigger it will probably fit the data even better (and also take a handful of hours to run so trust me here)

So a huge model solves all of our problems!? Right!?





**OF COURSE NOT :D**

### So how do we actually know this thing works?

To verify that a model is accurately predicting y from X in the phenomenon that you're trying to explain, you need to use a *train-test split*

You use a subset of the data for training the model, and another one to verify it got it right.

## Let's start again from the data extraction part



This time we'll get it right

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111) 

# A few things happening here in case you're curious:

  # WTF is this comma-separated assignnment?
    # The function will return a tuple with the train/test splits of X and y like
    # (X_train, X_test, y_train, y_test), when we put them separated by commas in
    # this way we are "unpacking the tuple" into separate variables

  # WTF is this random_state thing?
    # This function will randomise the dataset and split it into train and test
    # We want to make this cell produce the same output every time
    # This number is the "seed" for the randomiser, and as it is always the same
    # the order is random but it is always the same
  


#### Now back to training and validating

In [0]:
model = RandomForestClassifier(
    n_estimators=1000, # Hyperparameter 1
    max_depth=2,      # Hyperparameter 2
    random_state=0
)

traied_model = model.fit(X_train, y_train)

test_predictions = trained_model.predict(X_test)

(test_predictions == y_test).value_counts()

True     146
False     77
Name: Survived, dtype: int64

In [0]:
146 / (146+77) # This gives our model an accuracy (on this very simple iteration) of 65%

0.6547085201793722

In [0]:
# Is that good? Let's use the Monkey Predictor Model and see if it performs better

In [0]:
from numpy.random import random_integers

def monkey_model_predict(X):
  return random_integers(0,1,X.shape[0])
(monkey_model_predict(X_test) == y_test).value_counts()

  after removing the cwd from sys.path.


True     116
False    107
Name: Survived, dtype: int64

In [0]:
116/(116+107)

0.5201793721973094

In a classification problem, your target accuracy is to significantly beat random chance.

If a monkey tapping on buttons randomly can better predict your target than your model, you just wasted your time :)

BTW, this happens a lot.

## Let's make this model better!

Now we know how to build and test a model (veeeeery simply). Now let's make it better.

Where should we start?

- Get a more powerful model?
- Get better data?

For that, we need to think like analysts:
- Is the data that I'm giving it likely to explain what I want to predict?
  - Think like a human (job security!): Is the number of siblings or parents likely to predict your survival?
  - The answer is probably no, so let's explore the dataset a bit more to find any good predictors

In [0]:
# Do that here

In [0]:
from sklearn.metrics import accuracy_score

predictors = [
    "Fare",
    "Age"
]

X = df[predictors]

y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    random_state=111
    
)

model = RandomForestClassifier(
    n_estimators=1000, # Hyperparameter 1
    max_depth=2,      # Hyperparameter 2
    random_state=0
)

trained_model = model.fit(X_train, y_train)

test_predictions = trained_model.predict(X_test)

accuracy_score(y_test, test_predictions)

0.6604477611940298

### Why can I not use categorical variables?

The model that we're using (and actually most scikit-learn models) only work with numerical data. But worry not! We can use all variables with a bit of pre-processing ;)

In [0]:
df_nonull.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [0]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

df['is_child'] = df.Age < 16

categorical_predictors = [
    'Sex',
    'is_child',
    'Pclass'
    
]


numeric_predictors = [
    'SibSp',
    'Parch',
    'Age',
    'Fare'
]

categorical_encoder = OneHotEncoder(
    sparse=False
)

X_not_transformed = df[categorical_predictors]

X_cat = categorical_encoder.fit_transform(
    X_not_transformed
) # This is now going to be a numpy array, as scikit-learn will strip out all the indexing magic

X_num = df[numeric_predictors].values

X = np.concatenate([
    X_cat,
    X_num
], axis=1)

y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    random_state=11,
    test_size=0.3
)

RandomForestClassifier(
    n_estimators=1000, # Hyperparameter 1
    max_depth=2,      # Hyperparameter 2
    random_state=1
)\
  .fit(X_train, y_train)\
  .score(X_test, y_test)

0.8246268656716418

In [2]:
! pip install juypterthemes


Collecting juypterthemes


  ERROR: Could not find a version that satisfies the requirement juypterthemes (from versions: none)
ERROR: No matching distribution found for juypterthemes


In [None]:
! jt -t<