# Exercises chapter 3

In [1]:
import sklearn 
import numpy as np
import pandas as pd
import os

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

# Helper function to save the images
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to ``IMAGES_PATH``.

    The output file is named ``<fig_id>.<fig_extension>`` and rendered at
    ``resolution`` dpi. When ``tight_layout`` is truthy, ``plt.tight_layout()``
    is applied before saving.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

1. Build an MNIST classifier with 97% accuracy on the test set 
Hint: use `KNeighborsClassifier` and do a grid search over the hyperparameters (`weights` and `n_neighbors`)

In [2]:
# Importing the famous MNIST dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [4]:
# Pre-processing + splitting into test and train
X, y = mnist["data"], mnist["target"]
y = y.astype(np.uint8) #converting response variable to integer
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [6]:
# Instantiating the KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
my_knn = KNeighborsClassifier()

In [8]:
# Grid Search over hyperparameters (weights and n_neighbors)
from sklearn.model_selection import GridSearchCV

param_grid_knn = {
    'weights': ('uniform', 'distance'),
    'n_neighbors': list(range(1, 10))
}

In [9]:
grid = GridSearchCV(my_knn, param_grid_knn, cv = 5, scoring = 'accuracy')
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'weights': ('uniform', 'distance')},
             scoring='accuracy')

In [10]:
# examine the best model
# Source of the code: https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/ 
# Single best score achieved across all params (k)
print(grid.best_score_)

# Dictionary containing the parameters (k) used to generate that score
print(grid.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)

0.9716166666666666
{'n_neighbors': 4, 'weights': 'distance'}
KNeighborsClassifier(n_neighbors=4, weights='distance')


In [11]:
# Instantiating and fitting the model with the best parameters
optimal_knn = KNeighborsClassifier(
    n_neighbors=4, 
    weights='distance'
)

optimal_knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [12]:
# Evaluate *accuracy* on test data
y_knn_pred = optimal_knn.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_knn_pred)

0.9714

2. Writing function that shifts images, then using it to augment the dataset and train the model on the augmented data

In [27]:
from scipy.ndimage import shift

#Example:
#shift(image, [2, 1], cval=0) shifts the image 2 pixels down and 1 pixel to the right

# Creating the function
def shift_image(image, direction):
    """Shift a flattened 28x28 image by one pixel in the given direction.

    Parameters
    ----------
    image : numpy.ndarray
        Array with 784 elements; it is reshaped to (28, 28) before shifting.
    direction : {'left', 'right', 'up', 'down'}
        Direction in which the image content is moved.

    Returns
    -------
    numpy.ndarray
        The shifted image with shape (28, 28). Pixels shifted in from outside
        the image are filled with 0 (``cval=0``).

    Raises
    ------
    ValueError
        If ``direction`` is not one of the four supported values.
    """
    # Offsets are [rows, columns]: positive rows move down, positive columns
    # move right.
    offsets = {
        'left': [0, -1],
        'right': [0, 1],
        'up': [-1, 0],
        'down': [1, 0],
    }
    if direction not in offsets:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(
            "direction must be one of 'left', 'right', 'up', 'down'; got %r"
            % (direction,)
        )

    return shift(image.reshape(28, 28), offsets[direction], cval=0)

# How can I access an image?? Look for an example
# X_train[0]

In [28]:
# Augmenting the dataset
x_up = np.array([shift_image(x, direction='up') for x in X_train])
x_down = np.array([shift_image(x, direction='down') for x in X_train])
x_right = np.array([shift_image(x, direction='right') for x in X_train])
x_left = np.array([shift_image(x, direction='left') for x in X_train])

In [35]:
# Stack the original training images with their four shifted copies.
# Each shifted set is flattened back to one row per image; the row count is
# read from the array itself instead of being hard-coded, so this cell keeps
# working if the training set is subsampled.
X_enlarged = np.concatenate(
    [X_train]
    + [shifted.reshape(len(shifted), -1)
       for shifted in (x_up, x_down, x_right, x_left)]
)

In [38]:
# Enlarge y too!
# np.tile(np.array([1,2]), 2)
y_enlarged = np.tile(y_train, 5)

In [40]:
# Now train the model!
from sklearn.neighbors import KNeighborsClassifier
optimal_knn2 = KNeighborsClassifier(
    n_neighbors=4, 
    weights='distance'
)

optimal_knn2.fit(X_enlarged, y_enlarged)

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [41]:
# Evaluate *accuracy* on test data
y_knn2_pred = optimal_knn2.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_knn2_pred)

0.9763

The accuracy improved, but just a little bit. BTW, it's literally the same metric value that appears on the solutions, so my solution is fine.

3. Tackle the Titanic dataset. The goal is to predict whether or not a passenger survived based on attributes such as their age, sex, passenger class, where they embarked and so on.

In [58]:
# loading the data
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    """Download the Titanic ``train.csv`` and ``test.csv`` files if missing.

    Parameters
    ----------
    url : str
        Base URL; each file name is appended to it to build the download URL.
    path : str
        Local directory that receives the files. It is created when absent.

    Files that already exist in ``path`` are not downloaded again.
    """
    os.makedirs(path, exist_ok=True)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if os.path.isfile(filepath):
            continue
        print("Downloading", filename)
        # Download under a temporary name and rename once complete, so an
        # interrupted transfer never leaves a truncated CSV that a later run
        # would mistake for a finished download.
        partial_path = filepath + ".part"
        urllib.request.urlretrieve(url + filename, partial_path)
        os.replace(partial_path, filepath)

fetch_titanic_data()    

In [59]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read the CSV file ``filename`` located in ``titanic_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(titanic_path, filename))

In [60]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [61]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:
* **PassengerId**: a unique identifier for each passenger
* **Survived**: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* **Pclass**: passenger class.
* **Name**, **Sex**, **Age**: self-explanatory
* **SibSp**: how many siblings & spouses of the passenger aboard the Titanic.
* **Parch**: how many children & parents of the passenger aboard the Titanic.
* **Ticket**: ticket id
* **Fare**: price paid (in pounds)
* **Cabin**: passenger's cabin number
* **Embarked**: where the passenger embarked the Titanic

Let's explicitly set the `PassengerId` column as the index column:

In [62]:
train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")

In [63]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


It's better to ignore Name and Ticket because it's difficult to convert them into useful information. 
It's necessary to implement an imputation strategy for the missing Age values. Idea: impute the median according to Sex and Pclass.

In [64]:
X_train = train_data.drop("Survived" ,axis= 1)
y_train = train_data.Survived

In [65]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


In [66]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Column names handled by each sub-pipeline, plus their positional indexes in
# X_train (the selectors below are given positions, not names).
num_attribs = ['Age', 'SibSp', 'Parch', 'Fare']
num_indexes = X_train.columns.get_indexer(num_attribs)
cat_attribs = ['Sex', 'Embarked', 'Pclass']
cat_indexes = X_train.columns.get_indexer(cat_attribs)

# Numeric branch: keep only the numeric columns, fill missing values, then
# standardize.
num_pipeline = Pipeline([
    # Disabled idea: group-wise median imputation by Sex and Pclass.
    # GroupImputer is not defined or imported in the visible notebook.
    #('imputer', GroupImputer(group_cols = ['Sex', 'Pclass'], target = "Age", metric="median")),
    #('imputer2', GroupImputer(group_cols = ['Sex', 'Pclass'], target = "Fare", metric="median")),
    ("selector", ColumnTransformer([
        ("selector", "passthrough", num_indexes)
    ], remainder="drop")),
    # NOTE(review): SimpleImputer() is used with its defaults here (column
    # mean per the scikit-learn docs), not the group median described in the
    # notes above - confirm this is intended.
    ('imputer', SimpleImputer()),
    ('std_scaler', StandardScaler())
])

# Categorical branch: keep only the categorical columns and one-hot encode
# them. No imputation step is applied in this branch.
cat_pipeline = Pipeline([("selector", ColumnTransformer([
        ("selector", "passthrough", cat_indexes)
    ], remainder="drop")),
                        ('oneHot',OneHotEncoder(categories='auto'))])

In [67]:
# Combine pipeline for numeric and pipeline for categorical
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)])

X_train2 = full_pipeline.fit_transform(X_train)

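At this point, I have successfully:
- Imputed missing numeric values with `SimpleImputer` using its default strategy (the group-median imputer is still commented out in the pipeline)
- Scaled numeric features
- One-hot-encoded categorical variables

Next: do `RandomizedSearchCV` with Random Forest model :)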

In [68]:
from sklearn.ensemble import RandomForestClassifier

forest_clas = RandomForestClassifier(random_state=42)

Looking at default parameters

In [69]:
print(forest_clas.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [70]:
from sklearn.model_selection import RandomizedSearchCV
# Tutorial I followed here: 
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 15)]

# Number of features to consider at every split
# NOTE(review): per the scikit-learn docs, 'auto' was an alias of 'sqrt' for
# RandomForestClassifier (deprecated in 1.1, removed in 1.3), so it duplicates
# a candidate on old versions and fails on new ones - confirm against the
# installed version before re-running.
max_features = ['auto', 'sqrt', 'log2']

# Maximum number of levels in tree (None is appended as an extra candidate)
max_depth = [int(x) for x in np.linspace(6, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid: one list of candidate values per hyperparameter
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [71]:
# Use the random grid to search for best hyperparameters

# Random search of parameters, using 5-fold cross validation (cv = 5),
# search across 100 different combinations (n_iter = 100), and use all
# available cores (n_jobs = -1)
rf_random = RandomizedSearchCV(estimator = forest_clas,
                               param_distributions = random_grid,
                               n_iter = 100,
                               cv = 5,
                               verbose=1,
                               random_state=42,
                               n_jobs = -1)

# Fit the random search model on the preprocessed training features
rf_random.fit(X_train2, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [6, 16, 26, 37, 47, 58, 68,
                                                      78, 89, 99, 110, None],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 235, 371, 507,
                                                         642, 778, 914, 1050,
                                                         1185, 1321, 1457, 1592,
                                                         1728, 1864, 2000]},
                   random_state=42, verbose=1)

In [72]:
rf_random.best_params_

{'n_estimators': 778,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 58,
 'bootstrap': True}

Now I can narrow down the hyperparameter search around the parameters found with `RandomizedSearchCV`

In [73]:
rf_random.best_score_

0.8339150084740444

In [74]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [56, 57, 58, 59, 60],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 4, 5, 6, 7],
    'n_estimators': [760, 770, 780, 790, 800]
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = forest_clas, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 1)

grid_search.fit(X_train2, y_train)

Fitting 5 folds for each of 375 candidates, totalling 1875 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [56, 57, 58, 59, 60],
                         'max_features': ['auto'],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [3, 4, 5, 6, 7],
                         'n_estimators': [760, 770, 780, 790, 800]},
             verbose=1)

In [75]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 56,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 770}

In [76]:
grid_search.best_score_

0.8339150084740444

In [77]:
# Now I should train the model on the whole training dataset
final_rf = RandomForestClassifier(random_state=42)
final_rf.set_params(**grid_search.best_params_)

final_rf.fit(X_train2, y_train)

RandomForestClassifier(max_depth=56, min_samples_split=5, n_estimators=770,
                       random_state=42)

In [78]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [79]:
X_test = full_pipeline.transform(test_data)
y_pred = final_rf.predict(X_test)

In [80]:
y_pred[:5]

array([0, 0, 0, 1, 1])

In [81]:
# Extract PassengerId
passengerid = test_data.reset_index().PassengerId

In [82]:
type(passengerid[:5])

pandas.core.series.Series

In [83]:
data = {'Survived':y_pred}
 
# Creates pandas DataFrame.
df = pd.DataFrame(data, index = passengerid)

In [84]:
# Create the target folder first: to_csv does not create missing directories,
# so this cell would fail on a fresh checkout without it.
os.makedirs('outputs', exist_ok=True)
df.to_csv('outputs/titanic_pred.csv')

How could I improve this?

- Use SVM instead
- Better imputation of NAs
- Better hyperparameter search
- Turn elements of the data pre-processing into hyperparameters
- Create ranges or categorical values with some of the numerical variables
- Try to extract some useful information from the names? e.g. common names?
- Use first letter of Cabin as a categorical variable