# Exercises chapter 3

In [1]:
import sklearn 
import numpy as np
import pandas as pd
import os

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

# Helper function to save the images
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

1. Build a MINST classifier with 97% accuracy on test set 
Hint: use `KNeighborsClassifier` and do grid search over hyperparameters (`weights` and `n_neighbors`)

In [2]:
# Importing the famous MINST dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [4]:
# Pre-processing + splitting into test and train
X, y = mnist["data"], mnist["target"]
y = y.astype(np.uint8) #converting response variable to integer
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [6]:
# Instantiating the KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
my_knn = KNeighborsClassifier()

In [8]:
# Grid Search over hyperparameters (weights and n_neighbors)
from sklearn.model_selection import GridSearchCV

param_grid_knn = {
    'weights': ('uniform', 'distance'),
    'n_neighbors': list(range(1, 10))
}

In [9]:
grid = GridSearchCV(my_knn, param_grid_knn, cv = 5, scoring = 'accuracy')
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'weights': ('uniform', 'distance')},
             scoring='accuracy')

In [10]:
# examine the best model
# Source of the code: https://www.ritchieng.com/machine-learning-efficiently-search-tuning-param/ 
# Single best score achieved across all params (k)
print(grid.best_score_)

# Dictionary containing the parameters (k) used to generate that score
print(grid.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)

0.9716166666666666
{'n_neighbors': 4, 'weights': 'distance'}
KNeighborsClassifier(n_neighbors=4, weights='distance')


In [11]:
# Instantiating and fitting the model with the best parameters
optimal_knn = KNeighborsClassifier(
    n_neighbors=4, 
    weights='distance'
)

optimal_knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [12]:
# Evaluate *accuracy* on test data
y_knn_pred = optimal_knn.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_knn_pred)

0.9714

2. Writing function that shifts images, then using it to augment the dataset and train the model on the augmented data

In [27]:
from scipy.ndimage import shift

#Example:
#shift(image, [2, 1], cval=0) shifts the image 2 pixels down and 1 pixel to the right

# Creating the function
def shift_image(image, direction):    
    if direction == 'left':
        shifting_arr = [0, -1]
    elif direction == 'right':
        shifting_arr = [0, 1]
    elif direction == 'up':
        shifting_arr = [-1, 0]
    elif direction == 'down':
        shifting_arr = [1, 0]
        
    return(shift(image.reshape(28, 28), shifting_arr, cval=0))

# How can I access an image?? Look for an example
# X_train[0]

In [28]:
# Augmenting the dataset
x_up = np.array([shift_image(x, direction='up') for x in X_train])
x_down = np.array([shift_image(x, direction='down') for x in X_train])
x_right = np.array([shift_image(x, direction='right') for x in X_train])
x_left = np.array([shift_image(x, direction='left') for x in X_train])

In [35]:
X_enlarged = np.concatenate(
    (X_train,
    x_up.reshape(60000, 784),
    x_down.reshape(60000, 784),
    x_right.reshape(60000, 784),
    x_left.reshape(60000, 784))
)

In [38]:
# Enlarge y too!
# np.tile(np.array([1,2]), 2)
y_enlarged = np.tile(y_train, 5)

In [40]:
# Now train the model!
from sklearn.neighbors import KNeighborsClassifier
optimal_knn2 = KNeighborsClassifier(
    n_neighbors=4, 
    weights='distance'
)

optimal_knn2.fit(X_enlarged, y_enlarged)

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [41]:
# Evaluate *accuracy* on test data
y_knn2_pred = optimal_knn2.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_knn2_pred)

0.9763

The accuracy improved, but just a little bit. BTW, it's literally the same metric value that appears on the solutions, so my solution is fine.

3. Tackle the Titatnic dataset. The goal is to predict whether or not a passenger survived based on attributes such as their age, sex, passenger class, where they embarked and so on.

In [2]:
# loading the data
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading", filename)
            urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data()    

In [3]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [4]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [5]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:
* **PassengerId**: a unique identifier for each passenger
* **Survived**: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* **Pclass**: passenger class.
* **Name**, **Sex**, **Age**: self-explanatory
* **SibSp**: how many siblings & spouses of the passenger aboard the Titanic.
* **Parch**: how many children & parents of the passenger aboard the Titanic.
* **Ticket**: ticket id
* **Fare**: price paid (in pounds)
* **Cabin**: passenger's cabin number
* **Embarked**: where the passenger embarked the Titanic

Let's explicitly set the `PassengerId` column as the index column:

In [6]:
train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


It's better to ignore Name and Ticket because it's difficult to convert them into useful information. 
It's necessary to implemennt an imputation strategy for Age missing values. Idea: input the median according to Sex and Pclass.

In [None]:
# TODO: 1) INPUT MISSING VALUES, 2) SCALE, 3) ONE-HOT-ENCODING

In [19]:
# Missing value imputation according to group mean 
# Here I should use the fit_transform thing
'''
imputer = SimpleImputer(
    missing_values=np.nan,
    add_indicator=True,
    strategy="median"
)
'''
# But I have to code a custom Imputer
# https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

class GroupImputer(BaseEstimator, TransformerMixin):
    '''
    Class used for imputing missing values in a pd.DataFrame using either mean or median of a group.
    
    Parameters
    ----------    
    group_cols : list
        List of columns used for calculating the aggregated value 
    target : str
        The name of the column to impute
    metric : str
        The metric to be used for remplacement, can be one of ['mean', 'median']

    Returns
    -------
    X : array-like
        The array with imputed values in the target column
    '''
    def __init__(self, group_cols, target, metric='mean'):
        
        assert metric in ['mean', 'median'], 'Unrecognized value for metric, should be mean/median'
        assert type(group_cols) == list, 'group_cols should be a list of columns'
        assert type(target) == str, 'target should be a string'
        
        self.group_cols = group_cols
        self.target = target
        self.metric = metric
    
    def fit(self, X, y=None):
        
        assert pd.isnull(X[self.group_cols]).any(axis=None) == False, 'There are missing values in group_cols'
        
        impute_map = X.groupby(self.group_cols)[self.target].agg(self.metric) \
                                                            .reset_index(drop=False)
        
        self.impute_map_ = impute_map
        
        return self 
    
    def transform(self, X, y=None):
        
        # make sure that the imputer was fitted
        check_is_fitted(self, 'impute_map_')
        
        X = X.copy()
        
        for index, row in self.impute_map_.iterrows():
            ind = (X[self.group_cols] == row[self.group_cols]).all(axis=1)
            X.loc[ind, self.target] = X.loc[ind, self.target].fillna(row[self.target])
        
        return X.values
    
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]


In [31]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attribs = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
num_indexes = train_data.columns.get_indexer(num_attribs)
cat_attribs = ['Sex', 'Embarked', 'Pclass']
cat_indexes = train_data.columns.get_indexer(cat_attribs)

num_pipeline = Pipeline([
    ('imputer', GroupImputer(group_cols = ['Sex', 'Pclass'], target = "Age", metric="median")),
    ("selector", ColumnTransformer([
        ("selector", "passthrough", num_indexes)
    ], remainder="drop")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([("selector", ColumnTransformer([
        ("selector", "passthrough", cat_indexes)
    ], remainder="drop")),
                        ('oneHot',OneHotEncoder(categories='auto'))])

In [32]:
# Combine pipeline for numeric and pipeline for categorical
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)])


fp_result= full_pipeline.fit_transform(train_data)

In [33]:
fp_result

<891x14 sparse matrix of type '<class 'numpy.float64'>'
	with 7128 stored elements in Compressed Sparse Row format>

In [30]:
train_data.columns.get_indexer(num_attribs)

array([0, 4, 5, 6, 8])