In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Introduction to scikit-learn(sklearn)

This notebook demonstrates some of the most useful functions of the beautiful scikit-learn library.

What we are going to cover:

0. An end to end scikit-learn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems.
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluate a model
5. Improve a model
6. Save and load a trained model
7. Putting it all together

## 0. An end to end scikit-learn workflow

In [2]:
# 1. Getting the data ready
heart_disease = pd.read_csv('data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Labels: the column we want to predict
y = heart_disease['target']

# Features matrix: every column except the label
x = heart_disease.drop(columns='target')

In [4]:
# import warnings
# warnings.filterwarnings('default') # or use ignore to ignore warnings

In [5]:
# 2. choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We'll keep the default hyperparameters
# clf.get_params()

In [6]:
# Split the data into training and test sets (the model itself is fitted in a later cell)
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows as the test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
import sklearn
sklearn.show_versions()


System:
    python: 3.8.18 (default, Sep 11 2023, 13:40:15)  [GCC 11.2.0]
executable: /home/kiprotich/Desktop/ML-course/heart-disease-project/env/bin/python
   machine: Linux-6.2.0-36-generic-x86_64-with-glibc2.17

Python dependencies:
      sklearn: 1.3.0
          pip: 23.3
   setuptools: 68.0.0
        numpy: 1.24.3
        scipy: 1.10.1
       Cython: None
       pandas: 2.0.3
   matplotlib: 3.7.2
       joblib: 1.2.0
threadpoolctl: 2.2.0

Built with OpenMP: True

threadpoolctl info:
       filepath: /home/kiprotich/Desktop/ML-course/heart-disease-project/env/lib/libmkl_rt.so.2
         prefix: libmkl_rt
       user_api: blas
   internal_api: mkl
        version: 2023.1-Product
    num_threads: 2
threading_layer: intel

       filepath: /home/kiprotich/Desktop/ML-course/heart-disease-project/env/lib/libiomp5.so
         prefix: libiomp
       user_api: openmp
   internal_api: openmp
        version: None
    num_threads: 4

       filepath: /home/kiprotich/Desktop/ML-course/heart-

In [8]:
clf.fit(x_train, y_train);

In [9]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
90,48,1,2,124,255,1,1,175,0,0.0,2,2,2
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2
286,59,1,3,134,204,0,1,162,0,0.8,2,2,2
178,43,1,0,120,177,0,0,120,1,2.5,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,41,1,1,110,235,0,1,153,0,0.0,2,0,2
91,57,1,0,132,207,0,1,168,1,0.0,2,0,3
280,42,1,0,136,315,0,1,125,1,1.8,1,0,1
174,60,1,0,130,206,0,0,132,1,2.4,1,2,3


In [10]:
# Make a prediction
# y_label = clf.predict(np.array([0, 2, 3, 4]))

In [11]:
y_preds = clf.predict(x_test)
y_preds

array([1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0])

In [12]:
y_test

1      1
14     1
245    0
131    1
198    0
      ..
164    1
291    0
30     1
159    1
184    0
Name: target, Length: 61, dtype: int64

In [13]:
# 4. Evaluate the model on the train data and  test data
clf.score(x_train, y_train)

1.0

In [14]:
clf.score(x_test, y_test)

0.8032786885245902

In [15]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.75      0.68      0.71        22
           1       0.83      0.87      0.85        39

    accuracy                           0.80        61
   macro avg       0.79      0.78      0.78        61
weighted avg       0.80      0.80      0.80        61



In [16]:
confusion_matrix(y_test, y_preds)

array([[15,  7],
       [ 5, 34]])

In [17]:
accuracy_score(y_test, y_preds)

0.8032786885245902

In [18]:
# 5. Improve a model
# Compare test-set accuracy for forests of increasing size
np.random.seed(42)
for n_trees in range(10, 100, 10):
    print(f'Trying model with {n_trees} estimators...')
    clf = RandomForestClassifier(n_estimators=n_trees)
    clf.fit(x_train, y_train)
    test_accuracy = clf.score(x_test, y_test)
    print(f'Model accuracy on test set: {test_accuracy * 100:.2f}%')
    print('')

Trying model with 10 estimators...
Model accuracy on test set: 80.33%

Trying model with 20 estimators...
Model accuracy on test set: 80.33%

Trying model with 30 estimators...
Model accuracy on test set: 78.69%

Trying model with 40 estimators...
Model accuracy on test set: 81.97%

Trying model with 50 estimators...
Model accuracy on test set: 80.33%

Trying model with 60 estimators...
Model accuracy on test set: 85.25%

Trying model with 70 estimators...
Model accuracy on test set: 77.05%

Trying model with 80 estimators...
Model accuracy on test set: 80.33%

Trying model with 90 estimators...
Model accuracy on test set: 80.33%



In [19]:
# Save a model and load it
import pickle

# Use a context manager so the file is flushed and closed as soon as the
# dump finishes, even if pickling raises.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [20]:
# Load the saved model back; the context manager closes the file after reading.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files you trust.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(x_test, y_test)

0.8032786885245902

## 1. Getting our data ready to be used with machine learning

Three main things we have to do:

    1. Split the data into features and labels (usually `x` & `y`)
    2. Filling (also called imputing) or disregarding missing values
    3. converting non-numerical values to numerical values (also called feature encoding)

In [21]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [22]:
x = heart_disease.drop('target', axis=1)
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [23]:
y = heart_disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [24]:
# split the data into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2)

In [25]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

## Make sure it's all numerical

In [26]:
car_sales = pd.read_csv('data/car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [27]:
len(car_sales)

1000

In [28]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [29]:
# Separate the label ('Price') from the features
y = car_sales['Price']
x = car_sales.drop(columns='Price')

# Hold out 20% of the rows as the test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [30]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# x still contains text columns (e.g. 'Make', 'Colour'), so fitting fails with a
# ValueError. Catch and display the error instead of letting it stop the
# notebook, so that "Restart & Run All" can continue past this demonstration.
try:
    model.fit(x_train, y_train)
    model.score(x_test, y_test)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: could not convert string to float: 'Toyota'

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()

# One encoding step; remainder='passthrough' keeps the columns that are not listed
encoding_steps = [('one_hot', one_hot, categorical_features)]
transformer = ColumnTransformer(encoding_steps, remainder='passthrough')

transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
pd.DataFrame(transformed_x)

In [None]:
dummies = pd.get_dummies(car_sales[['Make', 'Colour', 'Doors']], dtype=int)
dummies

In [None]:
# Let's fit the model
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x,
                                                   y,
                                                   test_size=0.2)

model.fit(x_train, y_train)

In [None]:
model.score(x_test, y_test)

In [None]:
print(sklearn.__version__)

###  1.2 What if there were missing values

1. Fill them with some values(also known as imputation)
2. Remove the sample with missing data altogether

In [None]:
car_sales_missing = pd.read_csv('data/car-sales-extended-missing-data.csv')
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Create x & y
x = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [None]:
# Let's try to convert our data to numbers
# Turn the categories into numbers
# NOTE: x was created from car_sales_missing before any filling, so it may
# still contain missing values when it reaches the transformer.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
# remainder='passthrough' keeps the columns that are not listed above
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder='passthrough')
transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
car_sales_missing

### option 1: Fill missing data with pandas

In [None]:
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# a column selection: the in-place form mutates an intermediate object, which
# pandas warns about and which stops updating the DataFrame under copy-on-write.

# Fill the 'Make' column
car_sales_missing['Make'] = car_sales_missing['Make'].fillna('missing')

# Fill the 'Colour' column
car_sales_missing['Colour'] = car_sales_missing['Colour'].fillna('missing')

# Fill the 'Odometer' column with the column mean
odometer_mean = car_sales_missing['Odometer (KM)'].mean()
car_sales_missing['Odometer (KM)'] = car_sales_missing['Odometer (KM)'].fillna(odometer_mean)

# Fill the 'Doors' column
car_sales_missing['Doors'] = car_sales_missing['Doors'].fillna(4)

In [None]:
# check the DataFrame again
car_sales_missing.isna().sum()

In [None]:
# Remove rows with missing Price values
# subset=['Price'] restricts the drop to rows whose label is missing, matching
# the intent above; assigning the result avoids mutating the frame in place.
car_sales_missing = car_sales_missing.dropna(subset=['Price'])

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Create x & y
x = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [None]:
# Let's try to convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
# remainder='passthrough' keeps the columns that are not listed above
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder='passthrough')
# Transform the features only (x), not the whole DataFrame: car_sales_missing
# still contains the 'Price' label, which would otherwise be passed through
# into the feature matrix.
transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
pd.DataFrame(transformed_x)

## Option 2: Fill our data with scikit-learn

In [None]:
car_sales_missing = pd.read_csv('data/car-sales-extended-missing-data.csv')
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Drop rows with empty 'Price'
car_sales_missing.dropna(subset=['Price'], inplace=True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
# split into x and y
x = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [None]:
# Fill missing data with scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Column groups, one per filling strategy
cat_features = ['Make', 'Colour']
door_feature = ['Doors']
num_features = ['Odometer (KM)']

# Categorical columns get the constant 'missing', 'Doors' gets the constant 4
# and the numerical column gets its mean
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy='mean')

# Bundle the imputers so that each one is applied to its own columns
imputation_steps = [
    ('cat_imputer', cat_imputer, cat_features),
    ('door_imputer', door_imputer, door_feature),
    ('num_imputer', num_imputer, num_features),
]
imputer = ColumnTransformer(imputation_steps)

# Fit the imputers on x and fill its missing values
filled_x = imputer.fit_transform(x)
filled_x

In [None]:
car_sales_filled = pd.DataFrame(filled_x, columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])
car_sales_filled.isna().sum()

In [None]:
# Let's try to convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
# remainder='passthrough' keeps the columns that are not listed above
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder='passthrough')
# car_sales_filled is the imputed DataFrame built from filled_x
# NOTE(review): filled_x mixes strings and numbers, so the passed-through
# column may be object dtype here -- confirm before relying on its dtype.
transformed_x = transformer.fit_transform(car_sales_filled)
transformed_x

In [None]:
# The data is now numerical with no missing values, so a model can be fitted
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Hold out 20% of the rows as the test set
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

# Fit on the training split, then evaluate on the held-out split
model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

## 2. Choosing the right estimator/algorithm for your problem

Some things to note:

    * Sklearn refers to machine learning models and algorithms as estimators
    * Classification problem - predicting a category (heart disease or not)
        * Sometimes you'll see `clf` (short for classifier) used as the name of a classification estimator
    * Regression problem - predicting a number (selling price of a car)

### 2.1 Picking machine learning model for a regression problem
Let's use the California Housing dataset.

In [None]:
# Get Carlifonia Housing dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

In [None]:
housing_df = pd.DataFrame(housing['data'], columns=housing['feature_names'])
housing_df

In [None]:
housing_df['target'] = housing['target']
housing_df.head()

In [None]:
# housing_df = housing_df.drop('MedHOuseVal', axis=1)
# housing_df.head()

In [None]:
# Import algorithm
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# Create the data: labels first, then every other column as features
y = housing_df['target']
x = housing_df.drop(columns='target')

# Split into train and test sets (20% held out)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Instantiate the model and fit it on the training set
model = Ridge()
model.fit(x_train, y_train)

# Check the score of the model on the test set
model.score(x_test, y_test)

What if `Ridge` didn't work or the score didn't fit our needs?

We could always try a different model.

How about we try an ensemble model? (An ensemble is a combination of smaller models that together try to make better predictions than a single model.)

Sklearn's ensemble models can  be found here: https://scikit-learn.org/stable/modules/ensemble.html

In [None]:
# Import the RandomForestRegressor model class from the ensemble module
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(42)

# Create the data: x holds every column except 'target', y holds 'target'
x = housing_df.drop('target', axis=1)
y = housing_df['target']
# Split into train and test sets (test_size=0.2 holds out 20% of the rows)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create a Random Forest model and fit it on the training set
model = RandomForestRegressor()
model.fit(x_train, y_train)

# Check the score of the model (on the test set)
model.score(x_test, y_test)

## 2.2 Picking a machine learning model for Classification

let's go to the map: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [None]:
heart_disease = pd.read_csv('data/heart-disease.csv')
heart_disease.head()

Consulting the map and it says to try `LinearSVC`

In [None]:
# Import the LinearSVC
from sklearn.svm import LinearSVC

# Setup random seed
np.random.seed(42)

# Create the data: x holds every column except 'target', y holds 'target'
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']
# Split into train and test sets (test_size=0.2 holds out 20% of the rows)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Instantiate LinearSVC and fit it on the training set
clf = LinearSVC()
clf.fit(x_train, y_train)

# Check the score of the model (on the test set)
clf.score(x_test, y_test)

In [None]:
# Import the RandomForestClassifier model class from the ensemble module
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

#creating the data
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create a Random Forest model
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

# check the score of the model (on the test set)
clf.score(x_test, y_test)

In [None]:
# 3. Fit the model/algorithm on our data and use it to make predictions
# Import the RandomForestClassifier model class from the ensemble module
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

#creating the data
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create Random Forest Classifier
clf = RandomForestClassifier()
# Fit the model to the data (Training machine learning model)
clf.fit(x_train, y_train)

# check the score of the model (on the test set) use the pattern machine learning model has learned
clf.score(x_test, y_test)

## 3.2 Make Predictions using machine learning model

# Use a trained model to make prediction

2 ways to make prediction

1. `predict()`
2. `predict_proba()`

In [None]:
# Use a trained model to make predictions
# This doesn't work: the input is a 1-D array that does not look like the data
# the model was trained on. Catch and display the error instead of letting it
# stop the notebook, so that "Restart & Run All" can continue.
try:
    clf.predict(np.array([1, 7, 8, 3, 4]))
except ValueError as err:
    print(f'ValueError: {err}')

In [None]:
clf.predict(x_test)

In [None]:
np.array([y_test])

In [None]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(x_test)
np.mean(y_preds == y_test)

In [None]:
clf.score(x_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

Make predictions with predict_proba()

In [None]:
# predict_proba returns probabilities of a classification label
clf.predict_proba(x_test[:5])

In [None]:
# let's predict() on the same data
clf.predict(x_test[:5])

In [None]:
# Import the RandomForestRegressor model class from the ensemble module
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(42)

#creating the data
x = housing_df.drop('target', axis=1)
y = housing_df['target']
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create a Random Forest model
model = RandomForestRegressor()
# Fit the model to the data
model.fit(x_train, y_train)

# Make a prediction
y_preds = model.predict(x_test)

In [None]:
y_preds[:10]

In [None]:
np.array(y_test[:10])

In [None]:
len(y_preds)

In [None]:
# Compare predictions to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

In [None]:
housing_df['target'] 

# 4. Evaluating machine learning model

Three ways to evaluate Scikit-learn models/estimators
    
    1. Estimator's built-in `score()` method
    2. The `scoring` parameter
    3. Problem-specific metric functions.

## 4.1 Evaluating a model with the `score` method

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Create x and y, then hold out 20% of the rows as the test set
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create classifier model instance
clf = RandomForestClassifier()

# Fit classifier to training data
clf.fit(x_train, y_train)

# Evaluate the fitted classifier on the test data with its built-in score() method
clf.score(x_test, y_test)

In [None]:
clf.score(x_train, y_train)

### Let's use the `score()` on our regression problem...

In [None]:
# Import the RandomForestRegressor model class from the ensemble module
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(42)

# Create the data: x holds every column except 'target', y holds 'target'
x = housing_df.drop('target', axis=1)
y = housing_df['target']
# Split into train and test sets (test_size=0.2 holds out 20% of the rows)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create a Random Forest model
model = RandomForestRegressor()

# Fit the model to training data
model.fit(x_train, y_train)

# Evaluate the fitted model on the test data with its built-in score() method
model.score(x_test, y_test)

In [None]:
model.score(x_train, y_train)

In [None]:
housing_df.head()

## 4.2 Evaluating a model using `scoring` parameter

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Create x and y, then hold out 20% of the rows as the test set
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create classifier model instance
clf = RandomForestClassifier()

# Fit classifier to training data
clf.fit(x_train, y_train)

# Evaluate the fitted classifier on the single test split with score()
clf.score(x_test, y_test)

In [None]:
cross_val_score(clf, x, y, cv=10)

In [None]:
np.random.seed(41)

# Single training and test split score
clf_single_score = clf.score(x_test, y_test)

# Take the mean of 5-fold cross-validation process
clf_cross_val_score = np.mean(cross_val_score(clf, x, y, cv=5))

#compare the two
clf_single_score, clf_cross_val_score

In [None]:
# Scoring parameter set to None by default
cross_val_score(clf, x, y, cv=5, scoring=None)

### 4.2.1 Classification model evaluation metrics

1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report

**Accuracy**

In [None]:
heart_disease.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

clf = RandomForestClassifier()
cross_val_score = cross_val_score(clf, x, y, cv=5)

In [None]:
np.mean(cross_val_score)

In [None]:
print(f'Heart Disease Classifier Accuracy: {np.mean(cross_val_score) * 100:.2f}%')

**Area under Receiver Operating Characteristic (AUC/ROC)**

* Area Under Curve (AUC)
* ROC curve

ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr)

* True positive = model predicts 1 when truth is 1
* False positive = model predicts 1 when truth is 0
* True negative = model predicts 0 when truth is 0
* False negative = model predicts 0 when truth is 1

In [None]:
# Create x_test... etc
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
from sklearn.metrics import roc_curve

# fit the classifier
clf.fit(x_train, y_train)

# Make predictions with probabilities
y_probs = clf.predict_proba(x_test)

y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[:, 1]
y_probs_positive[:10]

In [None]:
# calculate fpr, tpr and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

# check the false positives
fpr

In [None]:
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve together with a dashed diagonal baseline.

    fpr and tpr are passed straight to plt.plot as the x and y values of the
    curve: the false positive rate and true positive rate of a model.
    """
    # The model's ROC curve
    plt.plot(fpr, tpr, label="ROC", color='orange')

    # Diagonal from (0, 0) to (1, 1): a model with no predictive power
    baseline = [0, 1]
    plt.plot(baseline, baseline, linestyle='--', color='darkblue', label='Guessing')

    # Title, axis labels and legend
    plt.title('Receiver Operating Characteristic (ROC) curve')
    plt.xlabel('False positive rate (fpr)')
    plt.ylabel('True positive rate (tpr)')
    plt.legend()
    plt.show()
    
plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

In [None]:
# Plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test,y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# Perfect AUC score
roc_auc_score(y_test, y_test)

**Confusion matrix**

The next way to evaluate a classification model is by using a confusion matrix

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict.
In essence, it gives you an idea of where the model is getting confused.

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(x_test)

confusion_matrix(y_test, y_preds)

In [None]:
# Visualize confusion matrix with pd.crosstab()
pd.crosstab(y_test,
           y_preds,
           rownames=['Actual Labels'],
            colnames=['Predicted Labels'])

In [None]:
# How to install a conda package into the current environment from a Jupyter Notebook
# {sys.prefix} is interpolated into the shell command, so the package is installed
# into the environment of the Python interpreter running this kernel.
# NOTE(review): this cell modifies the environment every time the notebook runs;
# consider installing seaborn once outside the notebook instead.
import sys
!conda install --yes --prefix {sys.prefix} seaborn

In [None]:
# Make our confusion matrix more visual with Seaborn's heatmap()
import seaborn as sns

# Set the font scale
sns.set(font_scale=1.5)

# Create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

# Plot it using Seaborn
sns.heatmap(conf_mat);

### Creating a confusion matrix using Scikit-learn

In [None]:
clf

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Evaluate on the held-out test split only. Passing the full x and y would
# include the rows clf was trained on and make the matrix look better than the
# model really is.
ConfusionMatrixDisplay.from_estimator(estimator=clf, X=x_test, y=y_test)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_preds)

## Classification Report

In [None]:
from  sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # only one positive case

disease_preds = np.zeros(10000) # model predicts every case as 0

# The model never predicts class 1, so precision for that class is 0/0.
# zero_division=0 reports it as 0 explicitly instead of emitting an
# UndefinedMetricWarning.
pd.DataFrame(classification_report(disease_true,
                                   disease_preds,
                                   output_dict=True,
                                   zero_division=0))

To summarize classification metrics:

* **Accuracy** is a good measure to start with if all classes are balanced (e.g. same amount of samples which are labelled with 0 or 1)
* **Precision** and **recall** becomes more important when classes are imbalanced.
* If false positive predictions are worse than false negatives, aim for higher precision.
* If false negative predictions are worse than false positives, aim for higher recall.
* **F1-score** is a combination of precision and recall.

### 4.2.2 Regression model evaluation metrics

Model evaluation metrics documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

The one we are going to cover are:
1. R^2 
2. Mean absolute error(MAE)
3. Mean square error(MSE)

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

x = housing_df.drop('target', axis=1)
y = housing_df['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(x_train, y_train)

model.score(x_test, y_test)

In [None]:
from sklearn.metrics import r2_score

# Fill an array with y_test mean
y_test_mean = np.full(len(y_test), y_test.mean())

In [None]:
y_test_mean[:10]

In [None]:
r2_score(y_true=y_test,
        y_pred=y_test_mean)

**Mean absolute error (MAE)**

MAE is the average of the absolute differences between predictions and actual values.
It gives you an idea of how wrong the model's predictions are.

In [None]:
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(x_test)
mae = mean_absolute_error(y_test, y_preds)
mae

In [None]:
df = pd.DataFrame(data={'actual values': y_test,
                       'predicted values': y_preds})
df['differences'] = df['predicted values'] - df['actual values']
df.head(10)

In [None]:
# MAE using formulas  and differences
abs(df['differences']).mean()

**Mean squared error (MSE)**

MSE is the mean of the squared errors between actual and predicted values.

In [None]:
#MSE
from sklearn.metrics import mean_squared_error

y_preds = model.predict(x_test)
mse = mean_squared_error(y_test, y_preds)
mse

In [None]:
df['squared_differences'] = np.square(df['differences'])
df.head()

In [None]:
# Calculate MSE by hand
squared = np.square(df['differences'])
squared.mean()

In [None]:
df_large_error = df.copy()
# Set the value with a single .iloc[row, column] call. The chained form
# df.iloc[0]['squared_differences'] = 16 may assign to a temporary copy of the
# row and leave the DataFrame unchanged.
squared_col = df_large_error.columns.get_loc('squared_differences')
df_large_error.iloc[0, squared_col] = 16
df_large_error

In [None]:
df_large_error['squared_differences'].mean()

In [None]:
# Look the column up by name rather than hard-coding position 3, so the cell
# keeps targeting 'squared_differences' if the column order ever changes.
squared_col = df_large_error.columns.get_loc('squared_differences')
# Rows at positions 1..99 get a squared difference of 20
df_large_error.iloc[1:100, squared_col] = 20
df_large_error

In [None]:
df_large_error['squared_differences'].mean()

### 4.2.3 Finally using the `scoring` parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

clf = RandomForestClassifier()

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_acc = cross_val_score(clf, x, y, cv=5, scoring=None)
cv_acc

In [None]:
# Cross-validated accuracy
print(f'The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%')

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
cv_acc

In [None]:
# Cross-validated accuracy
print(f'The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%')

In [None]:
# Precision
np.random.seed(42)
cv_precision = cross_val_score(clf, x, y, cv=5, scoring='precision')
cv_precision

In [None]:
# Cross-validated precision
print(f'The cross-validated precision is: {np.mean(cv_precision)}')

In [None]:
# Recall
np.random.seed(42)
cv_recall = cross_val_score(clf, x, y, cv=5, scoring='recall')
cv_recall

In [None]:
# Cross-validated recall
print(f'The cross-validated recall is: {np.mean(cv_recall)}')

Let's see the scoring parameter being used for a regression problem...

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

x = housing_df.drop('target', axis=1)
y = housing_df['target']

model = RandomForestRegressor()

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, x, y, cv=3, scoring=None)
np.mean(cv_r2)

In [None]:
# Mean squared error
cv_mse = cross_val_score(model, x, y, cv=5, scoring='neg_mean_squared_error')
np.mean(cv_mse)

In [None]:
cv_mse

In [None]:
# Mean absolute error
cv_mae = cross_val_score(model, x, y, cv=5, scoring='neg_mean_absolute_error')
np.mean(cv_mae)

In [None]:
cv_mae

## 4.3 Using different evaluation metrics as Scikit-learn functions

The 3rd way to evaluate scikit-learn machine learning models/estimators is to use the `sklearn.metrics` module.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

#create x and y
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create the model
clf = RandomForestClassifier()

# Fit the model
clf.fit(x_train, y_train)

# Make predictions
y_preds = clf.predict(x_test)

# Evaluate model using evaluation functions
print('Classifier metrics on the test set')
print(f'Accuracy: {accuracy_score(y_test, y_preds)*100:.2f}%')
print(f'Precision: {precision_score(y_test, y_preds)*100:.2f}%')
print(f'Recall: {recall_score(y_test, y_preds)*100:.2f}%')
print(f'F1: {f1_score(y_test, y_preds)*100:.2f}%')

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator before splitting and fitting
np.random.seed(42)

# Create x (features) and y (labels)
x = housing_df.drop('target', axis=1)
y = housing_df['target']

# Split data: 20% of the rows are held out for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create the model
model = RandomForestRegressor()

# Fit the model
model.fit(x_train, y_train)

# Make predictions
y_preds = model.predict(x_test)

# Evaluate model using evaluation functions.
# Unlike the classification metrics, these are not percentages: MAE is in the
# units of the target and MSE in squared units, so they are printed unscaled
# (multiplying them by 100 would misreport the error).
print('Regressor metrics on the test set')
print(f'R^2: {r2_score(y_test, y_preds):.2f}')
print(f'MAE: {mean_absolute_error(y_test, y_preds):.2f}')
print(f'MSE: {mean_squared_error(y_test, y_preds):.2f}')

## 5. Improving a model

First predictions = baseline predictions.
First model = baseline model

From a data perspective:
    
    * Could we collect more data? (generally, the more data, the better)
    * Could we improve our data?
    
From a model perspective:
    
    * Is there a better model we could use
    * Could we improve the current model?

   **Hyperparameters vs parameters**
   * parameters = the patterns in the data that the model finds on its own
   * hyperparameters = settings on a model you can adjust to (potentially) improve its ability to find patterns
   
Three ways to adjust hyperparameters:
* By hand
* Randomly with RandomizedSearchCV
* Exhaustively with GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Classifier with default hyperparameters; its settings are inspected in the next cell
clf = RandomForestClassifier()

In [None]:
# Show the classifier's hyperparameters and their current values
clf.get_params()

## 5.1 Tuning hyperparameters by hand

Let's make 3 sets: training, validation and test.

In [None]:
# Display the hyperparameters again to choose which ones to tune by hand
clf.get_params()

**We're going to try and adjust the following:**
* `max_depth`
* `max_features`
* `min_samples_leaf`
* `min_samples_split`
* `n_estimators`

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_preds(y_true, y_preds):
    """
    Compare true labels with predicted labels for a classification problem.

    Prints accuracy (as a percentage), precision, recall and F1 score, then
    returns a dict with the keys 'accuracy', 'precision', 'recall' and 'f1',
    each rounded to 2 decimal places.
    """
    # Metric functions in the order they are computed and reported
    scorers = {'accuracy': accuracy_score,
               'precision': precision_score,
               'recall': recall_score,
               'f1': f1_score}
    raw_scores = {name: scorer(y_true, y_preds) for name, scorer in scorers.items()}

    print(f"Acc: {raw_scores['accuracy'] * 100:.2f}%")
    print(f"Precision: {raw_scores['precision']:.2f}")
    print(f"Recall: {raw_scores['recall']:.2f}")
    print(f"F1 score: {raw_scores['f1']:.2f}")

    return {name: round(score, 2) for name, score in raw_scores.items()}

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global random number generator before shuffling and fitting
np.random.seed(42)

# Shuffle the data (frac=1 samples every row, in random order)
heart_disease_shuffled = heart_disease.sample(frac=1)

# Split into x (features) and y (labels)
x = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

# Split into train, validation & test sets by row position
train_split = round(0.7 * len(heart_disease_shuffled)) # first 70% of rows
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled)) # next 15% of rows
x_train, y_train = x.iloc[:train_split], y.iloc[:train_split]
x_valid, y_valid = x.iloc[train_split:valid_split], y.iloc[train_split:valid_split]
# The remaining rows form the test set; features and labels must use the SAME
# slice so that every test row is paired with its own label.
x_test, y_test = x.iloc[valid_split:], y.iloc[valid_split:]

clf = RandomForestClassifier()
clf.fit(x_train, y_train)

# Make baseline predictions
y_preds = clf.predict(x_valid)

# Evaluate the classifier on validation set
baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics

In [None]:
np.random.seed(42)

# Second classifier: same training split, but n_estimators set to 10
clf_2 = RandomForestClassifier(n_estimators=10).fit(x_train, y_train)

# Predict the validation rows with the second classifier
y_preds_2 = clf_2.predict(x_valid)

# Score the second classifier against the validation labels
clf_2_metrics = evaluate_preds(y_true=y_valid, y_preds=y_preds_2)

In [None]:
# Look at the baseline classifier's hyperparameters again before the next manual change
clf.get_params()

In [None]:
np.random.seed(42)

# Third classifier: same training split, but max_depth set to 10
clf_3 = RandomForestClassifier(max_depth=10).fit(x_train, y_train)

# Predict the validation rows with the third classifier
y_preds_3 = clf_3.predict(x_valid)

# Score the third classifier against the validation labels
clf_3_metrics = evaluate_preds(y_true=y_valid, y_preds=y_preds_3)

### 5.2 Hyperparameter tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search.
# 'auto' is not an accepted value for max_features in the scikit-learn version
# this notebook reports (1.3.0) and makes every fit that draws it fail; for
# classifiers it used to mean 'sqrt', so 'sqrt' and 'log2' are searched instead.
grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]}

np.random.seed(42)

# Split into x (features) & y (labels)
x = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

# Split into train and test sets (20% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10, # number of hyperparameter combinations to try
                            cv=5,
                            verbose=2)

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(x_train, y_train)

In [None]:
# Hyperparameter combination selected by the randomized search
rs_clf.best_params_

In [None]:
# Predict the test rows using the best estimator found by the randomized search
rs_y_preds = rs_clf.predict(X=x_test)

# Score those predictions against the test labels
rs_metrics = evaluate_preds(y_true=y_test, y_preds=rs_y_preds)

### 5.3 Hyperparameter tuning with GridSearchCV

In [None]:
# Display the search space that was used for the randomized search
grid

In [None]:
# Smaller search space for the exhaustive grid search.
# 'auto' is not an accepted value for max_features in the scikit-learn version
# this notebook reports (1.3.0) and makes every fit that uses it fail; for
# classifiers it used to mean 'sqrt', so 'sqrt' and 'log2' are searched instead.
grid_2 = {'n_estimators': [100, 200, 500],
          'max_depth': [None],
          'max_features': ['sqrt', 'log2'],
          'min_samples_split': [6],
          'min_samples_leaf': [1, 2]}

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(42)

# Label is the 'target' column; features are every other column
y = heart_disease_shuffled['target']
x = heart_disease_shuffled.drop(columns='target')

# Hold out 20% of the rows as the test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Base estimator whose hyperparameters are searched
clf = RandomForestClassifier(n_jobs=1)

# Setup GridSearchCV: tries every combination in grid_2 with 5-fold cross-validation
gs_clf = GridSearchCV(clf, grid_2, cv=5, verbose=2)

# Fit the GridSearchCV version of clf on the training split
gs_clf.fit(x_train, y_train)

In [None]:
# Hyperparameter combination selected by the grid search
gs_clf.best_params_

In [None]:
# Predict the test rows using the best estimator found by the grid search
gs_y_preds = gs_clf.predict(X=x_test)

# Score those predictions against the test labels
gs_metrics = evaluate_preds(y_true=y_test, y_preds=gs_y_preds)

Let's compare our different models' metrics.

In [None]:
# One column per model, one row per metric.
# NOTE: 'baseline' and 'clf_2' were scored on the validation split, while the
# two search results were scored on a test split, so the bars are not a
# like-for-like comparison.
metrics_by_model = {'baseline': baseline_metrics,
                    'clf_2': clf_2_metrics,
                    'random search': rs_metrics,
                    'grid search': gs_metrics}
compare_metrics = pd.DataFrame(metrics_by_model)

# Grouped bar chart; the trailing semicolon suppresses the Axes repr
compare_metrics.plot(kind='bar', figsize=(10, 8));

## 6. Saving and loading machine learning models

Two ways to save and load machine learning models:

1. With Python's `pickle` module
2. With the `joblib` module

**pickle**

In [None]:
import pickle

# Save an existing model to file.
# The context manager guarantees the file is flushed and closed before the
# next cell reads it back.
with open('gs_random_forest_model_1.pkl', 'wb') as model_file:
    pickle.dump(gs_clf, model_file)

In [None]:
# Load a saved model.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
# The context manager closes the file handle once the model is loaded.
with open('gs_random_forest_model_1.pkl', 'rb') as model_file:
    loaded_pickle_model = pickle.load(model_file)

In [None]:
# Predict with the model restored from the pickle file and score it on the test labels
pickle_y_preds = loaded_pickle_model.predict(X=x_test)
evaluate_preds(y_true=y_test, y_preds=pickle_y_preds)

**Joblib**

In [None]:
from joblib import dump, load

# Persist the fitted grid-search object to disk with joblib
dump(value=gs_clf, filename='gs_random_forest_model_1.joblib')

In [None]:
# Restore the model that was saved with joblib
loaded_joblib_model = load('gs_random_forest_model_1.joblib')

In [None]:
# Predict with the model restored from the joblib file and score it on the test labels
joblib_y_preds = loaded_joblib_model.predict(X=x_test)
evaluate_preds(y_true=y_test, y_preds=joblib_y_preds)

## 7. Putting it all together!

In [None]:
# Read the car sales CSV into a DataFrame and display it
data = pd.read_csv('data/car-sales-extended-missing-data.csv')
data

In [None]:
# Data type of each column
data.dtypes

In [None]:
# Number of missing values in each column
data.isna().sum()

Steps we want to do (all in one cell)

1. Fill the missing data
2. Convert data to numbers
3. Build a model on the data

It's also possible to use `GridSearchCV` or `RandomizedSearchCV` with our `Pipeline`.

In [None]:
# Getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global random number generator
import numpy as np
np.random.seed(42)

# Load the data and discard rows that have no label ("Price")
data = pd.read_csv("data/car-sales-extended-missing-data.csv").dropna(subset=["Price"])

# Categorical columns: fill gaps with the string "missing", then one-hot encode
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Door count: fill gaps with the constant 4
door_feature = ["Doors"]
door_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

# Numeric column: fill gaps with the column mean
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, categorical_features),
    ("door", door_transformer, door_feature),
    ("num", numeric_transformer, numeric_features),
])

# Full pipeline: preprocessing followed by the regressor.
# The step names are referenced by the parameter grid in the next cell.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor()),
])

# Features / label, then an 80/20 train/test split
y = data["Price"]
X = data.drop(columns="Price")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training split and report the score on the test split
model.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Use GridSearchCV with our regression Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# Parameter names follow the '<step>__<param>' convention, walking through the
# nested pipeline: preprocessor -> 'num' transformer -> 'imputer' step, and the
# 'model' step of the outer pipeline.
pipe_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    'model__min_samples_split': [2, 4]
}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
# Fit on the car-sales training split (capital X_train). The lowercase x_train
# still holds the heart-disease features from an earlier section and does not
# match this pipeline's columns or y_train.
gs_model.fit(X_train, y_train)

In [None]:
# Score on the car-sales test split (capital X_test), which pairs with y_test;
# the lowercase x_test holds heart-disease features from an earlier section.
gs_model.score(X_test, y_test)