In [2]:
## Introduction to Scikit-learn ( sklearn )

#This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.

#What we're going to cover:
#0. An end-to-end Scikit-Learn workflow
#1. Getting the data ready
#2. Choose the right estimator/algorithm for our problems
#3. Fit the model/algorithm and use it to make predictions on our data
#4. Evaluating a model
#5. Improve a model
#6. Save and load trained model
#7. Putting it all together!

In [1]:
# Standard imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

In [2]:
# Load the heart-disease dataset from the local data directory into a DataFrame.
heart_disease = pd.read_csv(filepath_or_buffer="./data/heart-disease.csv")

# Leave the frame as the cell's last expression so the notebook renders it.
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# Features: every column of the frame except the label column.
X = heart_disease.drop(columns=["target"])

# Labels: the "target" column on its own.
y = heart_disease.loc[:, "target"]

In [4]:
# Preview the first five feature rows to confirm the label column is gone.
X.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
# Show the first few labels alongside the class balance, displayed as one tuple.
first_labels = y.head()
class_counts = y.value_counts()
first_labels, class_counts

(0    1
 1    1
 2    1
 3    1
 4    1
 Name: target, dtype: int64,
 1    165
 0    138
 Name: target, dtype: int64)

In [8]:
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split

# Pin random_state so the same rows land in train/test on every
# Restart & Run All; without it the split (and every score computed
# from it) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Sanity check: the split neither lost nor duplicated rows.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((227, 13), (76, 13), (227,), (76,))

2. Choose the model and hyperparameters
##### The model is often referred to as `model`, `clf` (short for classifier), or `estimator` (the term used in the Scikit-Learn documentation).
##### Hyperparameters are like knobs on an oven you can tune to cook your favourite dish.

In [10]:
 #We'll use a Random Forest
from sklearn.ensemble import RandomForestClassifier
# No arguments are passed, so every hyperparameter keeps its library default.
clf = RandomForestClassifier()

In [11]:
# Inspect the estimator's current hyperparameters; nothing has been tuned yet,
# so these are the values RandomForestClassifier() was constructed with.
clf.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

3. Fit the model to the data and use it to make a prediction

Fitting the model on the data involves passing it the data and asking it to figure out the patterns.

If there are labels (supervised learning), the model tries to work out the relationship between the data and the labels.

If there are no labels (unsupervised learning), the model tries to find patterns and group similar samples together.

In [12]:
# Seed NumPy's global RNG immediately before fitting: the forest was built
# with random_state=None, so it draws its randomness from that global state.
# Seeding here leaves the default hyperparameters untouched while making the
# fitted model repeatable for a given train/test split.
np.random.seed(42)
clf.fit(X_train, y_train)

RandomForestClassifier()

In [13]:
# Demonstrate the shape requirement: predict() expects a 2D array, so a flat
# 1D array is rejected with a ValueError. The error is caught and printed so
# the demonstration stays visible without halting a Restart & Run All.
try:
    y_label = clf.predict(np.array([0, 2, 3, 4]))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Expected 2D array, got 1D array instead:
array=[0. 2. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [14]:
# Inputs to predict() must be laid out like X_train (one row per sample, one
# column per feature); peek at the first five test rows to see that layout.
X_test.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
50,51,0,2,130,256,0,0,149,0,0.5,2,0,2
136,60,0,2,120,178,1,1,96,0,0.0,2,0,2
21,44,1,2,130,233,0,1,179,1,0.4,2,0,2
172,58,1,1,120,284,0,0,160,0,1.8,1,0,2
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3


In [15]:
# Predict a label for every held-out test row; kept for the evaluation cells.
y_preds = clf.predict(X=X_test)

In [16]:
# Mean accuracy on the training rows, i.e. data the model has already seen.
train_accuracy = clf.score(X_train, y_train)
train_accuracy

1.0

In [17]:
# Mean accuracy on the held-out test rows, which the model never saw in fit().
test_accuracy = clf.score(X_test, y_test)
test_accuracy

0.7763157894736842

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Per-class precision / recall / F1 for the test-set predictions. The report
# comes back as text, so print() is used to keep its column alignment.
report = classification_report(y_test, y_preds)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.78      0.75        32
           1       0.83      0.77      0.80        44

    accuracy                           0.78        76
   macro avg       0.77      0.78      0.77        76
weighted avg       0.78      0.78      0.78        76



In [19]:
# Counts of true vs. predicted labels on the test set
# (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)
conf_mat

array([[25,  7],
       [10, 34]])

In [20]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_preds)

0.7763157894736842

In [21]:
# Sweep the forest size from 10 to 90 trees in steps of 10 and report each
# model's accuracy on the test set (single train/test split, no cross-validation).
np.random.seed(42)
for n_trees in range(10, 100, 10):
    print(f"Trying model with {n_trees} estimators...")
    model = RandomForestClassifier(n_estimators=n_trees)
    model.fit(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    print(f"Model accuracy on test set: {test_accuracy * 100}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set: 72.36842105263158%

Trying model with 20 estimators...
Model accuracy on test set: 84.21052631578947%

Trying model with 30 estimators...
Model accuracy on test set: 82.89473684210526%

Trying model with 40 estimators...
Model accuracy on test set: 81.57894736842105%

Trying model with 50 estimators...
Model accuracy on test set: 84.21052631578947%

Trying model with 60 estimators...
Model accuracy on test set: 81.57894736842105%

Trying model with 70 estimators...
Model accuracy on test set: 82.89473684210526%

Trying model with 80 estimators...
Model accuracy on test set: 82.89473684210526%

Trying model with 90 estimators...
Model accuracy on test set: 85.52631578947368%

