In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## 1. Get the data

In [5]:
# Import dataset
heart_disease = pd.read_csv("./data/heart-disease.csv")

# View the data
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
# Create X (all the feature columns)
X = heart_disease.drop("target", axis=1)

# Create y (the target column)
y = heart_disease["target"]

In [20]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

# View the data shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((227, 13), (76, 13), (227,), (76,))

## 2. Choose the model/estimator

In [21]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

## 3. Fit the model to the data and make a prediction


In [22]:
model.fit(X_train, y_train);

In [23]:
# Make predictions
y_preds = model.predict(X_test)
# This will be in the same format as y_test
y_preds

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1])

In [14]:
# You can also predict with probabilities (on classification models)
y_probs = model.predict_proba(X_test)
#View probabilities
y_probs

array([[0.29, 0.71],
       [0.88, 0.12],
       [0.45, 0.55],
       [0.25, 0.75],
       [0.93, 0.07],
       [0.9 , 0.1 ],
       [0.1 , 0.9 ],
       [0.46, 0.54],
       [0.91, 0.09],
       [0.31, 0.69],
       [0.31, 0.69],
       [0.02, 0.98],
       [0.01, 0.99],
       [0.27, 0.73],
       [0.42, 0.58],
       [0.79, 0.21],
       [0.39, 0.61],
       [0.03, 0.97],
       [0.51, 0.49],
       [0.51, 0.49],
       [0.  , 1.  ],
       [0.58, 0.42],
       [0.03, 0.97],
       [0.61, 0.39],
       [0.04, 0.96],
       [0.19, 0.81],
       [0.87, 0.13],
       [0.33, 0.67],
       [0.46, 0.54],
       [0.46, 0.54],
       [0.13, 0.87],
       [0.99, 0.01],
       [0.08, 0.92],
       [0.01, 0.99],
       [0.13, 0.87],
       [0.  , 1.  ],
       [0.1 , 0.9 ],
       [0.51, 0.49],
       [0.92, 0.08],
       [0.62, 0.38],
       [0.61, 0.39],
       [0.93, 0.07],
       [0.37, 0.63],
       [0.09, 0.91],
       [0.69, 0.31],
       [0.57, 0.43],
       [0.68, 0.32],
       [0.92,

In [27]:
X_test.loc[40]

age          51.0
sex           0.0
cp            2.0
trestbps    140.0
chol        308.0
fbs           0.0
restecg       0.0
thalach     142.0
exang         0.0
oldpeak       1.5
slope         2.0
ca            1.0
thal          2.0
Name: 40, dtype: float64

In [30]:
heart_disease.loc[40]

age          51.0
sex           0.0
cp            2.0
trestbps    140.0
chol        308.0
fbs           0.0
restecg       0.0
thalach     142.0
exang         0.0
oldpeak       1.5
slope         2.0
ca            1.0
thal          2.0
target        1.0
Name: 40, dtype: float64

In [31]:
# Make a prediction on a single sample (has to be array)
model.predict(np.array(X_test.loc[40]).reshape(1, -1))



array([1])

## 4. Evaluate the model

In [32]:
# On the training set
model.score(X_train, y_train)

1.0

In [33]:
# On the test set (unseen)
model.score(X_test, y_test)

0.7894736842105263

## 5. Experiment to improve (hyperparameter tuning)

In [34]:
# Try different numbers of estimators (n_estimators is a hyperparameter you can change)
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accruacy on test set: {model.score(X_test, y_test)}")
    print("")

Trying model with 10 estimators...
Model accruacy on test set: 0.7631578947368421

Trying model with 20 estimators...
Model accruacy on test set: 0.8289473684210527

Trying model with 30 estimators...
Model accruacy on test set: 0.8289473684210527

Trying model with 40 estimators...
Model accruacy on test set: 0.8289473684210527

Trying model with 50 estimators...
Model accruacy on test set: 0.8289473684210527

Trying model with 60 estimators...
Model accruacy on test set: 0.8421052631578947

Trying model with 70 estimators...
Model accruacy on test set: 0.8289473684210527

Trying model with 80 estimators...
Model accruacy on test set: 0.8421052631578947

Trying model with 90 estimators...
Model accruacy on test set: 0.8157894736842105



## 6. Save a model for later use

In [35]:
import pickle

# Save trained model to file
pickle.dump(model, open("./model/random_forest_model_1.pkl", "wb"))

In [40]:
# Load a saved model and make a prediction on a single example
loaded_model = pickle.load(open("./model/random_forest_model_1.pkl", "rb"))
loaded_model.predict(np.array(X_test.loc[40]).reshape(1, -1))



array([1])