In [1]:
## this notebook demonstrates some of the most usefull function of the beautifull scikit learn library

# 0. An end-to-to scikit learn workflow
# 1. Getting the data ready
# 2. Choose the right estimator/algorithm for our problems
# 3. Fit the model
# 4. Evaluating a model
# 5. Imporve a model
# 6. Save and load a trained model
# 7. Putting it all together **


# ***0. End to end workflow***

In [2]:
import pandas as pd

In [3]:
 heart_disease = pd.read_csv('./heart-disease.csv')

In [4]:
heart_disease.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
heart_disease.shape

(303, 14)

In [6]:
heart_disease.head(20)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


# ***1. Getting the Data Ready***

In [7]:
# Create X (features matrix)

x = heart_disease.drop("target", axis = 1)
x


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [8]:
# create Y (target)


In [9]:
y = heart_disease['target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

# ***2. Choose the Estimator/ Algorithm***

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
clf = RandomForestClassifier(n_estimators=100)


In [12]:
# We will keep the deafult hypperparamters

In [13]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8)

In [16]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
76,51,1,2,125,245,1,0,166,0,2.4,1,0,2
214,56,1,0,125,249,1,0,144,1,1.2,1,1,2
152,64,1,3,170,227,0,0,155,0,0.6,1,0,3
33,54,1,2,125,273,0,0,152,0,0.5,0,1,2
281,52,1,0,128,204,1,1,156,1,1.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,62,0,0,140,394,0,0,157,0,1.2,1,0,2
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
125,34,0,1,118,210,0,1,192,0,0.7,2,0,2


In [17]:
 x_train.shape

(242, 13)

In [18]:
x_test.shape

(61, 13)

In [19]:
clf.fit(x_train, y_train)

In [20]:
#  make predictions
import numpy as np

In [21]:
y_predic = clf.predict(x_test) 

In [22]:
y_predic

array([0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1], dtype=int64)

In [23]:
#  evaluate the model

In [24]:
clf.score(x_train, y_train)

1.0

In [25]:
clf.score(x_test, y_test)

0.8360655737704918

In [26]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [27]:
print(classification_report(y_test, y_predic))

              precision    recall  f1-score   support

           0       0.93      0.76      0.83        33
           1       0.76      0.93      0.84        28

    accuracy                           0.84        61
   macro avg       0.85      0.84      0.84        61
weighted avg       0.85      0.84      0.84        61



In [28]:
 confusion_matrix(y_test, y_predic)

array([[25,  8],
       [ 2, 26]], dtype=int64)

In [29]:
accuracy_score(y_test, y_predic)

0.8360655737704918

# chechking the accuracy for different value of n_estimators

In [30]:
for i in range(10, 200, 10):
    print(f"n_estimators = {i} :")  # Using f-string to format the string with the value of i
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print(f"{score * 100 : .2f}")
    if(score * 100 > 90):
        print(f"Max for {i} : {score}")


n_estimators = 10 :
 85.25
n_estimators = 20 :
 86.89
n_estimators = 30 :
 81.97
n_estimators = 40 :
 80.33
n_estimators = 50 :
 85.25
n_estimators = 60 :
 80.33
n_estimators = 70 :
 83.61
n_estimators = 80 :
 81.97
n_estimators = 90 :
 85.25
n_estimators = 100 :
 83.61
n_estimators = 110 :
 83.61
n_estimators = 120 :
 81.97
n_estimators = 130 :
 81.97
n_estimators = 140 :
 80.33
n_estimators = 150 :
 83.61
n_estimators = 160 :
 81.97
n_estimators = 170 :
 81.97
n_estimators = 180 :
 83.61
n_estimators = 190 :
 81.97


In [31]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
76,51,1,2,125,245,1,0,166,0,2.4,1,0,2
214,56,1,0,125,249,1,0,144,1,1.2,1,1,2
152,64,1,3,170,227,0,0,155,0,0.6,1,0,3
33,54,1,2,125,273,0,0,152,0,0.5,0,1,2
281,52,1,0,128,204,1,1,156,1,1.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,62,0,0,140,394,0,0,157,0,1.2,1,0,2
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
125,34,0,1,118,210,0,1,192,0,0.7,2,0,2


In [32]:
y_train

76     1
214    0
152    1
33     1
281    0
      ..
96     1
131    1
300    0
125    1
71     1
Name: target, Length: 242, dtype: int64

In [33]:
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
183,58,1,2,112,230,0,0,165,0,2.5,1,1,3
30,41,0,1,105,198,0,1,168,0,0.0,2,1,2
172,58,1,1,120,284,0,0,160,0,1.8,1,0,2
92,52,1,2,138,223,0,1,169,0,0.0,2,4,2
288,57,1,0,110,335,0,1,143,1,3.0,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,65,0,2,140,417,1,0,157,0,0.8,2,1,2
117,56,1,3,120,193,0,0,162,0,1.9,1,0,3
120,64,0,0,130,303,0,1,122,0,2.0,1,2,2
209,59,1,0,140,177,0,1,162,1,0.0,2,1,3


In [34]:
y_test

183    0
30     1
172    0
92     1
288    0
      ..
28     1
117    1
120    1
209    0
100    1
Name: target, Length: 61, dtype: int64

### Saving the Model and Load it

In [35]:
import pickle

In [36]:
pickle.dump(clf, open("random_forest_classifier.pkl", 'wb'))

In [37]:
load_clf = pickle.load(open("random_forest_classifier.pkl", 'rb'))

In [38]:
load_clf.score(x_test, y_test)

0.819672131147541

In [39]:
load_clf.score(x_train, y_train)

1.0