In [1]:
### 
# this notebook demonstrates some of the most usefull function of the beautifull scikit learn library

# 0. An end-to-to scikit learn workflow
# 1. Getting the data ready
# 2. Choose the right estimator/algorithm for our problems
# 3. Fit the model
# 4. Evaluating a model
# 5. Imporve a model
# 6. Save and load a trained model
# 7. Putting it all together

###

# ***0. End to end workflow***

In [2]:
import pandas as pd

In [3]:
 heart_disease = pd.read_csv('./heart-disease.csv')

In [4]:
heart_disease.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
heart_disease.shape

(303, 14)

In [6]:
heart_disease.head(20)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


# ***1. Getting the Data Ready***

In [7]:
# Create X (features matrix)

x = heart_disease.drop("target", axis = 1)
x


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [8]:
# create Y (target)


In [9]:
y = heart_disease['target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

# ***2. Choose the Estimator/ Algorithm***

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
clf = RandomForestClassifier(n_estimators=100)


In [12]:
# We will keep the deafult hypperparamters

In [13]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8)

In [16]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
199,65,1,0,110,248,0,0,158,0,0.6,2,2,1
231,57,1,0,165,289,1,0,124,0,1.0,1,3,3
181,65,0,0,150,225,0,0,114,0,1.0,1,3,3
288,57,1,0,110,335,0,1,143,1,3.0,1,1,3
156,47,1,2,130,253,0,1,179,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2
93,54,0,1,132,288,1,0,159,1,0.0,2,1,2
166,67,1,0,120,229,0,0,129,1,2.6,1,2,3
12,49,1,1,130,266,0,1,171,0,0.6,2,0,2


In [17]:
 x_train.shape

(242, 13)

In [18]:
x_test.shape

(61, 13)

In [19]:
clf.fit(x_train, y_train)

In [20]:
#  make predictions
import numpy as np

In [21]:
y_predic = clf.predict(x_test) 

In [22]:
y_predic

array([1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1], dtype=int64)

In [23]:
#  evaluate the model

In [24]:
clf.score(x_train, y_train)

1.0

In [25]:
clf.score(x_test, y_test)

0.819672131147541

In [26]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [27]:
print(classification_report(y_test, y_predic))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80        28
           1       0.82      0.85      0.84        33

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61



In [28]:
 confusion_matrix(y_test, y_predic)

array([[22,  6],
       [ 5, 28]], dtype=int64)

In [29]:
accuracy_score(y_test, y_predic)

0.819672131147541

# chechking the accuracy for different value of n_estimators

In [30]:
for i in range(10, 200, 10):
    print(f"n_estimators = {i} :")  # Using f-string to format the string with the value of i
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print(f"{score * 100 : .2f}")
    if(score * 100 > 90):
        print(f"Max for {i} : {score}")


n_estimators = 10 :
 78.69
n_estimators = 20 :
 78.69
n_estimators = 30 :
 80.33
n_estimators = 40 :
 77.05
n_estimators = 50 :
 75.41
n_estimators = 60 :
 78.69
n_estimators = 70 :
 80.33
n_estimators = 80 :
 80.33
n_estimators = 90 :
 80.33
n_estimators = 100 :
 78.69
n_estimators = 110 :
 81.97
n_estimators = 120 :
 81.97
n_estimators = 130 :
 80.33
n_estimators = 140 :
 78.69
n_estimators = 150 :
 78.69
n_estimators = 160 :
 80.33
n_estimators = 170 :
 81.97
n_estimators = 180 :
 81.97
n_estimators = 190 :
 80.33


In [31]:
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
199,65,1,0,110,248,0,0,158,0,0.6,2,2,1
231,57,1,0,165,289,1,0,124,0,1.0,1,3,3
181,65,0,0,150,225,0,0,114,0,1.0,1,3,3
288,57,1,0,110,335,0,1,143,1,3.0,1,1,3
156,47,1,2,130,253,0,1,179,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2
93,54,0,1,132,288,1,0,159,1,0.0,2,1,2
166,67,1,0,120,229,0,0,129,1,2.6,1,2,3
12,49,1,1,130,266,0,1,171,0,0.6,2,0,2


In [32]:
y_train

199    0
231    0
181    0
288    0
156    1
      ..
271    0
93     1
166    0
12     1
158    1
Name: target, Length: 242, dtype: int64

In [33]:
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2
65,35,0,0,138,183,0,1,182,0,1.4,2,0,2
59,57,0,0,128,303,0,0,159,0,0.0,2,1,2
51,66,1,0,120,302,0,0,151,0,0.4,1,0,2
46,44,1,2,140,235,0,0,180,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,65,1,3,138,282,1,0,174,0,1.4,1,1,2
95,53,1,0,142,226,0,0,111,1,0.0,2,0,3
253,67,1,0,100,299,0,0,125,1,0.9,1,2,2
92,52,1,2,138,223,0,1,169,0,0.0,2,4,2


In [34]:
y_test

131    1
65     1
59     1
51     1
46     1
      ..
222    0
95     1
253    0
92     1
57     1
Name: target, Length: 61, dtype: int64

### Saving the Model and Load it

In [35]:
import pickle

In [36]:
pickle.dump(clf, open("random_forest_classifier.pkl", 'wb'))

In [37]:
load_clf = pickle.load(open("random_forest_classifier.pkl", 'rb'))

In [38]:
load_clf.score(x_test, y_test)

0.8032786885245902

In [39]:
load_clf.score(x_train, y_train)

1.0

In [40]:
heart_disease.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

# ***Make sure Data is all Numericals***

In [52]:
sales = pd.read_csv('./car-sales.csv')

In [53]:
sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [54]:
sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [55]:
sales['Price'] = sales['Price'].str.replace('[$,.]' , '', regex = True).astype('int64')

In [56]:
sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,400000
1,Honda,Red,87899,4,500000
2,Toyota,Blue,32549,3,700000
3,BMW,Black,11179,5,2200000
4,Nissan,White,213095,4,350000
5,Toyota,Green,99213,4,450000
6,Honda,Blue,45698,4,750000
7,Honda,Blue,54738,4,700000
8,Toyota,White,60000,4,625000
9,Nissan,White,31600,4,970000


In [57]:
sales['Price'] = sales['Price'] // 100

In [58]:
sales


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,4000
1,Honda,Red,87899,4,5000
2,Toyota,Blue,32549,3,7000
3,BMW,Black,11179,5,22000
4,Nissan,White,213095,4,3500
5,Toyota,Green,99213,4,4500
6,Honda,Blue,45698,4,7500
7,Honda,Blue,54738,4,7000
8,Toyota,White,60000,4,6250
9,Nissan,White,31600,4,9700


In [60]:
sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [61]:
car_sales = pd.read_csv('./car-sales-extended.csv')

In [62]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [63]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [65]:
# Turn the categories into number
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [66]:
x = car_sales.drop('Price', axis=1)

In [67]:
x

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
995,Toyota,Black,35820,4
996,Nissan,White,155144,3
997,Nissan,Blue,66604,4
998,Honda,White,215883,4


In [79]:
y = car_sales['Price']

In [70]:
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                   one_hot,
                                   categorical_features)],
                                 remainder='passthrough')
transformed_x = transformer.fit_transform(x)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [73]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [77]:
dummies = pd.get_dummies(car_sales[['Make', 'Colour', 'Doors']])
dummies.astype('int64')

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [75]:
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [80]:
x_train, x_test, y_train, y_test = train_test_split(transformed_x,
                                                   y, 
                                                   train_size=0.8)

In [81]:
from sklearn.ensemble import RandomForestRegressor

In [82]:
model = RandomForestRegressor()

In [83]:
model.fit(x_train, y_train)

In [84]:
model.score(x_train, y_train)

0.8976047316237048

In [85]:
model.score(x_test, y_test)

0.24139281457524164

In [86]:
car_sales_missing = pd.read_csv('./car-sales-extended-missing-data.csv')

In [87]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [88]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64