# Introduction to Scikit-Learn (SKLearn)

This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.

What we're going to cover:

0. An end-to-end Scikit-Learn Workflow
1. Getting the data ready
2. Choose the right estimator (model, algorithm)
3. Fit the model/estimator/algorithm and use it to make predictions on the data
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Put it all together


In [34]:
# Keep the notebook outline as a Python list so it can be re-displayed
# further down as a reminder of where we are in the workflow.
what_we_are_covering = [
    "0. An end-to-end Scikit-Learn Workflow",
    "1. Getting the data ready",
    "2. Choose the right estimator (model, algorithm)",
    "3. Fit the model/estimator/algorithm and use it to make predictions on the data",
    "4. Evaluating a model",
    "5. Improve a model",
    "6. Save and load a trained model",
    "7. Put it all together",
]

# 0. An End-To-End Scikit-Learn Workflow


In [35]:
#1. Get the data ready

# Standar Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease, heart_disease[:5]

(     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
 0     63    1   3       145   233    1        0      150      0      2.3   
 1     37    1   2       130   250    0        1      187      0      3.5   
 2     41    0   1       130   204    0        0      172      0      1.4   
 3     56    1   1       120   236    0        1      178      0      0.8   
 4     57    0   0       120   354    0        1      163      1      0.6   
 ..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
 298   57    0   0       140   241    0        1      123      1      0.2   
 299   45    1   3       110   264    0        1      132      0      1.2   
 300   68    1   0       144   193    1        1      141      0      3.4   
 301   57    1   0       130   131    0        1      115      1      1.2   
 302   57    0   1       130   236    0        0      174      0      0.0   
 
      slope  ca  thal  target  
 0        0   0     1       1  
 1        

In [36]:
# Features matrix (X): every column of heart_disease except the "target" column.
X = heart_disease.drop(columns="target")

# Labels (Y): the "target" column on its own.
Y = heart_disease["target"]

In [37]:
# 2. Choose the right model / estimator / algorithm and hyperparameters
#    (the dials on the model that make it better or worse)
from sklearn.ensemble import RandomForestClassifier

# Instantiate the classifier with no arguments, i.e. with its default settings.
clf = RandomForestClassifier()

# We'll use the default hyperparameters; get_params() lists them.
clf.get_params()



{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [38]:
# 3. Fit the model to the training data

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set (test_size=0.2); the rest is used for training.
# NOTE(review): no random_state is passed, so the split presumably differs between
# runs — confirm whether a fixed seed is wanted. Also note that `Y_Train` is
# capitalised differently from `Y_test`; later cells rely on these exact names.
X_train, X_test, Y_Train, Y_test = train_test_split(X,Y, test_size= 0.2)

In [39]:
len(X_test), len(X_train), len(Y_Train), len(Y_test)

(61, 242, 242, 61)

In [40]:
clf.fit(X_train, Y_Train)

RandomForestClassifier()

# Make a prediction

y_label = clf.predict(X_test[:1])  # predict needs 2D input with the same feature columns used in fit; a 1D 4-value array raises ValueError

In [41]:
y_preds = clf.predict(X_test)
y_preds

array([0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [42]:
Y_test

226    0
198    0
247    0
117    1
194    0
      ..
200    0
11     1
275    0
101    1
97     1
Name: target, Length: 61, dtype: int64

In [43]:
# 4. Evaluate the model on the training data and test data
clf.score(X_train, Y_Train)

1.0

In [44]:
clf.score(X_test, Y_test)

0.7868852459016393

In [45]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


print(classification_report(Y_test, y_preds))

              precision    recall  f1-score   support

           0       0.83      0.69      0.75        29
           1       0.76      0.88      0.81        32

    accuracy                           0.79        61
   macro avg       0.80      0.78      0.78        61
weighted avg       0.79      0.79      0.78        61



In [46]:
confusion_matrix(Y_test, y_preds)

array([[20,  9],
       [ 4, 28]], dtype=int64)

In [47]:
accuracy_score(Y_test, y_preds)

0.7868852459016393

In [48]:
# 5. Improve the model
# Try different values of n_estimators (10, 20, ..., 90) and compare test accuracy.

# Seed NumPy's global RNG before the loop.
# NOTE(review): presumably this is what makes the forests below repeatable, since no
# random_state is passed to RandomForestClassifier — confirm.
np.random.seed(42)

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    # Rebinds the notebook-level `clf`; after the loop it is the last model fitted (n_estimators=90).
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, Y_Train)
    print(f"Model accuracy on test set: {clf.score(X_test, Y_test) * 100:.2f}%")
    print("")


Trying model with 10 estimators...
Model accuracy on test set: 78.69%

Trying model with 20 estimators...
Model accuracy on test set: 80.33%

Trying model with 30 estimators...
Model accuracy on test set: 83.61%

Trying model with 40 estimators...
Model accuracy on test set: 80.33%

Trying model with 50 estimators...
Model accuracy on test set: 78.69%

Trying model with 60 estimators...
Model accuracy on test set: 83.61%

Trying model with 70 estimators...
Model accuracy on test set: 80.33%

Trying model with 80 estimators...
Model accuracy on test set: 81.97%

Trying model with 90 estimators...
Model accuracy on test set: 81.97%



In [49]:
# 6. Save a model and load it
import pickle

# Write the current `clf` to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails, instead of leaving the handle
# open until it happens to be garbage-collected.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [50]:
# Read the pickled model back from disk; the context manager closes the file
# handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files you
# created yourself or otherwise trust.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_clf = pickle.load(model_file)

# Score the reloaded model on the held-out test data.
loaded_clf.score(X_test, Y_test)

0.819672131147541

In [51]:
adjusted_model = RandomForestClassifier(n_estimators=60)
adjusted_model.fit(X_train, Y_Train)

adjusted_model.score(X_train, Y_Train)

1.0

In [52]:
adjusted_model.score(X_test, Y_test)

0.819672131147541

In [53]:
import sklearn
sklearn.show_versions()


System:
    python: 3.8.11 (default, Aug  6 2021, 09:57:55) [MSC v.1916 64 bit (AMD64)]
executable: C:\Projects\first-machine-learning-project\env\python.exe
   machine: Windows-10-10.0.19043-SP0

Python dependencies:
          pip: 21.2.2
   setuptools: 52.0.0.post20210125
      sklearn: 0.24.2
        numpy: 1.19.2
        scipy: 1.6.2
       Cython: None
       pandas: 1.2.3
   matplotlib: 3.3.4
       joblib: 1.0.1
threadpoolctl: 2.2.0

Built with OpenMP: True


In [54]:
sklearn.show_versions()


System:
    python: 3.8.11 (default, Aug  6 2021, 09:57:55) [MSC v.1916 64 bit (AMD64)]
executable: C:\Projects\first-machine-learning-project\env\python.exe
   machine: Windows-10-10.0.19043-SP0

Python dependencies:
          pip: 21.2.2
   setuptools: 52.0.0.post20210125
      sklearn: 0.24.2
        numpy: 1.19.2
        scipy: 1.6.2
       Cython: None
       pandas: 1.2.3
   matplotlib: 3.3.4
       joblib: 1.0.1
threadpoolctl: 2.2.0

Built with OpenMP: True


In [55]:
what_we_are_covering

['0. An end-to-end Scikit-Learn Workflow',
 '1. Getting the data ready',
 '2. Choose the right estimator (model, algorithm)',
 '3. Fit the model/estimator/algorithm and use it to make predictions on the data',
 '4. Evaluating a model',
 '5. Improve a model',
 '6. Save and load a trained model',
 '7. Put it all together']

# 1. Getting the Data Ready to be used with machine learning


Three main things we have to do to get the data ready are:

    1. Split the data into features and labels (usually known as `X` & `y`)
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numeric values to numeric values (also called Feature Encoding)

In [56]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [57]:
# Features: every column of heart_disease apart from "target".
X = heart_disease.drop(columns="target")
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [58]:
# Labels: the "target" column of heart_disease.
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [59]:
len(X), len(y)

(303, 303)

In [60]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# test_size=0.5 places half of the rows in the test set.
# NOTE(review): no random_state is passed, so the rows in each half presumably
# change between runs — confirm whether a fixed seed is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

In [61]:
# Compare the shapes of all four splits. The original listed y_test.shape twice
# and never showed y_train.shape, hiding the label count of the training split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((151, 13), (152, 13), (151,), (152,))

In [62]:
X.shape, heart_disease.shape

((303, 13), (303, 14))

In [63]:
car_sales = pd.read_csv("data/car-sales.csv")

In [64]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [65]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [66]:
X = car_sales.drop("Price", axis = 1)

In [67]:
y = car_sales["Price"]

In [68]:
X, y

(     Make Colour  Odometer (KM)  Doors
 0  Toyota  White         150043      4
 1   Honda    Red          87899      4
 2  Toyota   Blue          32549      3
 3     BMW  Black          11179      5
 4  Nissan  White         213095      4
 5  Toyota  Green          99213      4
 6   Honda   Blue          45698      4
 7   Honda   Blue          54738      4
 8  Toyota  White          60000      4
 9  Nissan  White          31600      4,
 0     $4,000.00
 1     $5,000.00
 2     $7,000.00
 3    $22,000.00
 4     $3,500.00
 5     $4,500.00
 6     $7,500.00
 7     $7,000.00
 8     $6,250.00
 9     $9,700.00
 Name: Price, dtype: object)

In [69]:
X.shape, y.shape, car_sales.shape

((10, 4), (10,), (10, 5))

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [71]:
X_train, X_test

(     Make Colour  Odometer (KM)  Doors
 1   Honda    Red          87899      4
 6   Honda   Blue          45698      4
 9  Nissan  White          31600      4
 2  Toyota   Blue          32549      3
 7   Honda   Blue          54738      4
 5  Toyota  Green          99213      4
 4  Nissan  White         213095      4
 0  Toyota  White         150043      4,
      Make Colour  Odometer (KM)  Doors
 8  Toyota  White          60000      4
 3     BMW  Black          11179      5)

In [72]:
X_train.shape, X_test.shape

((8, 4), (2, 4))

In [73]:
# Deliberately try to fit on the raw car-sales features to show that the
# estimator rejects non-numeric columns such as "Make" and "Colour".
# `y_mtrain` was a typo for `y_train`; left as-is it raises a NameError before
# fit() ever runs, so the intended ValueError is never demonstrated.
# The expected error is caught and printed so "Restart & Run All" can continue.
try:
    clf.fit(X_train, y_train)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: 'Honda'

In [76]:
from sklearn import preprocessing

# Map each distinct "Make" string to an integer code and print the encoded column.
# NOTE(review): scikit-learn's documentation describes LabelEncoder as meant for
# target labels (y); for feature columns OneHotEncoder / OrdinalEncoder may be
# the better fit — confirm the intent here.
label_encoder = preprocessing.LabelEncoder();
print(label_encoder.fit_transform(car_sales["Make"]))

[3 1 3 0 2 3 1 1 3 2]


In [77]:
print(label_encoder.fit_transform(car_sales["Colour"]))

[4 3 1 0 4 2 1 1 4 4]
