# 6. Saving and loading machine learning models:

Two ways to save and load machine learning models:
 1. With python's `pickle` method
 2. With `joblib` module

In [11]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV


np.random.seed(42)

heart_disease = pd.read_csv(r"C:\Users\cos_9\PycharmProjects\machine_learning_and_data_science_bootcamp\resources\heart-disease.csv")

heart_disease_shuffled = heart_disease.sample(frac=1)

grid = {
    "n_estimators": [200, 500, 1000, 1200],
    "max_depth": [None, 5, 10, 20, 30],
    "max_features": ["auto", "sqrt"],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4]
}

X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = RandomForestClassifier(n_jobs=1)

# Setup RandomisedSearchCV

rs_clf = RandomizedSearchCV(estimator=clf,
                           param_distributions=grid,
                           n_iter=10,
                           cv=5,
                           verbose=2)

rs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time=   1.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time=   1.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time=   1.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time=   1.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time=   1.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   0.8s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   0.8s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimat

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=1),
                   param_distributions={'max_depth': [None, 5, 10, 20, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [200, 500, 1000, 1200]},
                   verbose=2)

In [12]:
def evaluate_preds(y_true, y_preds):
    """
    Performs evaluation comparison on y_true labels vs. y_pred labels on a classification
    """
    
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"F1: {f1 * 100:.2f}%")
    return {"accuracy": round(accuracy, 3), "precision": round(precision, 3), "recall": round(recall, 3), "f1": round(f1, 3)}

In [13]:
# save the model using dump
pickle.dump(rs_clf, open("gs_random_forest_model_1.pkl", "wb"))

In [14]:
# loading the saved model
loaded_pickle_model = pickle.load(open("gs_random_forest_model_1.pkl", "rb"))

In [15]:
pickle_y_preds = loaded_pickle_model.predict(X_test)
evaluate_preds(y_test, pickle_y_preds)


Acc: 82.42%
Precision: 81.82%
Recall: 88.24%
F1: 84.91%


{'accuracy': 0.824, 'precision': 0.818, 'recall': 0.882, 'f1': 0.849}

In [16]:
from joblib import dump, load

dump(rs_clf, filename="dump_random_forest_model_1.joblib")

['dump_random_forest_model_1.joblib']

In [None]:
loaded_joblib_model = load(filename="dump_random_forest_model_1.joblib")

In [17]:
joblib_y_preds = loaded_pickle_model.predict(X_test)
evaluate_preds(y_test, pickle_y_preds)

Acc: 82.42%
Precision: 81.82%
Recall: 88.24%
F1: 84.91%


{'accuracy': 0.824, 'precision': 0.818, 'recall': 0.882, 'f1': 0.849}