<div style="padding:30px 0px;">
    <h1 align="center" style="padding:50px">Saving Sklearn Models</h1>
    <p align="center" style="font-size:small;">Seth Pruitt<br>spruitt@norstal.com<br>www.github.com/faradical</p>
</div>

In [1]:
import os

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

## Generating Synthetic Data

In [2]:
import numpy as np
import random as rn

# Ground-truth quadratic that the synthetic data is sampled from
def y_f(x):
    """Evaluate the polynomial 0.5*x**2 + 2*x + 6 at ``x``.

    Only arithmetic operators are applied to ``x``, so it may be a plain
    number or any object supporting them (for example a NumPy array, in
    which case the result is computed element-wise).
    """
    quadratic_term = 0.5 * (x ** 2)
    linear_term = 2 * x
    return quadratic_term + linear_term + 6

# Build the X and y variables, adding integer noise drawn from [-2000, 2000] to y.
# The noise comes from a dedicated, seeded generator so that the data set
# (and therefore every score computed from it) is repeatable across
# kernel restarts instead of changing on each run.
noise_rng = rn.Random(42)
X = np.arange(222)
y = [y_f(x) + noise_rng.randint(-2000, 2000) for x in X]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Creating our Model

In [3]:
# Reshape helper used as the first step of the pipeline
def reshape(X):
    """Return ``X`` as a single-column 2-D array.

    The input is converted with ``np.asarray`` before reshaping, so plain
    Python lists and other array-likes are accepted in addition to NumPy
    arrays (for which the result is the same as ``X.reshape(-1, 1)``).

    Parameters
    ----------
    X : array-like
        Values to arrange as one column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_of_elements, 1)``.
    """
    return np.asarray(X).reshape(-1, 1)

# Assemble the pipeline stage by stage, in execution order:
#   1. Reshape            - apply the reshape() helper to the input
#   2. Scaler             - StandardScaler on the reshaped feature
#   3. PolynomialFeatures - polynomial expansion of the scaled feature
#   4. Linear Regression  - final estimator fitted on the expanded terms
steps = []
steps.append(("Reshape", FunctionTransformer(reshape)))
steps.append(("Scaler", StandardScaler()))
steps.append(("PolynomialFeatures", PolynomialFeatures()))
steps.append(("Linear Regression", LinearRegression()))

p3 = Pipeline(steps=steps)

In [4]:
# Search space for the grid search. Each key is "<step name>__<parameter>",
# where the step name is the label given to that step in the pipeline.
param_grid = {
    # StandardScaler options
    "Scaler__with_mean": [True, False],
    "Scaler__with_std": [True, False],

    # PolynomialFeatures options (degrees 1, 2 and 3)
    "PolynomialFeatures__degree": np.arange(1, 4),
    "PolynomialFeatures__interaction_only": [True, False],
    "PolynomialFeatures__include_bias": [True, False],
    "PolynomialFeatures__order": ["C", "F"],

    # LinearRegression options
    "Linear Regression__copy_X": [True, False],
    "Linear Regression__fit_intercept": [True, False],
}

# Exhaustive search over param_grid using 5-fold cross-validation
grid = GridSearchCV(estimator=p3, param_grid=param_grid, cv=5)

# Run the search on the training split ...
grid.fit(X_train, y_train)

# ... then report the score on the held-out test split (shown as the cell output)
grid.score(X_test, y_test)

0.9613557163269432

## Saving the Model

In [5]:
# Display the pipeline that the grid search selected as its best estimator
grid.best_estimator_

In [6]:
# Keep a handle on the pipeline selected by the grid search
p = grid.best_estimator_

# Show the pipeline's steps keyed by the names given when it was built
p.named_steps

{'Reshape': FunctionTransformer(func=<function reshape at 0x0000018E28A285E0>),
 'Scaler': StandardScaler(with_mean=False),
 'PolynomialFeatures': PolynomialFeatures(degree=3, include_bias=False),
 'Linear Regression': LinearRegression(fit_intercept=False)}

In [7]:
# Pull out only the final regressor. In the pipeline it sits after the
# Reshape, Scaler and PolynomialFeatures steps, so on its own it receives
# none of that preprocessing.
model = p.named_steps['Linear Regression']

### 1) Saving the model using Joblib

In [8]:
from joblib import dump

# Save the model to a file, both next to the notebook and inside MyModule/.
# MyModule/ is created first if it is missing; otherwise the second dump()
# would fail on a fresh checkout because the target directory does not exist.
os.makedirs('MyModule', exist_ok=True)
dump(model, 'model.joblib')
dump(model, 'MyModule/model.joblib')

['MyModule/model.joblib']

### 2) Saving the model using Pickle

In [9]:
import pickle

# Serialise the model with pickle and write the resulting bytes to model.pkl
with open('model.pkl', mode='wb') as model_file:
    model_file.write(pickle.dumps(model))

### 3) Saving the other pipeline steps

In [11]:
# Persist every step of the selected pipeline, not just the regressor.
# Each step is written as <step>.pkl (in the working directory and in
# MyModule/) and as <step>.joblib (working directory only).
#
# NOTE: the "Reshape" step wraps the notebook-level reshape() function.
# pickle stores functions by reference (module + name), so whatever loads
# that file must be able to resolve a function with the same name itself.

# Make sure the target directory exists before writing into it.
os.makedirs('MyModule', exist_ok=True)

for step, estimator in p.named_steps.items():

    print(f"Saving {step}")

    # Serialise once and write the same bytes to both locations.
    pickled_step = pickle.dumps(estimator)
    for pickle_path in (f'{step}.pkl', f'MyModule/{step}.pkl'):
        with open(pickle_path, 'wb') as file:
            file.write(pickled_step)

    dump(estimator, f'{step}.joblib')

Saving Reshape
Saving Scaler
Saving PolynomialFeatures
Saving Linear Regression
