# Preserving the model


In [None]:
import sys

# Put '../' (the parent directory) at the front of sys.path so that modules
# located there can be imported; index 0 gives it priority over other entries.
sys.path.insert(0, '../')

In [None]:
# Load IPython's autoreload extension and enable mode 2 so that all
# libraries/modules are reloaded automatically (no kernel restart needed
# after editing local code).
%load_ext autoreload
%autoreload 2

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import numpy as np

In [None]:
from sklearn.metrics import get_scorer_names

## Recap: California Housing Data

This is the model you created in the Cross Validation assignment. This section will serve as a brief recap.

### Import Data & Separate Features & Targets

What is the target for the California housing data?

* [The California housing dataset — Scikit-learn course](https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html) 




</br>
<details>
<summary>Solution</summary>

The target contains the median of the house value for each district.

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

```python
print(california.DESCR)
```
</details>
</br>



In [None]:
# Load the California housing dataset. as_frame=True returns pandas objects,
# which the later cells rely on (X.iloc, X.columns, y.iloc).
california = fetch_california_housing(as_frame=True)

# Feature table (X) and target values (y) of the dataset
X, y = california.data, california.target

### Train Test Split

In [None]:
# Hold out 25% of the rows as a test set. A fixed random_state makes the
# shuffle, and therefore the train/test scores computed below, reproducible
# from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42)

### Fit model

In [None]:
# Train a linear regression on the training split only (fit returns the
# estimator itself, so it can be chained onto the constructor).
reg = LinearRegression().fit(X_train, y_train)
reg

### Get Predictions

In [None]:
# Predicted target values for the training split and for the test split
train_predicted, test_predicted = map(reg.predict, (X_train, X_test))

## Scoring Predictions

### What is the default scoring for this model?


Default scoring:

* [LinearRegression — scikit-learn 1.5.1
  documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

</br>
<details>
<summary>Solution</summary>

The default score for the `LinearRegression` model is the coefficient of determination, R^2.

</details>
</br>



In [None]:
# Score the fitted model with its default scorer on both splits
print(f"Training: score {reg.score(X_train, y_train)}")
print(f"Testing : score {reg.score(X_test, y_test)}")

### Calculate RMSE

* [root_mean_squared_error — scikit-learn 1.5.1
  documentation](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.root_mean_squared_error.html)

`root_mean_squared_error` was added in scikit-learn 1.4; the Anaconda environment used for this notebook ships scikit-learn 1.2, so it is not available here.

* [sklearn.metrics.mean_squared_error — scikit-learn 1.2.2 documentation](https://scikit-learn.org/1.2/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn-metrics-mean-squared-error) 

`mean_squared_error` is available in scikit-learn 1.2, so the RMSE is computed below as the square root of the MSE.

In [None]:
# RMSE = square root of the mean squared error, reported for both splits
print(
    'RMSE for training set ',
    np.sqrt(mean_squared_error(y_train, train_predicted)))
print(
    'RMSE for test set     ',
    np.sqrt(mean_squared_error(y_test, test_predicted)))

## Cross validation score

* [cross_val_score — scikit-learn 1.5.1
  documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html) 


Note the number of cross-validation folds (`cv=4` in this case).

In [None]:
# 4-fold cross-validation of the linear model on the full dataset;
# no `scoring` is passed, so the estimator's default scorer is used.
cross_val_score(estimator=reg, X=X, y=y, cv=4)

These scores seem different from the ones computed above. What scorer was used?

* [cross_val_score — scikit-learn 1.5.1
  documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html) 


</br>
<details>
<summary>Solution</summary>


Note that when no `scoring` argument is given, `cross_val_score` uses the estimator's own default scorer (R^2 in this case).

</details>
</br>

In [None]:
# What other scorers are available?
# Prints the names that can be passed as `scoring=` (get_scorer_names was
# imported from sklearn.metrics in an earlier cell).
print(get_scorer_names())

Note that adjusted R-squared is not listed. There's a side quest here if
you would like to explore it.

* [Adjusted R-squared
  exercise](https://jovian.ai/edwardcashmere/sk-learn-regression-adjusted-r2)

In [None]:
# Cross-validated RMSE: the scorer reports *negative* RMSE, so take the
# absolute value to get the error magnitudes per fold.
cvs = np.absolute(
    cross_val_score(
        estimator=reg,
        X=X,
        y=y,
        cv=4,
        scoring='neg_root_mean_squared_error'))

print(f"Cross Validation Results: {cvs}")
print(f"Mean Cross Validation Results: {cvs.mean()}")

## Committing to a model

Once you have committed to a model design, you'll typically train it with
all the data you have available.

In [None]:
# Final model: same estimator type, but trained on every available row
final_model = LinearRegression().fit(X, y)
final_model

## Preserving the model

* [Model persistence | SKLearn User
  Guide](https://scikit-learn.org/stable/model_persistence.html)
* [pickle, joblib and
  cloudpickle](https://scikit-learn.org/stable/model_persistence.html#pickle-joblib-and-cloudpickle) 
* [cloudpipe/cloudpickle: Extended pickling support for Python
  objects](https://github.com/cloudpipe/cloudpickle)
* [pickle — Python object serialization — Python 3.12.4
  documentation](https://docs.python.org/3/library/pickle.html#module-pickle) 
* [Joblib: running Python functions as pipeline jobs — joblib 1.5.dev0
  documentation](https://joblib.readthedocs.io/en/latest/index.html#module-joblib) 


A good practice is to use the library name as the file extension. This
helps users know which library to use to load the file.




In [None]:
# choose one. joblib tends to be more efficient with larger models
import pickle
import joblib
import cloudpickle

### Pickle

In [None]:
# Here you can replace pickle with joblib or cloudpickle
from pathlib import Path

# open(..., "wb") does not create missing parent directories, so make sure
# the output directory exists (a no-op when it is already there).
Path("../models").mkdir(parents=True, exist_ok=True)

# Serialize the fitted model to disk using pickle protocol 5
with open("../models/final_model.pickle", "wb") as f:
    pickle.dump(final_model, f, protocol=5)

In [None]:
# Read the pickled model back from disk (pickle can be swapped for joblib or
# cloudpickle here). Only unpickle files you trust: loading a pickle can
# execute arbitrary code.
with open("../models/final_model.pickle", "rb") as model_file:
    restored_model = pickle.load(model_file)

In [None]:
# Single observation: the list index [[0]] selects the first row while
# keeping it as a one-row DataFrame (2-D), the shape passed to predict below.
X.iloc[[0]]

In [None]:
# Prediction of the in-memory model for the first row, next to its true target
print(final_model.predict(X.iloc[[0]]), y.iloc[0])

In [None]:
# The model restored from the pickle file should reproduce the prediction
# that final_model gave for the same row.
restored_model.predict(X.iloc[[0]])

### Joblib

In [None]:
# joblib
from pathlib import Path

# open(..., "wb") does not create missing parent directories, so make sure
# the output directory exists (a no-op when it is already there).
Path("../models").mkdir(parents=True, exist_ok=True)

# Serialize the fitted model with joblib, using pickle protocol 5
with open("../models/final_model.joblib", "wb") as f:
    joblib.dump(final_model, f, protocol=5)

In [None]:
# Read the model back from the .joblib file. Only load files you trust:
# joblib files are pickle-based and loading one can execute arbitrary code.
with open("../models/final_model.joblib", "rb") as model_file:
    restored_joblib = joblib.load(model_file)

In [None]:
# The joblib-restored model should reproduce the prediction that final_model
# gave for the same row.
restored_joblib.predict(X.iloc[[0]])

### Cloud pickle


In [None]:
from pathlib import Path

# open(..., "wb") does not create missing parent directories, so make sure
# the output directory exists (a no-op when it is already there).
Path("../models").mkdir(parents=True, exist_ok=True)

# Serialize the fitted model with cloudpickle, using pickle protocol 5
with open("../models/final_model.cloudpickle", "wb") as f:
    cloudpickle.dump(final_model, f, protocol=5)

In [None]:
# Read the model back from the .cloudpickle file. Only load files you trust:
# unpickling can execute arbitrary code.
with open("../models/final_model.cloudpickle", "rb") as model_file:
    restored_cloudpickle = cloudpickle.load(model_file)

In [None]:
# The cloudpickle-restored model should reproduce the prediction that
# final_model gave for the same row.
restored_cloudpickle.predict(X.iloc[[0]])

## Getting a single observation for later testing

In [None]:
# Column names of the feature table the model was trained on
X.columns

In [None]:
# Feature names stored on the restored model; compare with X.columns
# (presumably recorded at fit time because X is a DataFrame — confirm).
restored_model.feature_names_in_

In [None]:
# The first observation as raw values (no column labels), handy for copying
# into a later test of the saved model.
X.iloc[[0]].values