## **Ridge regression**

Import useful libraries

In [1]:
from pathlib import Path

import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

from src.utils import regression_metrics


In [2]:
import warnings
warnings.filterwarnings(action="ignore")

### **3. Preparing the Data**

In [3]:
# Restore the variable `df` from IPython's %store persistence.
# NOTE(review): `df` is presumably saved with `%store df` by an earlier notebook
# in this project; this cell fails on a machine where that has not been run.
%store -r df

In [4]:
# Preview the first five rows of the restored frame.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


**Identify predictors and response variable**

In [5]:
# Columns of `df` used as model inputs; 'mpg' is the response and is excluded.
predictors_list = [
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
    'origin',
]

In [6]:
# Extract NumPy arrays: X holds the predictor columns, y holds 'mpg'.
# Selecting with a one-element list keeps y two-dimensional (one column).
target_columns = ['mpg']
X, y = df[predictors_list].values, df[target_columns].values

**Split the data into a training set and a test set**

In [7]:
# Hold out 15% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [8]:
# Sanity check: (rows, columns) of the training and test predictor matrices.
(X_train.shape, X_test.shape)

((333, 7), (59, 7))

### **4. Training the Model**

Fit a baseline Ridge regression pipeline (standardization → degree-1 polynomial features → Ridge) on the training data

In [9]:
# Baseline model: standardize the predictors, expand them with degree-1
# polynomial features, then fit a Ridge regressor with its default settings
# (only the random seed is fixed).
ridge_regression = Pipeline([
    ("scaler", StandardScaler()),
    ("polynomial", PolynomialFeatures(degree=1)),
    ("regressor", Ridge(random_state=42)),
])

# Fit on the training split only; the fitted pipeline is the cell's output.
ridge_regression.fit(X=X_train, y=y_train)

In [10]:
# Predict the response for the held-out test predictors with the baseline pipeline.
y_pred = ridge_regression.predict(X=X_test)

### **5. Evaluating the Model**

**Computation of regression metrics**

In [11]:
# Report test-set error metrics for the baseline pipeline.
# `regression_metrics` is a project helper imported from src.utils; its recorded
# output lists MAE, median AE, MSE, RMSE and R2.
regression_metrics(y_true=y_test, y_pred=y_pred)

Mean absolute error: 2.24
Median absolute error: 1.98
Mean squared error: 8.73
Root mean squared error: 2.96
R2 score: 0.85


---

### **6. Parameter Tuning**

In [12]:
# Shuffled 5-fold splitter with a fixed seed, shared by the baseline
# cross-validation here and by the hyper-parameter search further down.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# R2 of the baseline pipeline on each fold of the training split
# (n_jobs=-1 runs the folds in parallel on all available cores).
ridge_regression_cv_scores = cross_val_score(
    estimator=ridge_regression,
    X=X_train,
    y=y_train,
    scoring="r2",
    cv=kfold,
    n_jobs=-1,
)

In [13]:
# The fold scores were computed with scoring="r2", so report them as R2.
# "Accuracy" is a classification metric and would mislabel this number.
print(f"Mean CV R2: {ridge_regression_cv_scores.mean():.2f} +/- {ridge_regression_cv_scores.std():.2f}")

Mean CV accuracy: 0.80 +/- 0.03


In [14]:
# List every tunable parameter of the pipeline, including the nested
# "<step>__<parameter>" names that the search space below refers to.
ridge_regression.get_params(deep=True)

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('polynomial', PolynomialFeatures(degree=1)),
  ('regressor', Ridge(random_state=42))],
 'verbose': False,
 'scaler': StandardScaler(),
 'polynomial': PolynomialFeatures(degree=1),
 'regressor': Ridge(random_state=42),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'polynomial__degree': 1,
 'polynomial__include_bias': True,
 'polynomial__interaction_only': False,
 'polynomial__order': 'C',
 'regressor__alpha': 1.0,
 'regressor__copy_X': True,
 'regressor__fit_intercept': True,
 'regressor__max_iter': None,
 'regressor__positive': False,
 'regressor__random_state': 42,
 'regressor__solver': 'auto',
 'regressor__tol': 0.0001}

In [15]:
# Candidate hyper-parameter values, keyed as "<pipeline step>__<parameter>".
# NOTE(review): the grid includes alpha=0 and degree=0; confirm that these
# degenerate settings are intended to be part of the search.
params = dict(
    regressor__alpha=list(np.arange(0, 1000)),
    polynomial__interaction_only=[False, True],
    polynomial__degree=list(np.arange(0, 10)),
)

# Randomly evaluate 500 candidate combinations with the shared K-fold splitter,
# ranking them by R2; the seed makes the sampled candidates repeatable.
rs = RandomizedSearchCV(
    estimator=ridge_regression,
    param_distributions=params,
    n_iter=500,
    scoring="r2",
    cv=kfold,
    n_jobs=-1,
    random_state=42,
)
rs.fit(X=X_train, y=y_train)

In [16]:
# Mean cross-validated score (R2, per scoring="r2") of the best candidate found by the search.
rs.best_score_

0.8612602280895872

In [17]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
rs.best_params_

{'regressor__alpha': 7,
 'polynomial__interaction_only': False,
 'polynomial__degree': 2}

In [18]:
# RandomizedSearchCV refits the winning configuration on the full data passed to
# rs.fit (refit=True is the default and was not overridden above). The tuned,
# already-fitted pipeline is therefore taken straight from the search object.
# Re-declaring each step by hand would duplicate the pipeline definition, could
# silently drift from what was actually searched, and would fit a second time.
best_pipeline = rs.best_estimator_
best_pipeline

In [19]:
# Predict the response for the held-out test predictors with the tuned pipeline.
y_best_pred = best_pipeline.predict(X=X_test)

In [20]:
# Report test-set error metrics for the tuned pipeline, for comparison with
# the baseline metrics computed earlier in the notebook.
regression_metrics(y_true=y_test, y_pred=y_best_pred)

Mean absolute error: 1.71
Median absolute error: 1.2
Mean squared error: 5.2
Root mean squared error: 2.28
R2 score: 0.91


### **7. Model deployment**

Model serialization


In [22]:
# Persist the tuned pipeline to disk. joblib.dump does not create missing
# directories, so make sure the target folder exists first; otherwise a fresh
# checkout without a `models/` directory fails with FileNotFoundError.
model_filename = "models/autoMPG_best_pipeline.joblib"
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(value=best_pipeline, filename=model_filename)

['models/autoMPG_best_pipeline.joblib']