# Ridge Regression

In [12]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

# data preprocessing steps
df = pd.read_csv("housing.csv")
# if 'total_bedrooms' None equals to zero
for i in df[df.total_bedrooms.isnull()].index:
    df.iloc[i,4]=0
# OneHotEncoding
df = pd.get_dummies(df,columns=["ocean_proximity"])
X = df.drop("median_house_value", axis=1)
y = df[["median_house_value"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=0)


In [16]:
# Pred
r_first = Ridge().fit(X_train, y_train)
mean_squared_error(y_test, r_first.predict(X_test)) ** .5

68816.45668468832

In [13]:
# Model Tuning

In [17]:
alphas = 10**np.linspace(10,-2,50)
alphas

array([1.00000000e+10, 5.68986603e+09, 3.23745754e+09, 1.84206997e+09,
       1.04811313e+09, 5.96362332e+08, 3.39322177e+08, 1.93069773e+08,
       1.09854114e+08, 6.25055193e+07, 3.55648031e+07, 2.02358965e+07,
       1.15139540e+07, 6.55128557e+06, 3.72759372e+06, 2.12095089e+06,
       1.20679264e+06, 6.86648845e+05, 3.90693994e+05, 2.22299648e+05,
       1.26485522e+05, 7.19685673e+04, 4.09491506e+04, 2.32995181e+04,
       1.32571137e+04, 7.54312006e+03, 4.29193426e+03, 2.44205309e+03,
       1.38949549e+03, 7.90604321e+02, 4.49843267e+02, 2.55954792e+02,
       1.45634848e+02, 8.28642773e+01, 4.71486636e+01, 2.68269580e+01,
       1.52641797e+01, 8.68511374e+00, 4.94171336e+00, 2.81176870e+00,
       1.59985872e+00, 9.10298178e-01, 5.17947468e-01, 2.94705170e-01,
       1.67683294e-01, 9.54095476e-02, 5.42867544e-02, 3.08884360e-02,
       1.75751062e-02, 1.00000000e-02])

In [18]:
r_cv = RidgeCV(alphas=alphas, scoring="neg_mean_squared_error", 
               cv=10, normalize=True)
r_cv.fit(X_train, y_train)

RidgeCV(alphas=array([1.00000000e+10, 5.68986603e+09, 3.23745754e+09, 1.84206997e+09,
       1.04811313e+09, 5.96362332e+08, 3.39322177e+08, 1.93069773e+08,
       1.09854114e+08, 6.25055193e+07, 3.55648031e+07, 2.02358965e+07,
       1.15139540e+07, 6.55128557e+06, 3.72759372e+06, 2.12095089e+06,
       1.20679264e+06, 6.86648845e+05, 3.90693994e+05, 2.22299648e+05,
       1.26485522e+05, 7.19685673e+0...
       1.38949549e+03, 7.90604321e+02, 4.49843267e+02, 2.55954792e+02,
       1.45634848e+02, 8.28642773e+01, 4.71486636e+01, 2.68269580e+01,
       1.52641797e+01, 8.68511374e+00, 4.94171336e+00, 2.81176870e+00,
       1.59985872e+00, 9.10298178e-01, 5.17947468e-01, 2.94705170e-01,
       1.67683294e-01, 9.54095476e-02, 5.42867544e-02, 3.08884360e-02,
       1.75751062e-02, 1.00000000e-02]),
        cv=10, normalize=True, scoring='neg_mean_squared_error')

In [19]:
r_cv.alpha_

0.01

In [20]:
r_reg = Ridge(alpha=0.01)
r_reg.fit(X_train, y_train)
y_pred_Ridge = r_reg.predict(X_test)

In [23]:
# Optimized RMSE SCORE
mean_squared_error(y_test, y_pred_Ridge) ** .5

68814.62223481952