# Ridge Regression (Without Pre-Scaling)

#### Load the packages and import the data

In [1]:
import pandas as pd
import numpy as np

# Location of the sample file, relative to the notebook's working directory.
csv_path = "./Data Files/Regression_Sample_File_2.csv"

# Read the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition
0,7129300520,221900.0,3,1.0,1180,5650,1.0,0,3
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,3
2,5631500400,180000.0,2,1.0,770,10000,1.0,0,3
3,2487200875,604000.0,4,3.0,1960,5000,1.0,0,5
4,1954400510,510000.0,3,2.0,1680,8080,1.0,0,3


In [2]:
# List the column labels of the loaded frame (the following cell selects the
# feature columns and the target column from these).
data.columns

Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'condition'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [3]:
# Predictor columns used by the model; 'id' and 'price' are left out.
feature_cols = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'condition',
]

# X: feature DataFrame, y: target Series (sale price).
X = data[feature_cols]
y = data['price']

#### Split the data into a train_set and test_set

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1111)
X_train, X_test, y_train, y_test = split_parts

#### Fit the Base Ridge Regression Model

In [7]:
from sklearn.linear_model import Ridge

# Baseline ridge fit with alpha=1. normalize=True asks the estimator to
# rescale the regressors itself, so X_train is passed in unscaled.
# NOTE(review): the `normalize` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running this cell.
ridge_model = Ridge(alpha=1, normalize=True).fit(X_train, y_train)

# Put the intercept in front of the per-feature coefficients and label the
# rows to match, then display everything as a one-column table.
coef = np.append(ridge_model.intercept_, ridge_model.coef_)
col_names = np.append("Intercept", X.columns)
pd.DataFrame({"Coefficients": coef}, index=col_names)

Unnamed: 0,Coefficients
Intercept,-17404.902417
bedrooms,12794.021541
bathrooms,66843.945718
sqft_living,115.750162
sqft_lot,0.071743
floors,35037.372151
waterfront,622490.948126
condition,23060.044752


#### Predict Base Model on Test Set and Evaluate the model

In [8]:
from sklearn import metrics

# Score the baseline model on the held-out rows.
y_pred = ridge_model.predict(X_test)

# Compute each metric once; RMSE is derived from the stored MSE.
mse = metrics.mean_squared_error(y_test, y_pred)
explained_var = metrics.explained_variance_score(y_test, y_pred)

print("Base Ridge Regression Model", "\n")
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("Explained Variance:", round(explained_var, 4))

Base Ridge Regression Model 

MSE: 60625435413.2
RMSE: 246222.329234
Explained Variance: 0.4634


#### Optimize the Parameters using Grid Search
Re-run the search with an adjusted alpha grid if necessary

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths to compare.
parameters = [{"alpha": [.01, .02, .03, .04]}]

# 10-fold cross-validated search over the alpha grid, starting from the base
# ridge estimator; n_jobs=-1 runs the fits in parallel and verbose=1 logs
# progress.
search_options = {
    "param_grid": parameters,
    "cv": 10,
    "n_jobs": -1,
    "verbose": 1,
}
grid_model = GridSearchCV(ridge_model, **search_options)
grid_model.fit(X_train, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.2s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
   random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'alpha': [0.01, 0.02, 0.03, 0.04]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

#### Re-Run the Model using the "Optimal Parameters"

In [20]:
# best_params_ is a dict keyed by parameter name (here just "alpha"), so
# new_alpha holds the whole dict rather than a bare number; the refit cell
# below indexes it with new_alpha["alpha"].
new_alpha = grid_model.best_params_
print("Best Alpha:", new_alpha)

Best Alpha: {'alpha': 0.02}


In [28]:
# Refit ridge on the training set with the alpha selected by the grid search.
# NOTE(review): the `normalize` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running this cell.
best_alpha_value = new_alpha["alpha"]
new_ridge_model = Ridge(alpha=best_alpha_value, normalize=True).fit(X_train, y_train)

# Intercept first, then one coefficient per feature, shown as a labelled table.
new_coef = np.append(new_ridge_model.intercept_, new_ridge_model.coef_)
col_names = np.append("Intercept", X.columns)
pd.DataFrame({"Coefficients": new_coef}, index=col_names)

Unnamed: 0,Coefficients
Intercept,-104455.3
bedrooms,-53091.73
bathrooms,13510.78
sqft_living,299.9288
sqft_lot,-0.2984534
floors,9109.126
waterfront,1020090.0
condition,46186.57


#### Predict on test data using refined model parameters

In [30]:
from sklearn import metrics

# Score the tuned model on the same held-out rows as the baseline.
y_pred = new_ridge_model.predict(X_test)

# Compute each metric once; RMSE is derived from the stored MSE.
mse = metrics.mean_squared_error(y_test, y_pred)
explained_var = metrics.explained_variance_score(y_test, y_pred)

print("Refined Ridge Regression Model Using Grid Search Parameters", "\n")
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("Explained Variance:", round(explained_var, 4))

Refined Ridge Regression Model Using Grid Search Parameters 

MSE: 57352998485.4
RMSE: 239484.86066
Explained Variance: 0.4931
