# Ridge Regression

#### Load the packages, import the data, and split the data into an X dataframe and y vector

In [90]:
import pandas as pd
import numpy as np

# Read the sample housing data into a DataFrame and preview its first rows.
data_path = "./Data Files/Regression_Sample_File_2.csv"
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition
0,7129300520,221900.0,3,1.0,1180,5650,1.0,0,3
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,3
2,5631500400,180000.0,2,1.0,770,10000,1.0,0,3
3,2487200875,604000.0,4,3.0,1960,5000,1.0,0,5
4,1954400510,510000.0,3,2.0,1680,8080,1.0,0,3


In [91]:
# Show the column labels of the loaded frame.
data.columns

Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'condition'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [92]:
# Predictors: every column of `data` except `id` and the target `price`.
feature_columns = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "condition",
]
X = data[feature_columns]

# Target vector: the value the regression is fitted to predict.
y = data["price"]

#### Split the data into a train_set and test_set

In [78]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible from one run to the next.
TEST_FRACTION = 0.2
SPLIT_SEED = 1111
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

#### Scale X

In [79]:
# Standardise each numeric feature to zero mean / unit variance using statistics
# computed on the TRAINING split only; the same training mean and std are then
# applied to the test split so that nothing is learned from the test rows.
# Note: pandas' Series.std() is the sample standard deviation (ddof=1).
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# select_dtypes("number") covers every integer/float width (not only the
# literal "int64"/"float64"), so numeric columns are never silently skipped.
# Non-numeric columns are left untouched in the copies.
for col in X_train.select_dtypes(include="number").columns:
    col_mean = X_train[col].mean()
    col_std = X_train[col].std()

    # A feature that is constant in the training split has std == 0 (NaN when
    # there is a single row). Dividing by it would put inf/NaN into the design
    # matrix and make the model fit fail, so fall back to 1: the column is then
    # only centred.
    if not col_std > 0:
        col_std = 1.0

    X_train_scaled[col] = (X_train[col] - col_mean) / col_std
    X_test_scaled[col] = (X_test[col] - col_mean) / col_std

#### Fit the Base Ridge Regression Model

In [80]:
from sklearn.linear_model import Ridge
# Baseline model: ridge regression with the penalty strength fixed at alpha=1,
# fitted on the standardised training features.
ridge_model = Ridge(alpha=1).fit(X_train_scaled, y_train)

# Assemble one table with the intercept first, followed by one coefficient per
# feature in the column order of X.
col_names = np.hstack(("Intercept", X.columns))
coef = np.hstack((ridge_model.intercept_, ridge_model.coef_))
pd.DataFrame({"Coefficients": coef}, index=col_names)

Unnamed: 0,Coefficients
Intercept,539075.718699
bedrooms,-52868.551874
bathrooms,2235.740919
sqft_living,288771.127729
sqft_lot,-14579.48406
floors,4089.664439
waterfront,92262.291945
condition,31003.049814


#### Predict Base Model on Test Set and Evaluate the Model

In [81]:
from sklearn import metrics

# Score the baseline model on the held-out test split.
y_pred = ridge_model.predict(X_test_scaled)

# Compute each metric once; RMSE is derived from the stored MSE.
base_mse = metrics.mean_squared_error(y_test, y_pred)
base_explained_variance = metrics.explained_variance_score(y_test, y_pred)

print("Base Ridge Regression Model", "\n")
print("MSE:", base_mse)
print("RMSE:", np.sqrt(base_mse))
print("Explained Variance:", round(base_explained_variance, 4))

Base Ridge Regression Model 

MSE: 57985055863.0
RMSE: 240800.863501
Explained Variance: 0.4876


#### Optimize the Parameters using Grid Search
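Iterate multiple times if necessary: begin with a coarse grid of `alpha` values, then narrow the grid around the best value found and re-run the search.

Note: the saved output below (`param_grid=[{'alpha': [84]}]`, 1 candidate) appears to come from a later, narrowed iteration and does not match the grid shown in the code cell; re-run the cell to refresh it.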

In [85]:
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths, spaced by powers of ten.
alpha_grid = [0.01, 0.1, 1, 10, 100]
parameters = [{"alpha": alpha_grid}]

# 10-fold cross-validated search over the grid, run on all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# default score.
grid_model = GridSearchCV(
    estimator=ridge_model,
    param_grid=parameters,
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_model.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=-1, param_grid=[{'alpha': [84]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

#### Re-Run the Model using the "Optimal Parameters"

In [86]:
# best_params_ is a dict keyed by parameter name (here only "alpha"); the next
# cell reads the value with new_alpha["alpha"].
new_alpha = grid_model.best_params_
print("Best Alpha:", new_alpha)

Best Alpha: {'alpha': 84}


In [87]:
# Refit ridge regression on the full training split with the alpha chosen by
# the grid search.
best_alpha = new_alpha["alpha"]
new_ridge_model = Ridge(alpha=best_alpha).fit(X_train_scaled, y_train)

# Coefficient table for the refined model: intercept first, then one entry per
# feature in the column order of X.
col_names = np.hstack(("Intercept", X.columns))
new_coef = np.hstack((new_ridge_model.intercept_, new_ridge_model.coef_))
pd.DataFrame({"Coefficients": new_coef}, index=col_names)

Unnamed: 0,Coefficients
Intercept,539075.718699
bedrooms,-48588.260394
bathrooms,9166.549198
sqft_living,276073.289684
sqft_lot,-13197.526733
floors,4565.073308
waterfront,91715.099499
condition,30643.267046


#### Predict on test data using refined model parameters

In [89]:
from sklearn import metrics

# Score the refined model on the same held-out test split as the baseline.
y_pred = new_ridge_model.predict(X_test_scaled)

# Compute each metric once; RMSE is derived from the stored MSE.
refined_mse = metrics.mean_squared_error(y_test, y_pred)
refined_explained_variance = metrics.explained_variance_score(y_test, y_pred)

print("Refined Ridge Regression Model Using Grid Search Parameters", "\n")
print("MSE:", refined_mse)
print("RMSE:", np.sqrt(refined_mse))
print("Explained Variance:", round(refined_explained_variance, 4))

Refined Ridge Regression Model Using Grid Search Parameters 

MSE: 57434000030.7
RMSE: 239653.917203
Explained Variance: 0.4924
