# Lasso Regression

#### Load the packages, import the data, and split the data into an X dataframe and y vector

In [43]:
import pandas as pd
import numpy as np

data = pd.read_csv("./Data Files/Regression_Sample_File_2.csv")
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition
0,7129300520,221900.0,3,1.0,1180,5650,1.0,0,3
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,3
2,5631500400,180000.0,2,1.0,770,10000,1.0,0,3
3,2487200875,604000.0,4,3.0,1960,5000,1.0,0,5
4,1954400510,510000.0,3,2.0,1680,8080,1.0,0,3


In [44]:
data.columns

Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'condition'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [46]:
X = data[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition']]
y = data['price']

#### Split the data into a train_set and test_set

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1111)

#### Scale X

In [48]:
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
for i in X_train.columns:
    if X_train[i].dtypes in ["float64", "int64"]:
        mean_X_train = X_train[i].mean()
        std_X_train = X_train[i].std()
        X_train_scaled[i] = (X_train[i] - mean_X_train) / std_X_train
        X_test_scaled[i] = (X_test[i] - mean_X_train) / std_X_train

#### Fit the Base Lasso Regression Model

In [49]:
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=1)
lasso_model.fit(X_train_scaled, y_train)

coef = np.append(lasso_model.intercept_, lasso_model.coef_)
col_names = np.append("Intercept", X.columns)
pd.DataFrame(coef, col_names, columns=["Coefficients"])

Unnamed: 0,Coefficients
Intercept,539075.718699
bedrooms,-52603.921464
sqft_living,290192.71262
sqft_lot,-14633.224095
floors,4573.942441
waterfront,92286.162078
condition,30966.687451


#### Predict Base Model on Test Set and Evalute the model

In [50]:
y_pred = lasso_model.predict(X_test_scaled)

from sklearn import metrics
print("Base Lasso Regression Model", "\n")
print("MSE:", metrics.mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("Explained Variance:", round(metrics.explained_variance_score(y_test, y_pred), 4))

Base Lasso Regression Model 

MSE: 57989761493.8
RMSE: 240810.634096
Explained Variance: 0.4875


#### Optimize the Parameters using Grid Search
Iterate multiple times if necessary

In [53]:
from sklearn.model_selection import GridSearchCV

parameters = [{"alpha": [.01, .1, 1, 10, 100]}]
grid_model = GridSearchCV(lasso_model, param_grid = parameters, 
                          cv = 10, n_jobs = -1, verbose = 1)
grid_model.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'alpha': [1000, 1500, 2000]}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=1)

#### Re-Run the Model using the "Optimal Parameters"

In [54]:
new_alpha = grid_model.best_params_
print("Best Alpha:", new_alpha)

Best Alpha: {'alpha': 1500}


In [55]:
new_lasso_model = Lasso(alpha=new_alpha["alpha"])
new_lasso_model.fit(X_train_scaled, y_train)

new_coef = np.append(new_lasso_model.intercept_, new_lasso_model.coef_)
col_names = np.append("Intercept", X.columns)
pd.DataFrame(new_coef, col_names, columns=["Coefficients"])

Unnamed: 0,Coefficients
Intercept,539075.718699
bedrooms,-48953.027349
sqft_living,286950.096553
sqft_lot,-12778.460339
floors,3172.830623
waterfront,91271.002999
condition,29016.619817


#### Predict on test data using refined model parameters

In [56]:
y_pred = new_lasso_model.predict(X_test_scaled)
print("Refined Lasso Regression Model Using Grid Search Parameters", "\n")
print("MSE:", metrics.mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("Explained Variance:", round(metrics.explained_variance_score(y_test, y_pred), 4))

Refined Lasso Regression Model Using Grid Search Parameters 

MSE: 57676538049.3
RMSE: 240159.401334
Explained Variance: 0.4903
