### Your name:

<pre> Name</pre>

### Collaborators:

<pre> None</pre>


In [0]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Open the housing data


In [2]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Build full pipeline for the data analysis following the example of the notebook.
 Hint: the main part requested to change is the algorithm used (Lasso regression)

If you want to learn more about the Lasso regression, see resources below:
- http://scikit-learn.org/stable/modules/linear_model.html#lasso
- https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/

#### Considerations for building pipeline:

- Split data into training and testing sets below.
- Convert all categorical data to one-hot vectors below
- Normalize all non-categorical data 
-  Perform Lasso-based regression using a variety of values for $\alpha$ between 0 and 1 via a grid search where  *housing_labels* is the output and all other features are the input (similar to as seen in lecture two.)

In [0]:
from sklearn.linear_model import Lasso

# Write your code here:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]



In [0]:
# train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

# Divide by 1.5 to limit the number of income categories
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Label those above 5 as 5
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

housing_test = strat_test_set.drop("median_house_value", axis=1) # drop labels for training set
housing_test_labels = strat_test_set["median_house_value"].copy()

In [5]:
housing_num = housing.drop('ocean_proximity', axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared.shape)

housing_test_prepared = full_pipeline.transform(housing_test)
print(housing_test_prepared.shape)

(16512, 19)
(4128, 19)


In [6]:
housing_prepared[:2]

array([[-1.15604281,  0.77194962,  0.74333089, -0.49323393, -0.44543821,
        -0.63621141, -0.42069842, -0.61493744, -0.31205452,  0.19380531,
        -0.08649871, -0.31205452, -0.08649871,  0.15531753,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , -0.90896655, -1.0369278 ,
        -0.99833135, -1.02222705,  1.33645936,  0.21768338, -0.94074539,
        -0.03353391,  0.21768338, -0.03353391, -0.83628902,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ]])

## Linear Model

In [7]:
# Linear Model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [209420.50610494 315409.32621299 210124.77314125  55983.75406116
 183462.63421725]


In [9]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68147.95744947501

In [10]:
from sklearn.metrics import mean_squared_error

housing_test_predictions = lin_reg.predict(housing_test_prepared)
lin_mse = mean_squared_error(housing_test_labels, housing_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

66809.09105899581

In [11]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Test alpha 0 - 1 (inclusive) with 0.1 step difference
alphas = np.arange(0.0, 1.1, 0.1)

param_grid = [
    {'alpha':alphas}
]

lasso = Lasso()

grid_search = GridSearchCV(lasso, param_grid, scoring = "neg_mean_squared_error")
grid_search.fit(housing_prepared, housing_labels)

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'alpha': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [0]:
grid_search.best_estimator_

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

<pre> Alpha = 1.0 was chosen as the best parameter after conducting a grid search</pre>

In [0]:
# Lasso
from sklearn.linear_model import Lasso

lin_lasso = Lasso(alpha=1.0, max_iter=5000 )
lin_lasso.fit(housing_prepared, housing_labels)
print(lin_lasso.coef_)
print(lin_lasso.n_iter_)   # Number of iterations performed within

[-5.61106676e+04 -5.67077319e+04  1.39708401e+04  7.32258879e+03
  2.20712558e+03 -4.59252628e+04  4.14547980e+04  7.83379221e+04
  1.81891909e+00  1.91014479e+04  8.70819857e+02  7.14695595e+03
  2.19807602e+01 -2.81829596e+03  0.00000000e+00 -3.41823692e+04
  1.14835970e+05 -4.71655249e+03  3.35911897e+03]
2671


In [0]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_lasso.predict(housing_prepared)
lin_lasso_mse = mean_squared_error(housing_labels, housing_predictions)
lin_lasso_rmse = np.sqrt(lin_lasso_mse)
lin_lasso_rmse

68148.01867810891

In [0]:
housing_test_predictions = lin_lasso.predict(housing_test_prepared)
lin_lasso_mse = mean_squared_error(housing_test_labels, housing_test_predictions)
lin_lasso_rmse = np.sqrt(lin_lasso_mse)
lin_lasso_rmse

66812.89771344795


Why is it necessary to normalize all continuous variables before performing Lasso? (OPTIONAL)

<pre>The main feature of Lasso regression is that it is based on the relative importance of each feature. This is only possible when all the features passed into the model are of the same scale, which is why normalization is necessary. For example, if normalization was not done, the house price which may be in millions will always be more important than number of rooms which is in single digits, simply due to the size of the number. </pre>

### Conclusions
For what values of $\alpha$ does Lasso perform best? Does it perform as well on the housing data as the linear regressor from the lectures? Why do you think this is?

### Read appending B

- Reflect on your last data project, read appendix B. Then, write down a few of the checklist items that your last data project could have used. If you have not yet done a data project, then write down a few of the items that you found most interesting.


<pre>Each student will have a different experience </pre>