This note is extended from the midterm practice notebook with a focus on <b><i>transformation pipeline</i> </b>

It is still to train a linear regressor. We will use Ridge regression with Stochastic Gradient Descent (class SGDRegressor)

The dataset is the housing mentioned in chapter 2.

Here are steps we need to do

1. download the raw dataset

2. create train, test sets. We will use cross-validation for the purpose of the dev set.

3. prepare data for training: <b>all the steps in this stage are organized in a pipeline</b>

    + handle missing values, using median for numerical features, the most frequent category for categorical features
        
    + transform text -> one-hot vectors
    
    + scale all features, using StandardScaler


4. train a model:
    
    + model choice: SGDRegressor
    
    + using grid search for hyperparameter tuning: learning rate and regularization (L2 norm) coefficent.
    
5. evaluate the final model, using RMSE metric


In [39]:
#helping functions
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import IsolationForest

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor

from sklearn import set_config

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error
import numpy as np
import sys
import io
import matplotlib.pyplot as plt

from scipy import stats

In [55]:
def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    
    return pd.read_csv(Path("datasets/housing/housing.csv"))

def train_test_split(X, y, test_ratio = 0.2):
    total_size = len(X)
    print(total_size)

    test_size = int(total_size * test_ratio)
    
    
    train_size = total_size - test_size

    np.random.seed(42)
    rnd_indices = np.random.permutation(total_size)

    X_train = X.iloc[rnd_indices[:train_size]]
    y_train = y.iloc[rnd_indices[:train_size]]
    
    X_test = X.iloc[rnd_indices[train_size:]]
    y_test = y.iloc[rnd_indices[train_size:]]

    return X_train, X_test, y_train, y_test
      

In [79]:

    #1 load data
    
    housing = load_housing_data() # housing is a dataframe
    housing_X = housing.drop("median_house_value", axis=1)
    housing_y = housing["median_house_value"].copy()
    
    #5--- Attempting Bonus Here 
    housing = housing.assign(bedrooms_ratio = housing.total_bedrooms/housing.total_rooms)
    housing = housing.assign(rooms_per_house = housing.total_rooms/housing.households)
    housing = housing.assign(people_per_house = housing.population / housing.households)
    
    #2 split train, test sets
    
    housing_Xtrain, housing_Xtest, housing_ytrain, housing_ytest = train_test_split(housing_X, housing_y, test_ratio = 0.2)
    
    #---Outlier Removal 
    df = pd.concat([housing_Xtrain,housing_ytrain], axis=1) #join training set back together
    newdf = df.select_dtypes(include=np.number) #drop non-numerical column
    newdf = newdf.drop("total_bedrooms", axis='columns') #dropped bedrooms because for some reason Zscaler was throwing errors 
    
    newdf = newdf[(np.abs(stats.zscore(newdf)) < 3).all(axis=1)]
    #the above first finds z values of all data points, then drops the ones more than 3 std devs
    
    df = df[df.index.isin(newdf.index)]
    housing_X_train = df.drop("median_house_value", axis=1)
    housing_y_train = df["median_house_value"].copy() 
    
    #3 prepare data for training
    
    #---using transformation pipeline instead of the manual way we did in the midterm practice
    #---handle missing values
    #---StandardScaler for numerical values
    #---transform categorical feature -> one-hot vector
    
    num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms",\
                  "population", "households", "median_income"]
    
    cat_attribs = ["ocean_proximity"]
    
    num_pipeline = make_pipeline(SimpleImputer(strategy = 'median'),\
                                 StandardScaler())
    
    cat_pipeline = make_pipeline(SimpleImputer(strategy = "most_frequent"),\
                                 OneHotEncoder(handle_unknown='ignore'))

    preprocessing = ColumnTransformer([("num", num_pipeline, num_attribs),\
                                       ("cat", cat_pipeline, cat_attribs)], )
    
    
    #4 train
    
    m = len(housing_Xtrain)
    
    param_grid = {'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],  
                  'eta0': [1, 0.1, 0.01, 0.001, 0.0001], 
                  'penalty':['l2'],
                  'random_state': [42],
                  'max_iter':[1000]}  

    grid = GridSearchCV(SGDRegressor(), param_grid, refit = True, verbose = 1,n_jobs=-1, cv = 3) 
    
    full_pipeline = Pipeline([("preprocessing", preprocessing),("grid_search", grid)])


    print("Training ...")

    full_pipeline.fit(housing_Xtrain, housing_ytrain)
    
    # print best parameter after tuning 
    
    print("Training done, best hyperparams: ")
    print(grid.best_params_)
    
    # evaluate the final model
    
    final_predictions = full_pipeline.predict(housing_Xtest)

    final_rmse = mean_squared_error(housing_ytest, final_predictions, squared=False)
    print("test RMSE: ", final_rmse) # prints
    

20640
(15799, 10)
Training ...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Training done, best hyperparams: 
{'alpha': 0.001, 'eta0': 0.001, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
test RMSE:  68927.76074212731
