<a href="https://www.kaggle.com/code/dheerajrhegde/3-some-advanced-regression-options?scriptVersionId=147618356" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os
import pandas as pd
import numpy as np
def load_housing_data():
    csv_path = os.path.join("/kaggle/input/housing-dataset/housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()

In [2]:
test_ratio = 0.2

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
housing['income_category'] = pd.cut(housing['median_income'], bins=[0,3,6,9,12,np.inf], labels=[0,1,2,3,4])
split = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio)
for train_index, test_index in split.split(housing, housing['income_category']):
    housing_train = pd.DataFrame(data = housing.iloc[train_index], columns = housing.columns.tolist())
    housing_test = pd.DataFrame(data = housing.iloc[test_index], columns = housing.columns.tolist())

In [4]:
housing_train.shape, housing_test.shape

((16512, 11), (4128, 11))

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder

#dropping "income_category" since it purpose of splitting data is fulfilled
housing_test.drop('income_category', axis=1, inplace=True)
housing_train.drop('income_category', axis=1, inplace=True)
housing.drop('income_category', axis=1, inplace=True)

housing_train_X = housing_train.drop('median_house_value', axis=1)
housing_train_y = np.array(housing_train['median_house_value']).reshape(housing_train.shape[0],1)

housing_test_X = housing_test.drop('median_house_value', axis=1)
housing_test_y = np.array(housing_test['median_house_value']).reshape(housing_test.shape[0],1)

housing_X = housing.drop('median_house_value', axis=1)
housing_y = np.array(housing['median_house_value']).reshape(housing.shape[0],1)

numerical_data_column_index = [0,1,2,3,4,5,6,7]
categorical_data_column_index = [8]

In [6]:
housing_train_X.shape, housing_test_X.shape

((16512, 9), (4128, 9))

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
class AttributeAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        bedrm_per_room = X[:,4] / X[:,3]
        rm_per_household = X[:,3] / X[:,6]
        return np.c_[X, bedrm_per_room, rm_per_household]

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribute_added', AttributeAdder()),
    ('std_scaler', StandardScaler())
])
full_pipeline = ColumnTransformer([
    ("numeric", numeric_pipeline, numerical_data_column_index),
    ('categorical',OneHotEncoder(), categorical_data_column_index)
])
housing_train_X = full_pipeline.fit_transform(housing_train_X)
housing_test_X = full_pipeline.fit_transform(housing_test_X)
housing_X = full_pipeline.fit_transform(housing_X)

In [8]:
housing_train_X.shape, housing_test_X.shape

((16512, 15), (4128, 15))

In [9]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

abr = AdaBoostRegressor()
param_grid = [
    {'n_estimators':[10,100], 'learning_rate':[1,1.1, 1.2]}
]
grid = GridSearchCV(abr, param_grid=param_grid, cv=5, return_train_score=True)
grid.fit(housing_train_X, housing_train_y.ravel())

In [10]:
print(grid.best_estimator_)
print(grid.best_score_)

AdaBoostRegressor(learning_rate=1, n_estimators=10)
0.5384757861588477


In [11]:
from sklearn.metrics import mean_squared_error
abr_prediction = grid.best_estimator_.predict(housing_train_X)
np.sqrt(mean_squared_error(housing_train_y, abr_prediction))

78541.54946537591

In [12]:
abr_prediction = grid.best_estimator_.predict(housing_test_X)
np.sqrt(mean_squared_error(housing_test_y, abr_prediction))

77901.53882960914

### Since the train set error and test set error are close, this is an issue with bias. We should try
- getting more data 
- Running the data longer

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
param_grid = [
    {'n_estimators':[10,50,100,1000], 'learning_rate':[0.1, 0.001, 0]}
]
gbr = GradientBoostingRegressor()
grid = GridSearchCV(gbr, param_grid=param_grid, cv=5, return_train_score=True)
grid.fit(housing_train_X, housing_train_y.ravel())

In [14]:
from sklearn.metrics import mean_squared_error
gbr_prediction = grid.best_estimator_.predict(housing_train_X)
np.sqrt(mean_squared_error(housing_train_y, gbr_prediction))

37072.666410411264

In [15]:
gbr_prediction = grid.best_estimator_.predict(housing_test_X)
np.sqrt(mean_squared_error(housing_test_y, gbr_prediction))

48952.87478678292

### looks like we have both bias and variance in the model

In [16]:
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor

nn = NearestNeighbors(n_neighbors=5)
nn.fit(housing_train_X, housing_train_y)
nn_pred = nn.kneighbors(housing_test_X, return_distance=False)

In [17]:
knr = KNeighborsRegressor()
knr.fit(housing_train_X, housing_train_y)
knr_pred = knr.predict(housing_train_X)
np.sqrt(mean_squared_error(housing_train_y, knr_pred))

49617.98252128907