In [53]:
import pandas as pd
import numpy as np
import tarfile
import requests
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import itertools

# Todo
- [x] create a ColumnSelector class
- [x] use simple imputer to fill in na values
- [x] Normalise all numeric columns
- [ ] Build pipeline
- [ ] Run KNN Regressor

In [9]:
HOUSING_URL = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
DATA_PATH = Path("./datasets/housing")
def fetch_housing_data(data_url=HOUSING_URL, file_path=DATA_PATH):
    data = requests.get(data_url).content
    file_path.mkdir(exist_ok=True, parents=True)
    tgz_path = f"{DATA_PATH}/housing.tgz"
    with open(tgz_path, "wb") as tgz_file:
        tgz_file.write(data)    
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=file_path)
    housing_tgz.close()

fetch_housing_data()

In [12]:
def load_housing_data(housing_path=DATA_PATH):
    return pd.read_csv(f"{housing_path}/housing.csv")

housing = load_housing_data()

In [13]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [23]:
X = housing.drop(["median_house_value"], axis=1)
y = housing["median_house_value"]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [33]:
print(
f"""
    X size: {X.size}
    x train: {x_train.size / X.size} ({x_train.size})
    x test: {x_test.size / X.size} ({x_test.size})
"""
)


    X size: 185760
    x train: 0.8 (148608)
    x test: 0.2 (37152)



In [40]:
x_train.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 14196 to 15795
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [39]:
numeric_cols = list(x_train.drop(["ocean_proximity"], axis=1).columns)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']


In [42]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, desired_columns):
        self.desired_columns = desired_columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.desired_columns].values


In [61]:
pipeline = Pipeline([
    ("column_selector", ColumnSelector(numeric_cols)),
    ("imputer", SimpleImputer(strategy="median")),
    ("standard scalers", StandardScaler())
])
x_train_prepared = pipeline.fit_transform(x_train)
x_test_prepared = pipeline.fit_transform(x_test)

In [68]:
def test_knn(desired_columns, weights="uniform", n_neighbors=3):
    pipeline = Pipeline([
        ("column_selector", ColumnSelector(numeric_cols)),
        ("imputer", SimpleImputer(strategy="median")),
        ("standard scalers", StandardScaler())
    ])
    x_train_prepared = pipeline.fit_transform(x_train)
    x_test_prepared = pipeline.fit_transform(x_test)
    
    knn_regressor = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    knn_regressor.fit(x_train_prepared, y_train)
    housing_predictions = knn_regressor.predict(x_test_prepared)
    mse = mean_squared_error(y_test, housing_predictions)
    rmse = np.sqrt(mse)
    return rmse
    

In [58]:
knn_regressor = KNeighborsRegressor(n_neighbors=3)
knn_regressor.fit(x_train_prepared, y_train)

KNeighborsRegressor(n_neighbors=3)

In [62]:
housing_predictions = knn_regressor.predict(x_test_prepared)
mse = mean_squared_error(y_test, housing_predictions)
rmse = np.sqrt(mse)
rmse

64725.78694113039

8
