In [33]:
# import libraries

import pandas as pd
import numpy as np

In [133]:
# importing data

# define the path to the dataset

path = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

# read the dataset using pandas

dataset = pd.read_csv(path)

In [134]:
# prepare the dataset

subset = dataset[(dataset['ocean_proximity'] == '<1H OCEAN') | (dataset['ocean_proximity'] == 'INLAND')]

subset = subset[['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']]

subset.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
701,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
830,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
859,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
860,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
861,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0


In [36]:
# Question 1 - total bedrooms

subset.isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [37]:
# Question 2 - 1195

subset.population.median()

1195.0

In [38]:
# Prepare and split the dataset

n = len(subset)
n_test = int(0.2 * n)
n_val = int(0.2 * n)
n_train = n - n_test - n_val

print(n, n_test, n_val, n_train)

idx = np.arange(n)
np.random.seed(42)

np.random.shuffle(idx)

subset['median_house_value'] = np.log1p(subset['median_house_value'])

subset_test = subset.iloc[idx[:n_test]].reset_index(drop=True)
subset_val = subset.iloc[idx[n_test:n_test+n_val]].reset_index(drop=True)
subset_train = subset.iloc[idx[n_test+n_val:]].reset_index(drop=True)

15687 3137 3137 9413


In [39]:
# Question 3 - Both are equally good

# fill the missing values with 0 and mean

null_mean = round(subset_train.total_bedrooms.mean(),2)
print(null_mean)

subset_train_mean = subset_train.copy()
subset_test_mean = subset_test.copy()
subset_val_mean = subset_val.copy()

subset_train_mean.total_bedrooms = subset_train_mean.total_bedrooms.fillna(null_mean)
subset_test_mean.total_bedrooms = subset_test_mean.total_bedrooms.fillna(null_mean)
subset_val_mean.total_bedrooms = subset_val_mean.total_bedrooms.fillna(null_mean)

subset_train_zero = subset_train.copy()
subset_test_zero = subset_test.copy()
subset_val_zero = subset_val.copy()

subset_train_zero.total_bedrooms = subset_train_zero.total_bedrooms.fillna(0)
subset_test_zero.total_bedrooms = subset_test_zero.total_bedrooms.fillna(0)
subset_val_zero.total_bedrooms = subset_val_zero.total_bedrooms.fillna(0)

538.3


In [40]:
# function 

def train_linear_regression(X, y):
    
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [41]:
# prepare the data for the model

y_train_zero = subset_train_zero.median_house_value.values
y_test_zero = subset_test_zero.median_house_value.values
y_val_zero = subset_val_zero.median_house_value.values

del subset_train_zero['median_house_value']
del subset_test_zero['median_house_value']
del subset_val_zero['median_house_value']

y_train_mean = subset_train_mean.median_house_value.values
y_test_mean = subset_test_mean.median_house_value.values
y_val_mean = subset_val_mean.median_house_value.values

del subset_train_mean['median_house_value']
del subset_test_mean['median_house_value']
del subset_val_mean['median_house_value']

In [67]:
X_zero = subset_train_zero.values
X_mean = subset_train_mean.values

In [114]:
w0_zero, w_zero = train_linear_regression(X_zero, y_train_zero)
y_pred_zero = w0_zero + X_zero.dot(w_zero)
y_pred_zero

array([11.99595522, 12.00432589, 12.04826463, ..., 11.9961922 ,
       11.99818968, 11.99481275])

In [111]:
w0_mean, w_mean = train_linear_regression(X_mean, y_train_mean)
y_pred_mean = w0_mean + X_mean.dot(w_mean)
y_pred_mean

array([11.57338822, 12.35183315, 12.68877625, ..., 12.05272246,
       11.79948975, 12.47390043])

In [112]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [113]:
rmse(y_train_zero, y_pred_zero)

0.726688873733112

In [107]:
rmse(y_train_mean, y_pred_mean)

0.33610609052725504

In [108]:
# Question 4 - 

# regularize the function

def train_linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [109]:
# try for different r values

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(X_zero, y_train_zero, r)
    y_pred_reg = w0 + X_zero.dot(w)
    print(r, '-',round(rmse(y_train_mean, y_pred_reg),2))

0 - 0.57
1e-06 - 0.57
0.0001 - 0.57
0.001 - 0.57
0.01 - 0.57
0.1 - 0.56
1 - 0.55
5 - 0.55
10 - 0.55


In [126]:
n = len(subset)
n_test = int(0.2 * n)
n_val = int(0.2 * n)
n_train = n - n_test - n_val
idx = np.arange(n)

pred = []

for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    np.random.seed(s)
    np.random.shuffle(idx)

    subset_seed = subset.copy()
    subset_seed.total_bedrooms = subset_seed.total_bedrooms.fillna(0)

    subset_seed['median_house_value'] = np.log1p(subset_seed['median_house_value'])

    subset_seed_test = subset_seed.iloc[idx[:n_test]].reset_index(drop=True)
    subset_seed_val = subset_seed.iloc[idx[n_test:n_test+n_val]].reset_index(drop=True)
    subset_seed_train = subset_seed.iloc[idx[n_test+n_val:]].reset_index(drop=True)

    y_train_seed = subset_seed_train.median_house_value.values
    y_test_seed = subset_seed_test.median_house_value.values
    y_val_seed = subset_seed_val.median_house_value.values

    del subset_seed_train['median_house_value']
    del subset_seed_test['median_house_value']
    del subset_seed_val['median_house_value']

    X_seed = subset_seed_train.values
    X_val_seed = subset_seed_val.values

    w0_seed, w_seed = train_linear_regression(X_seed, y_train_seed)
    y_pred_seed = w0_seed + X_val_seed.dot(w_seed)

    pred.append(rmse(y_val_seed, y_pred_seed))

In [131]:
# find the standard deviation

print(round(np.std(pred),3))

0.006


In [137]:
np.random.seed(9)
np.random.shuffle(idx)

subset_reg = subset.copy()
subset_reg.total_bedrooms = subset_reg.total_bedrooms.fillna(0)

subset_reg['median_house_value'] = np.log1p(subset_reg['median_house_value'])

subset_reg_test = subset_reg.iloc[idx[:n_test]].reset_index(drop=True)
subset_reg_val = subset_reg.iloc[idx[n_test:n_test+n_val]].reset_index(drop=True)
subset_reg_train = subset_reg.iloc[idx[n_test+n_val:]].reset_index(drop=True)

subset_train_val = pd.concat([subset_reg_train, subset_reg_val])

y_train_val = subset_train_val.median_house_value.values
y_test_reg = subset_reg_test.median_house_value.values

del subset_train_val['median_house_value']
del subset_reg_test['median_house_value']

X_reg = subset_train_val.values
X_test_reg = subset_seed_test.values

w0, w = train_linear_regression(X_reg, y_train_val)
y_pred = w0_seed + X_test_reg.dot(w_seed)

print(rmse(y_test_seed, y_pred))

0.3308655361593345
