In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-09-18 14:05:18--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2022-09-18 14:05:20 (1.18 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [2]:
import pandas as pd
# Load housing.csv from the current working directory into a DataFrame.
data = pd.read_csv("housing.csv")

In [3]:
# Preview the first rows to sanity-check the parsed columns and values.
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# List the column labels of the loaded frame.
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [6]:
# Keep only the numeric columns used for modelling (ocean_proximity is left
# out); the target, median_house_value, is kept as the last column.
data = data.loc[:, [
    'latitude', 'longitude', 'housing_median_age',
    'total_rooms', 'total_bedrooms', 'population',
    'households', 'median_income', 'median_house_value',
]]

In [7]:
# Number of missing values in each column.
data.isnull().sum(axis=0)

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [8]:
# Median of the population column.
data.loc[:, 'population'].median()

1166.0

In [9]:
import numpy as np

# Reproducible permutation of the row positions (seed 42).
np.random.seed(42)
idx = np.arange(len(data))
np.random.shuffle(idx)

# 20% validation, 20% test; train takes whatever remains after rounding.
val = int(0.2 * len(data))
test = int(0.2 * len(data))
train = len(data) - (val + test)

# Carve the shuffled positions into three consecutive slices.
data_train = data.iloc[idx[:train]]
data_val = data.iloc[idx[train:train + val]]
data_test = data.iloc[idx[train + val:]]

# Targets are modelled on the log1p scale.
y_train = np.log1p(data_train['median_house_value'].values)
y_val = np.log1p(data_val['median_house_value'].values)
y_test = np.log1p(data_test['median_house_value'].values)

# Remove the target so it cannot end up in the feature matrices.
data_train = data_train.drop(columns='median_house_value')
data_val = data_val.drop(columns='median_house_value')
data_test = data_test.drop(columns='median_house_value')

In [10]:
def lin_reg(X, y, r = 0.001):
    """Fit a regularised linear regression via the normal equations.

    A column of ones is prepended to ``X`` so the first coefficient acts as
    the bias. ``r`` is added to every diagonal entry of ``X^T X``, which
    means the bias term is regularised together with the feature weights.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample.
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.001
        Regularisation strength added to the diagonal of ``X^T X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias and ``w`` holds the remaining
        coefficients, one per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the (regularised) Gram matrix is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    # Build a fresh regularised Gram matrix instead of mutating it in place.
    XTX = X.T.dot(X) + r * np.eye(X.shape[1])
    XTy = X.T.dot(y)

    # Solving the linear system avoids forming an explicit inverse, which is
    # both slower and less accurate when XTX is ill-conditioned.
    w = np.linalg.solve(XTX, XTy)

    return w[0], w[1:]

In [11]:
def rmse(y_pred, y):
    """Return the root mean squared error between ``y_pred`` and ``y``.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays of the same shape).
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [13]:
def find_X(data, missing_value=0):
    """Turn a DataFrame into a feature matrix with missing values filled.

    Every NaN in *any* column is replaced by ``missing_value``. The input
    frame is left untouched: ``fillna`` already returns a new object, so no
    extra defensive copy is needed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns.
    missing_value : scalar, default 0
        Value substituted for each missing entry.

    Returns
    -------
    numpy.ndarray
        The filled values of ``data``.
    """
    return data.fillna(missing_value).values

In [14]:
# With 0
# Fit without regularisation on zero-imputed training features.
X_train = find_X(data_train, missing_value=0)
w0, w = lin_reg(X_train, y_train, r=0)

# Score on the validation split, imputed the same way.
X_val = find_X(data_val, missing_value=0)
y_val_pred = X_val.dot(w) + w0
rmse_score = round(rmse(y_val_pred, y_val), 2)
rmse_score

0.33

In [15]:
# With Mean
# Take the imputation value from the training split only and reuse that same
# number for validation, so no validation statistics leak into the fit.
bedrooms_mean = data_train['total_bedrooms'].mean()

X_train = find_X(data_train, missing_value=bedrooms_mean)
w0, w = lin_reg(X_train, y_train, r=0)

X_val = find_X(data_val, missing_value=bedrooms_mean)
y_val_pred = w0 + X_val.dot(w)
rmse_score = round(rmse(y_val_pred, y_val), 2)
rmse_score

0.33

In [16]:
r_val = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# The feature matrices do not depend on r, so build them once instead of on
# every iteration.
X_train = find_X(data_train, missing_value=0)
X_val = find_X(data_val, missing_value=0)

# Refit for each regularisation strength and report the validation RMSE.
for r in r_val:
    w0, w = lin_reg(X_train, y_train, r=r)

    y_val_pred = w0 + X_val.dot(w)
    rmse_score = round(rmse(y_val_pred, y_val), 2)
    print(f"for r = {r} the rmse = {rmse_score}")

for r = 0 the rmse = 0.33
for r = 1e-06 the rmse = 0.33
for r = 0.0001 the rmse = 0.33
for r = 0.001 the rmse = 0.33
for r = 0.01 the rmse = 0.33
for r = 0.1 the rmse = 0.33
for r = 1 the rmse = 0.33
for r = 5 the rmse = 0.34
for r = 10 the rmse = 0.34


In [21]:
rmse_fres = []

# Split sizes depend only on len(data), so compute them once up front.
val = int(len(data)*0.2)
test = int(len(data)*0.2)
train = len(data) - val - test

for seed in range(10):

    # Reshuffle the row positions with this iteration's seed.
    idx = np.arange(len(data))
    np.random.seed(seed)
    np.random.shuffle(idx)

    # Bound the validation slice by its own size (`val`), not `test`, so the
    # split stays correct even if the two fractions ever differ.
    data_train = data.iloc[idx[:train]]
    data_val = data.iloc[idx[train:train+val]]
    data_test = data.iloc[idx[train+val:]]

    y_train = np.log1p(data_train['median_house_value'].values)
    y_val = np.log1p(data_val['median_house_value'].values)
    y_test = np.log1p(data_test['median_house_value'].values)

    # Drop the target from the feature frames without mutating in place.
    data_train = data_train.drop(columns='median_house_value')
    data_val = data_val.drop(columns='median_house_value')
    data_test = data_test.drop(columns='median_house_value')

    X_train = find_X(data_train, missing_value=0)
    w0, w = lin_reg(X_train, y_train, r=0)

    X_val = find_X(data_val, missing_value=0)
    y_val_pred = w0 + X_val.dot(w)
    rmse_res = rmse(y_val_pred, y_val)
    print(f"for seed = {seed} the rmse = {rmse_res}")

    rmse_fres.append(rmse_res)

# Spread of the validation RMSE across seeds.
out = round(np.std(np.array(rmse_fres)), 3)
out

for seed = 0 the rmse = 0.33884304805321186
for seed = 1 the rmse = 0.3362387255956888
for seed = 2 the rmse = 0.33209123188276174
for seed = 3 the rmse = 0.3405153609034065
for seed = 4 the rmse = 0.33890240665744614
for seed = 5 the rmse = 0.34348667257183724
for seed = 6 the rmse = 0.34519809530994167
for seed = 7 the rmse = 0.33959899274091027
for seed = 8 the rmse = 0.34662308731747465
for seed = 9 the rmse = 0.336592612418177


0.004

In [22]:
# Final evaluation: split with seed 9, train on train+validation, score on test.
idx = np.arange(len(data))
np.random.seed(9)
np.random.shuffle(idx)

val = int(len(data)*0.2)
test = int(len(data)*0.2)
train = len(data) - val - test

# Bound the validation slice by its own size (`val`), not `test`, so the
# split stays correct even if the two fractions ever differ.
data_train = data.iloc[idx[:train]]
data_val = data.iloc[idx[train:train+val]]
data_test = data.iloc[idx[train+val:]]

y_train = np.log1p(data_train['median_house_value'].values)
y_val = np.log1p(data_val['median_house_value'].values)
y_test = np.log1p(data_test['median_house_value'].values)

# Drop the target from the feature frames without mutating in place.
data_train = data_train.drop(columns='median_house_value')
data_val = data_val.drop(columns='median_house_value')
data_test = data_test.drop(columns='median_house_value')

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps the original index labels.
ftrain = pd.concat([data_train, data_val])
Y_train = np.concatenate([y_train, y_val])

X_train = find_X(ftrain, missing_value=0)
w0, w = lin_reg(X_train, Y_train, r=0.001)

X_test = find_X(data_test, missing_value=0)
y_test_pred = w0 + X_test.dot(w)
rmse_res = rmse(y_test_pred, y_test)

rmse_res

0.34531689143805994