In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings.
pd.set_option("display.max_rows", None)       # no row truncation
pd.set_option("display.max_columns", None)    # no column truncation
pd.set_option("display.width", 1000)          # wide console rendering
pd.set_option("display.float_format", "{:.3f}".format)  # three decimals for floats

In [2]:
# Raw housing data, read straight from the public GitHub mirror.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
df_ = pd.read_csv(DATA_URL)
df_.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.325,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.301,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.257,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.643,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.846,342200.0,NEAR BAY


In [3]:
# Keep only the rows whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
df_ = df_[df_.ocean_proximity.isin(["<1H OCEAN", "INLAND"])]
df_.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.057,431000.0,<1H OCEAN
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.646,217000.0,<1H OCEAN
859,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.615,247600.0,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.719,283500.0,<1H OCEAN
861,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.683,216900.0,<1H OCEAN


In [4]:
# Working frame: the numeric features plus the target, re-indexed from 0
# after the row filter above.
df = df_[[
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
]].reset_index(drop=True)
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.057,431000.0
1,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.646,217000.0
2,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.615,247600.0
3,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.719,283500.0
4,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.683,216900.0


In [5]:
# Q1: names of the columns that contain at least one missing value.
print(f"Q1 : {[col for col in df.columns if df[col].isnull().any()]}")

Q1 : ['total_bedrooms']


In [6]:
# Q2: median of `population`, taken as its 0.5 quantile
# (np.median(df.population) is an alternative way to get it).
print(f"Q2 : {df['population'].quantile(q=0.5)}")

Q2 : 1195.0


In [7]:
# --- Shuffle and split into train / validation / test -----------------------
# Validation and test each take int(20%) of the rows; train gets the rest.
n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Seeded permutation of the row positions so the split is reproducible.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

# Consecutive slices of the shuffled frame, each re-indexed from 0.
df_train = df_shuffled.iloc[:n_train].reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:n_train + n_val].reset_index(drop=True)
df_test = df_shuffled.iloc[n_train + n_val:].reset_index(drop=True)

# Pull the target out of every split: the *_1 arrays hold the raw prices and
# the feature frames no longer contain the target column afterwards.
y_train_1 = df_train.pop("median_house_value").values
y_val_1 = df_val.pop("median_house_value").values
y_test_1 = df_test.pop("median_house_value").values

# Models are trained on log(1 + price).
y_train, y_val, y_test = (np.log1p(raw) for raw in (y_train_1, y_val_1, y_test_1))


In [9]:
# Linear regression (normal equation)
def train_linear_regression(X, y):
    """Fit a linear model by solving the normal equation.

    A column of ones is prepended to ``X`` so that the first solved weight
    plays the role of the intercept.

    Parameters
    ----------
    X : feature matrix, one row per observation.
    y : target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the remaining weights, in the column
        order of ``X``.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    intercept, coefs = weights[0], weights[1:]
    return intercept, coefs

# Root mean squared error
def rmse(y, y_pred):
    """Return the root of the mean squared difference between ``y_pred`` and ``y``."""
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

In [10]:
# Mean of total_bedrooms, computed on the training split only; used below as
# the fill value for missing entries.
total_bedrooms_mean = df_train.total_bedrooms.mean()
total_bedrooms_mean

542.552956325786

In [11]:
def prepare_data(df, mean_replace_value=None):
    """Return the contents of ``df`` as a NumPy array.

    Parameters
    ----------
    df : frame of features; it must contain a ``total_bedrooms`` column when
        a fill value is supplied.
    mean_replace_value : value written into the missing entries of
        ``total_bedrooms``. With ``None`` (the default) nothing is filled.

    The caller's frame is left untouched: all work happens on a copy.
    """
    features = df.copy()
    if mean_replace_value is not None:
        features['total_bedrooms'] = features['total_bedrooms'].fillna(mean_replace_value)
    return features.values

In [12]:
# Option 1: fill missing total_bedrooms with 0, fit on train, score on validation.
X_train = prepare_data(df_train, mean_replace_value=0)
X_val = prepare_data(df_val, mean_replace_value=0)

w0, w = train_linear_regression(X_train, y_train)
y_pred = X_val.dot(w) + w0

rmse_zero_fill = round(rmse(y_val, y_pred), 2)
rmse_zero_fill

0.34

In [13]:
# Option 2: fill missing total_bedrooms with the training mean (the same value
# is applied to the validation split), then fit and score as before.
X_train = prepare_data(df_train, mean_replace_value=total_bedrooms_mean)
X_val = prepare_data(df_val, mean_replace_value=total_bedrooms_mean)

w0, w = train_linear_regression(X_train, y_train)
y_pred = X_val.dot(w) + w0

rmse_mean_fill = round(rmse(y_val, y_pred), 2)
rmse_mean_fill

0.34

In [14]:
# Regularized linear regression
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear model via the normal equation with a diagonal penalty.

    A column of ones is prepended to ``X``; ``r`` times the identity matrix is
    then added to ``X^T X`` before inversion. The identity spans every column
    of the augmented matrix, so the intercept entry is penalised as well.

    Parameters
    ----------
    X : feature matrix, one row per observation.
    y : target values, one per row of ``X``.
    r : regularisation strength; ``0.0`` adds nothing to ``X^T X``.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the remaining weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    return weights[0], weights[1:]

In [15]:
# Training matrix with missing total_bedrooms filled with 0.
X_train = prepare_data(df_train, mean_replace_value=0)

In [16]:
# Print the fitted intercept (2 decimals) for each regularisation strength.
for r in [0, 1e-6, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    print(r, round(w_0, 2))

0 -9.76
1e-06 -9.76
0.0001 -9.76
0.001 -9.74
0.01 -9.56
0.1 -8.06
1 -3.13
5 -0.84
10 -0.44


In [18]:
# Q4: compare regularisation strengths on the validation split, with missing
# total_bedrooms filled with 0 in both the train and validation matrices.
results = dict()   # r -> validation RMSE rounded to 2 decimals
X_train = prepare_data(df_train, 0)
X_val = prepare_data(df_val, 0)
best_rmse = None
best_rmse_arg = None

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)

    # Score once and reuse the value for printing, comparison and storage.
    raw_rmse = rmse(y_val, y_pred)
    print('%6s' %r, raw_rmse)

    # The strict "<" keeps the earliest candidate on ties; the list is in
    # ascending order, so that is the smallest r.
    if best_rmse is None or raw_rmse < best_rmse:
        best_rmse = raw_rmse
        best_rmse_arg = r
    results[r] = round(raw_rmse, 2)


print(f"Q4: RMSE {best_rmse} r value: {best_rmse_arg}")

     0 0.34084790341748605
 1e-06 0.3408479061812768
0.0001 0.3408481800544602
 0.001 0.3408506921902752
  0.01 0.3408779300545323
   0.1 0.3412862042012406
     1 0.34489583276460106
     5 0.34773980704851
    10 0.34831498335193445
Q4: RMSE 0.34084790341748605 r value: 0


In [20]:
# Q5: how much does the validation RMSE move with the random split?
# The shuffle / split / fit / score pipeline is repeated for several seeds and
# the standard deviation of the collected scores is reported.
scores = list()
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    # Seeded permutation of the row positions.
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    # split dataset into 3 parts
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train + n_val]]
    df_test = df.iloc[idx[n_train + n_val:]]

    # reset indexes
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # Apply log transformation to the 'median_house_value' variable
    y_train = np.log1p(df_train['median_house_value'])
    y_val = np.log1p(df_val['median_house_value'])
    y_test = np.log1p(df_test['median_house_value'])

    # drop target variable from the dataset
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    # Fit without regularisation, missing total_bedrooms filled with 0.
    X_train_seed = prepare_data(df_train, 0)
    w0, w = train_linear_regression(X_train_seed, y_train)

    X_val_seed = prepare_data(df_val, 0)
    y_pred = w0 + X_val_seed.dot(w)
    scores.append(rmse(y_val, y_pred))

    # Report the score that was just appended. Looking it up as scores[seed]
    # is only correct while each seed value equals its position in the list.
    print('for seed =', seed, 'score =', scores[-1], '\n')


print(f"Q5 : {round(np.std(scores), 3)}")

for seed = 0 score = 0.33773871601025524 

for seed = 1 score = 0.33779993536588276 

for seed = 2 score = 0.3384287006776504 

for seed = 3 score = 0.33200494683036935 

for seed = 4 score = 0.33944518625587355 

for seed = 5 score = 0.34338197052874636 

for seed = 6 score = 0.3385330211767324 

for seed = 7 score = 0.34687476972787956 

for seed = 8 score = 0.3512736865960519 

for seed = 9 score = 0.3341558266504128 

Q5 : 0.005


In [23]:
# Q6: final model. Shuffle with seed 9, train on the train + validation rows
# together (r=0.001, missing total_bedrooms filled with 0) and score on test.
n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

# Everything before the test slice is used for fitting.
df_train = df.iloc[idx[:n_train + n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

# Remove the target from the feature frames and log-transform it.
y_train = np.log1p(df_train.pop('median_house_value').values)
y_test = np.log1p(df_test.pop('median_house_value').values)

X_train = prepare_data(df_train, 0)
X_test = prepare_data(df_test, 0)

w0, w = train_linear_regression_reg(X_train, y_train, r=0.001)
y_pred = X_test.dot(w) + w0


print(f"Q6 : Test RMSE {round(rmse(y_test, y_pred), 2)}")

Q6 : Test RMSE 0.33
