## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

%matplotlib inline

## read the dataset

In [2]:
df = pd.read_csv('C:\Projects\datasets\housing.csv')

## Exploratory Data Analysis

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


##  keep only the records where ocean_proximity is either '<1H OCEAN' or 'INLAND'

In [4]:
options = ['<1H OCEAN','INLAND']
df = df[df['ocean_proximity'].isin(options)]
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0,<1H OCEAN
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0,<1H OCEAN
859,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1H OCEAN
861,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,<1H OCEAN


## Question 1: Columns with missing values

In [5]:
missing_values = df.isnull()
missing_counts = missing_values.sum()
print(missing_counts[missing_counts > 0].index.tolist())

['total_bedrooms']


## Question 2: What's the median (50% percentile) for variable 'population'

In [6]:
median = df['population'].median()
median

1195.0

## Validation Framework

In [28]:
# Prepare and split the dataset
np.random.seed(42)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [30]:
## Apply the log transformation to the median_house_value variable using the np.log1p() function.
y_train_orig = df_train.median_house_value.values # .values converts DataFrame to NumPy array
y_val_orig = df_val.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train = np.log1p(y_train_orig)
y_val = np.log1p(y_val_orig)
y_test = np.log1p(y_test_orig)

In [34]:
# remove median_house_value' to avoid potentially using it during training

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

KeyError: 'median_house_value'

## Linear Regression

In [43]:
check=["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]

def prepare_X(df, fillna_value):
    df_num = df[check]
    df_num = df_num.fillna(fillna_value)
    X = df_num.values
    return X

# linear regression algorithm
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    print(w)
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [51]:
# missing values
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## Question 3

In [49]:
# We have two options to fill in the missing values: 

# option 1: fill it with 0 
X_null_train = prepare_X(df_train, fillna_value=0)
w_0_null, w_null = train_linear_regression(X_null_train, y_train)

X_null_val = prepare_X(df_val, fillna_value=0)
y_null_pred_val = w_0_null + X_null_val.dot(w_null)

np.round(rmse(y_val, y_null_pred_val),2)

[-9.76324948e+00 -2.55938802e-01 -2.68233309e-01  1.77710730e-03
 -3.38393956e-05  2.93766352e-04 -1.52190447e-04  3.96908643e-04
  1.80600946e-01]


0.34

In [50]:
# option 2: fill with the mean 
mean = df_train.total_bedrooms.mean()

X_mean_train = prepare_X(df_train, fillna_value=mean)
w_0_mean, w_mean = train_linear_regression(X_mean_train, y_train)
     
X_mean_val = prepare_X(df_val, fillna_value=mean)
y_mean_pred_val = w_0_mean + X_mean_val.dot(w_mean)
     
np.round(rmse(y_val, y_mean_pred_val),2)

[-9.88613322e+00 -2.57026361e-01 -2.68652914e-01  1.79880767e-03
 -3.87631029e-05  3.78806885e-04 -1.49641599e-04  3.25344392e-04
  1.81951358e-01]


0.34

## Regularization

In [52]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [53]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_null_train, y_train, r=r)
    y_null_reg_val = w_0 + X_null_val.dot(w)
    rmse_val = np.round(rmse(y_val, y_null_reg_val),2)
    print(r, w_0, rmse_val)

0 -9.763249478233956 0.34
1e-06 -9.763228831251178 0.34
0.0001 -9.761185235641998 0.34
0.001 -9.742646249773415 0.34
0.01 -9.561056193060958 0.34
0.1 -8.058889769717444 0.34
1 -3.1331542785800472 0.34
5 -0.8410867975089132 0.35
10 -0.4381172315736762 0.35


## Question 5

In [59]:
rmse_list = []

for r in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    idx = np.arange(n)
    np.random.seed(r)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]
    
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    
    y_train_orig = df_train.median_house_value.values
    y_val_orig = df_val.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    y_test = np.log1p(y_test_orig)
    
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    X_null_train = prepare_X(df_train, fillna_value=0)
    w_0, w = train_linear_regression(X_null_train, y_train)
    
    X_null_val = prepare_X(df_val, fillna_value=0)
    y_null_reg_val = w_0 + X_null_val.dot(w)
    rmse_val = np.round(rmse(y_val, y_null_reg_val),2)
    
    rmse_list.append(rmse_val)
    
    print(r, w_0, rmse_val)

[-9.87006897e+00 -2.57905892e-01 -2.71538872e-01  1.43874092e-03
 -3.28506955e-05  2.74955467e-04 -1.50515777e-04  4.07441495e-04
  1.79439347e-01]
0 -9.870068970757938 0.34
[-9.11401121e+00 -2.50366564e-01 -2.67111799e-01  1.68367550e-03
 -2.17079992e-05  2.74621608e-04 -2.07918083e-04  4.81989214e-04
  1.78044406e-01]
1 -9.114011209778935 0.34
[-9.91156020e+00 -2.57559087e-01 -2.69914047e-01  1.92264005e-03
 -3.40679630e-05  3.03040321e-04 -1.49763515e-04  3.88143978e-04
  1.82156954e-01]
2 -9.911560200645035 0.34
[-1.02331837e+01 -2.61342089e-01 -2.73049563e-01  1.84153315e-03
 -2.95482594e-05  2.64770777e-04 -1.48513120e-04  3.88427823e-04
  1.79059417e-01]
3 -10.23318368136809 0.33
[-9.17476345e+00 -2.48890011e-01 -2.62029994e-01  2.27919844e-03
 -3.57206479e-05  3.08606461e-04 -1.41923501e-04  3.59844023e-04
  1.84951730e-01]
4 -9.174763450685518 0.34
[-1.00066483e+01 -2.59279840e-01 -2.72013099e-01  1.22123363e-03
 -2.59405744e-05  2.80305812e-04 -1.49461193e-04  3.60332515e-04


In [57]:
# inspect rmse_list
rmse_list

[0.34, 0.34, 0.34, 0.33, 0.34, 0.34, 0.34, 0.35, 0.35, 0.33]

In [58]:
# round to 3 decimal places
np.round(np.std(rmse_list),3)

0.006

## Seed and Regularization

In [62]:
r = 9

idx = np.arange(n)
np.random.seed(r)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]
    
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

frames = [df_train, df_val]
df_train_val = pd.concat(frames)

df_train_val = df_train_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train_val_orig = df_train_val.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train_val = np.log1p(y_train_val_orig)
y_test = np.log1p(y_test_orig)

del df_train_val['median_house_value']
del df_test['median_house_value']

In [63]:
# inpect rmse

X_null_train_val = prepare_X(df_train_val, fillna_value=0)
w_0_train_val, w_train_val = train_linear_regression_reg(X_null_train_val, y_train_val, r=0.001)

X_null_test = prepare_X(df_test, fillna_value=0)
y_null_pred_test = w_0_train_val + X_null_test.dot(w_train_val)

np.round(rmse(y_test, y_null_pred_test),2)

0.33