In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read only the columns used in this analysis from the housing CSV.
cols_list = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

df = pd.read_csv('housing.csv', usecols=cols_list)

In [3]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(20640, 9)

In [4]:
# Description of the dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [5]:
# Sneak peek at the first rows of the dataset
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [6]:
# Count the missing values in each column
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [8]:
# Median of the population column
np.median(df.population)

1166.0

## Validation Framework

In [9]:
# Reproducible shuffle followed by a train/validation/test split
# (validation and test each take 20% of the rows, train takes the rest).
np.random.seed(42)

n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Shuffle row positions, then reorder the frame once.
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Consecutive slices of the shuffled frame; copies so later edits do not touch df.
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

In [10]:
# For each split: keep the raw target, build its log1p transform,
# then remove the target column from the feature frame.
y_train_orig = df_train['median_house_value'].values
y_train = np.log1p(y_train_orig)
del df_train['median_house_value']

y_val_orig = df_val['median_house_value'].values
y_val = np.log1p(y_val_orig)
del df_val['median_house_value']

y_test_orig = df_test['median_house_value'].values
y_test = np.log1p(y_test_orig)
del df_test['median_house_value']

## Linear Regression

In [11]:
def train_linear_regression(X, y):
    """Fit a linear model with the normal equation.

    A column of ones is prepended to ``X`` so the first solved
    coefficient acts as the intercept.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets.

    Returns
    -------
    tuple
        ``(bias, weights)``: the first coefficient and the remaining ones.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    coef = np.linalg.inv(gram).dot(design.T).dot(y)

    return coef[0], coef[1:]

### Baseline Solution

In [12]:
# Feature columns used by the models (everything read from the CSV except the target).
base = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

In [13]:
# Zero-fill variant of the feature preparation
def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy array, with missing values replaced by 0."""
    return df[base].fillna(0).values

In [15]:
# Build the zero-filled training matrix and fit the unregularized model on it
X_zero_train = prepare_X(df_train)
w_zero_0, w_zero = train_linear_regression(X_zero_train, y_train)

In [31]:
# Predictions on the training split (this name is reassigned to the
# validation predictions in a later cell)
y_pred_zero = w_zero_0 + X_zero_train.dot(w_zero)

In [17]:
def rmse(y, y_hat, ndigits=2):
    """Root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        True target values.
    y_hat : array-like
        Predicted values, broadcastable against ``y``.
    ndigits : int or None, default 2
        Number of decimals to round the score to. Pass ``None`` to get the
        unrounded value, e.g. when scores are aggregated or compared further.

    Returns
    -------
    float
        The (optionally rounded) RMSE.
    """
    # Coerce so plain lists work and integer inputs are squared in float.
    error = np.asarray(y_hat, dtype=float) - np.asarray(y, dtype=float)
    score = np.sqrt((error ** 2).mean())
    if ndigits is None:
        return score
    return round(score, ndigits)

In [30]:
# Apply the same zero-fill preparation to the validation split and predict on it
X_val_zero = prepare_X(df_val)
y_pred_zero = w_zero_0 + X_val_zero.dot(w_zero)

In [32]:
# Training RMSE of the zero-fill baseline. The training predictions are
# computed here because, read top to bottom, y_pred_zero already holds the
# validation predictions at this point and does not line up with y_train.
rmse(y_train, w_zero_0 + X_zero_train.dot(w_zero))

0.34

In [25]:
# Validation RMSE of the zero-fill baseline
rmse(y_val, y_pred_zero)

0.33

In [33]:
# Fill in missing values with the mean: the mean of total_bedrooms is taken
# from the training split only
mean_bed = df_train['total_bedrooms'].mean()

In [35]:
def prepare_X_mean(df):
    """Return the ``base`` columns of ``df`` as a NumPy array, replacing every missing value with ``mean_bed``."""
    return df[base].fillna(mean_bed).values

In [36]:
X_mean_train = prepare_X(df_train)
w_mean_0, w_mean = train_linear_regression(X_mean_train, y_train)

In [None]:
y_pred_mean = w_mean_0 + X_mean_train.dot(w_mean)

In [37]:
X_val_mean = prepare_X(df_val)
y_pred_mean = w_mean_0 + X_val_mean.dot(w_mean)

In [38]:
# Validation RMSE for the mean-fill variant
rmse(y_val, y_pred_mean)

0.33

### Regularization

In [39]:
# Candidate regularization strengths to try
params = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

In [40]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear model with the normal equation plus a diagonal penalty.

    A column of ones is prepended to ``X`` for the intercept. ``r`` is added
    to every diagonal entry of X^T X, including the entry for the ones column.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets.
    r : float, default 0.0
        Amount added to the diagonal of X^T X before inversion.

    Returns
    -------
    tuple
        ``(bias, weights)``: the first coefficient and the remaining ones.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    coef = np.linalg.inv(gram).dot(design.T).dot(y)

    return coef[0], coef[1:]

In [41]:
# Fit the regularized trainer with its default r (0.0) on the zero-filled training matrix
X_zero_train = prepare_X(df_train)
w_zero_0, w_zero = train_linear_regression_reg(X_zero_train, y_train)

In [43]:
# The feature matrices do not depend on r, so build them once outside the loop.
X_zero_train = prepare_X(df_train)
X_zero_val = prepare_X(df_val)

# For each regularization strength: fit on train, score on validation,
# and print r, the learned bias and the validation RMSE.
for r in params:
    w_zero_0, w_zero = train_linear_regression_reg(X_zero_train, y_train, r=r)

    y_pred_zero = w_zero_0 + X_zero_val.dot(w_zero)
    score = rmse(y_val, y_pred_zero)

    print(r, w_zero_0, score)

0 -11.68697524191182 0.33
1e-06 -11.686959176106452 0.33
0.0001 -11.685368865381463 0.33
0.001 -11.670931318209552 0.33
0.01 -11.528493585718895 0.33
0.1 -10.274500282155714 0.33
1 -4.920480897785018 0.33
5 -1.4820957456205364 0.34
10 -0.789931183220496 0.34


## Running Linear Regression with random seed = 9

In [44]:
# Same split procedure as before, now shuffled with seed 9
# (validation and test each take 20% of the rows, train takes the rest).
np.random.seed(9)

n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Shuffle row positions, then reorder the frame once.
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Consecutive slices of the shuffled frame; copies so later edits do not touch df.
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

In [45]:
# For each split: keep the raw target, build its log1p transform,
# then remove the target column from the feature frame.
y_train_orig = df_train['median_house_value'].values
y_train = np.log1p(y_train_orig)
del df_train['median_house_value']

y_val_orig = df_val['median_house_value'].values
y_val = np.log1p(y_val_orig)
del df_val['median_house_value']

y_test_orig = df_test['median_house_value'].values
y_test = np.log1p(y_test_orig)
del df_test['median_house_value']

In [46]:
# Stack train and validation into a single frame and renumber its index from 0.
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)

In [47]:
# Zero-filled feature matrix for the combined train + validation frame
X_full_train = prepare_X(df_full_train)

In [48]:
# Targets in the same order as the concatenated frame: train first, then validation
y_full_train = np.concatenate([y_train, y_val])

In [49]:
# Fit on train + validation with r=0.001
w_0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

In [50]:
# Score the model fitted on train + validation against the test split
X_test = prepare_X(df_test)
y_pred = w_0 + X_test.dot(w)
score = rmse(y_test, y_pred)
score

0.35

## Split Data Randomly with Different Seeds

In [51]:
def random_split_data(df, s):
    """Shuffle ``df`` with seed ``s``, fit the baseline on train and score it on validation.

    The rows are shuffled, the last two 20% chunks are set aside (validation
    and test), and the unregularized linear regression is fitted on the
    remaining rows against ``log1p(median_house_value)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and ``median_house_value``.
    s : int
        Seed passed to ``np.random.seed``; this reseeds NumPy's global generator.

    Returns
    -------
    The value returned by ``rmse`` for the validation split.
    """
    np.random.seed(s)

    n = len(df)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    # Only the train and validation slices are used for this score.
    df_train = df_shuffled.iloc[:n_train]
    df_val = df_shuffled.iloc[n_train:n_train + n_val]

    y_train = np.log1p(df_train['median_house_value'].values)
    y_val = np.log1p(df_val['median_house_value'].values)

    # drop() returns a new frame, so the target is kept out of the features
    # without copying the slices or mutating them in place.
    X_train = prepare_X(df_train.drop(columns='median_house_value'))
    w0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X(df_val.drop(columns='median_house_value'))
    y_pred = w0 + X_val.dot(w)

    return rmse(y_val, y_pred)

In [55]:
# Validation score for each shuffling seed from 0 through 9.
list_score = [random_split_data(df, s) for s in range(10)]
list_score

[0.34, 0.34, 0.33, 0.34, 0.34, 0.34, 0.35, 0.34, 0.35, 0.34]

In [57]:
# Standard deviation of the per-seed validation scores, rounded to 3 decimals
round(np.std(list_score), 3)

0.005