# ML Zoomcamp 
## Homework #2

Housing Prices Prediction

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
print(pd.__version__)

2.0.3


In [75]:
df = pd.read_csv('./data/housing.csv')
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [76]:
df.shape

(20640, 10)

First, keep only the records where ocean_proximity is either '<1H OCEAN' or 'INLAND'

Next, use only the following columns:

'latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value'

In [77]:
ocean_condition = df['ocean_proximity'].isin(['INLAND', '<1H OCEAN'])
df = df[ocean_condition]
cols = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
df = df[cols]
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
1,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
2,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
3,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
4,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0


Q1) There's one feature with missing values. What is it?

A1)  total_bedrooms

In [78]:
df.isna().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
dtype: bool

Q2) What's the median (50% percentile) for variable 'population'?

A2) 1195

In [79]:
df['population'].describe(percentiles=[0.5])

count    15687.000000
mean      1466.317205
std       1180.389908
min          3.000000
50%       1195.000000
max      35682.000000
Name: population, dtype: float64

In [80]:
df['population'].median()

1195.0

In [90]:
X = df.drop(['median_house_value'], axis=1)
y = np.log1p(df['median_house_value'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9412, 8), (6275, 8), (9412,), (6275,))

In [91]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
X_val.shape, X_test.shape, y_val.shape, y_test.shape

((3137, 8), (3138, 8), (3137,), (3138,))

Q3) Which option to fill NA is better? 

A) With mean -- has lower error between the two options of filling with 0 or filling with mean.

In [92]:
def prepare_data(X, fillnatype='zero'):
    X = X.copy()
    if fillnatype == 'zero':        
        X['total_bedrooms'] = X['total_bedrooms'].fillna(0)
    elif fillnatype == 'mean':
        X['total_bedrooms'] = X['total_bedrooms'].fillna(X['total_bedrooms'].mean())
    return X


In [93]:
X_train = prepare_data(X_train, fillnatype='zero')
X_val = prepare_data(X_val, fillnatype='zero')
X_test = prepare_data(X_test, fillnatype='zero')


In [94]:
linear = LinearRegression()
linear.fit(X_train, y_train)
y_pred = linear.predict(X_val)

score = np.round(mean_squared_error(y_val, y_pred, squared=False),2)
score

0.35

In [72]:
# X_train = prepare_data(X_train, fillnatype='mean')
# X_val = prepare_data(X_val, fillnatype='mean')
# X_test = prepare_data(X_test, fillnatype='mean')

# linear = LinearRegression()
# linear.fit(X_train, y_train)
# y_pred = linear.predict(X_val)

# score = np.round(mean_squared_error(y_val, y_pred, squared=False),2)
# score

67026.79

In [95]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

Q4) Which regularization gives the smallest RMSE

A4) Closest answer is 0.001 with RMSE of 67119.09

0.001 -3165724.380819675 67119.09

In [98]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    score = np.round(mean_squared_error(y_val, y_pred, squared=False),2)
    print(r, w_0, score)

0 -10.381779696590845 0.35
1e-06 -10.381757932757168 0.35
0.0001 -10.379603751764153 0.35
0.001 -10.360061206490633 0.35
0.01 -10.168607203958512 0.35
0.1 -8.582470143774094 0.35
1 -3.3515042000978923 0.35
5 -0.9016691555389309 0.35
10 -0.46992232074089646 0.35


Q5) What's the value of std using different seed values

A5) 0.007 which is closer to 0.005

In [99]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
scores = []
for seed in seeds:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=seed)
    X_train = prepare_data(X_train, fillnatype='zero')
    X_val = prepare_data(X_val, fillnatype='zero')
    w_0, w = train_linear_regression_reg(X_train, y_train, r=0.0)
    y_pred = w_0 + X_val.dot(w)
    score = np.round(mean_squared_error(y_val, y_pred, squared=False),2)
    scores.append(score)

print(scores)
score_std = np.round(np.std(scores),3)
print(score_std)


[0.34, 0.33, 0.34, 0.35, 0.33, 0.34, 0.34, 0.33, 0.34, 0.35]
0.007


Q6) RMSE on test set

A6) 0.34 which is close to 0.33

In [100]:
seed = 9
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])
X_train_val.shape, y_train_val.shape

((12549, 8), (12549,))

In [103]:
X_train_val = prepare_data(X_train_val, fillnatype='zero')
X_test = prepare_data(X_test, fillnatype='zero')
w_0, w = train_linear_regression_reg(X_train_val, y_train_val, r=0.001)
y_pred = w_0 + X_test.dot(w)
score = np.round(mean_squared_error(y_test, y_pred, squared=False),2)
print(score)


0.34
