In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the housing dataset from a CSV file expected in the working directory.
df = pd.read_csv('housing.csv')

In [4]:
# Replace every missing value in the frame with 0 (applies to all columns).
df = df.fillna(0)

In [5]:
# Per-column count of remaining missing values (expected to be all zeros).
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [6]:
# Append three ratio features; `assign` returns a new frame, so the frame that
# was loaded is not modified in place.
df = df.assign(
    rooms_per_household=lambda frame: frame['total_rooms'] / frame['households'],
    bedrooms_per_room=lambda frame: frame['total_bedrooms'] / frame['total_rooms'],
    population_per_household=lambda frame: frame['population'] / frame['households'],
)


In [7]:
# Most frequent category of the only categorical column.
df.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows as the test set, then take 25% of the remainder as
# validation, which yields a 60/20/20 train/val/test split.
# A fixed random_state keeps the split repeatable between runs.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [10]:
# Row counts of the train / validation / test partitions.
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(12384, 4128, 4128)

In [14]:
# Mean house value over train+validation rows; used as the class threshold below.
house_mean_value = df_full_train['median_house_value'].mean()
house_mean_value

207194.6937378876

In [15]:
# Boolean target: True where the house value is at or above the mean.
df_full_train['above_average'] = df_full_train['median_house_value'].ge(house_mean_value)


In [179]:
# Encode the boolean target as 0/1 on the frame that actually carries it.
# At this point in the notebook only `df_full_train` has an `above_average`
# column; `df` does not, so reading `df.above_average` here fails on a fresh
# top-to-bottom run.
df_full_train['above_average'] = df_full_train['above_average'].astype(int)


In [180]:
# Preview the first rows to confirm the derived features and the 0/1 target column.
df_full_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN,5.017657,0.200576,3.691814,0
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN,4.473545,0.232703,1.738095,1
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN,5.645833,0.174486,2.723214,0
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN,4.002817,0.258269,3.994366,0
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND,6.268421,0.18094,2.3,0


In [18]:
# Share of each target class in the train+validation rows.
df_full_train['above_average'].value_counts(normalize=True)


0    0.593144
1    0.406856
Name: above_average, dtype: float64

In [19]:
# Pairwise correlation of the numeric (and boolean) columns only.
# `ocean_proximity` holds strings and cannot be correlated; selecting the
# dtypes explicitly keeps this cell working on pandas versions where `corr()`
# no longer drops non-numeric columns silently.
df_full_train.select_dtypes(include=['number', 'bool']).corr()

# households and total_bedrooms have the biggest correlation (about 0.98)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household,above_average
longitude,1.0,-0.924485,-0.101818,0.038676,0.063064,0.094276,0.049306,-0.01704,-0.046349,-0.029339,0.09728,-0.000598,-0.074333
latitude,-0.924485,1.0,0.005296,-0.029224,-0.059998,-0.102499,-0.064061,-0.076571,-0.142983,0.110695,-0.118938,0.005837,-0.097202
housing_median_age,-0.101818,0.005296,1.0,-0.360922,-0.320624,-0.292283,-0.302796,-0.121711,0.103706,-0.160892,0.135495,0.016245,0.077366
total_rooms,0.038676,-0.029224,-0.360922,1.0,0.930489,0.857936,0.920482,0.198268,0.133989,0.13609,-0.189316,-0.024991,0.129528
total_bedrooms,0.063064,-0.059998,-0.320624,0.930489,1.0,0.878932,0.980255,-0.009141,0.04798,-0.001659,0.084149,-0.028536,0.057973
population,0.094276,-0.102499,-0.292283,0.857936,0.878932,1.0,0.907452,0.004122,-0.026032,-0.073733,0.035134,0.07233,-0.014167
households,0.049306,-0.064061,-0.302796,0.920482,0.980255,0.907452,1.0,0.012776,0.063714,-0.083062,0.064185,-0.027656,0.075293
median_income,-0.01704,-0.076571,-0.121711,0.198268,-0.009141,0.004122,0.012776,1.0,0.690647,0.336013,-0.616669,0.022061,0.542839
median_house_value,-0.046349,-0.142983,0.103706,0.133989,0.04798,-0.026032,0.063714,0.690647,1.0,0.158485,-0.257419,-0.02203,0.804533
rooms_per_household,-0.029339,0.110695,-0.160892,0.13609,-0.001659,-0.073733,-0.083062,0.336013,0.158485,1.0,-0.435169,-0.004922,0.116057


In [20]:
from sklearn.metrics import mutual_info_score

In [21]:
# Mutual information between the categorical feature and the binary target.
mutual_info_score(
    df_full_train['ocean_proximity'],
    df_full_train['above_average'],
)

0.1014306752368672

In [85]:
# Mean house value over the whole dataset (computed before splitting).
house_mean_value = df['median_house_value'].mean()
house_mean_value

206855.81690891474

In [171]:
# Boolean target on the full frame: True where value >= dataset mean.
df['above_average'] = df['median_house_value'].ge(house_mean_value)

In [176]:
# Store the target as 0/1 integers instead of booleans.
df['above_average'] = df['above_average'].astype(int)

In [91]:
# Re-split now that `df` itself carries the `above_average` target.
# Same proportions and seed as the earlier split (20% test, then 25% of the
# remainder as validation).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [92]:
# Pull the target out of each partition: `pop` returns the column and removes
# it from the frame in one step, so the label cannot be used as a feature.
y_train = df_train.pop('above_average').values
y_val = df_val.pop('above_average').values
y_test = df_test.pop('above_average').values

In [94]:
# The target was derived from median_house_value, so that column must not
# remain among the features of any partition.
for _partition in (df_train, df_val, df_test):
    del _partition['median_house_value']

In [95]:
# One dict per training row (column name -> value), the input format that
# DictVectorizer consumes; show the first record as a sanity check.
train_dict = df_train.to_dict('records')
train_dict[0]

{'longitude': -119.67,
 'latitude': 34.43,
 'housing_median_age': 39.0,
 'total_rooms': 1467.0,
 'total_bedrooms': 381.0,
 'population': 1404.0,
 'households': 374.0,
 'median_income': 2.3681,
 'ocean_proximity': '<1H OCEAN',
 'rooms_per_household': 3.9224598930481283,
 'bedrooms_per_room': 0.25971370143149286,
 'population_per_household': 3.7540106951871657}

In [96]:
from sklearn.feature_extraction import DictVectorizer

# Vectorizer that turns record dicts into a feature matrix; sparse=False asks
# for a dense array rather than a sparse matrix.
dv = DictVectorizer(sparse=False)

In [98]:

# Build the validation records, then learn the feature layout from the
# training records only and apply that same layout to validation.
val_dict = df_val.to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)


In [51]:
from sklearn.linear_model import LogisticRegression

In [99]:
# Logistic regression on the full vectorized feature set; random_state is
# fixed so repeated runs give the same fit.
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [103]:
# Learned weights (w), one per vectorized feature, rounded for readability.
np.round(model.coef_[0], 3)

array([ 0.223,  0.004,  0.036,  0.126,  0.09 ,  1.21 ,  0.476, -1.738,
        0.023,  0.275,  0.868, -0.002,  0.01 , -0.021,  0.002, -0.   ])

In [124]:
# The intercept is the bias term (w0) added to the weighted feature sum.
model.intercept_[0]

-0.09680252423459645

In [104]:
# Keep only the second probability column, i.e. the probability of label 1
# (above average).
y_pred = model.predict_proba(X_val)[:, 1]

In [107]:
# Actual validation labels (0/1).
y_val

array([0, 0, 1, ..., 1, 1, 0])

In [108]:
# Predicted probabilities of label 1 for the validation rows.
y_pred

array([0.07786319, 0.18583189, 0.95140425, ..., 0.96166254, 0.85034795,
       0.47081981])

In [109]:
# Hard class decision: predict "above average" when the probability is at least 0.5.
above_average = (y_pred >= 0.5)

In [110]:
# Validation rows that the model classifies as above average.
df_val.loc[above_average]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
10838,-117.92,33.67,14.0,6224.0,1679.0,3148.0,1589.0,4.2071,<1H OCEAN,3.916929,0.269762,1.981120
10767,-117.90,33.63,28.0,2370.0,352.0,832.0,347.0,7.1148,<1H OCEAN,6.829971,0.148523,2.397695
10394,-117.67,33.54,16.0,2102.0,350.0,1003.0,328.0,4.7981,<1H OCEAN,6.408537,0.166508,3.057927
17311,-119.49,34.39,17.0,4617.0,982.0,2303.0,923.0,3.9224,NEAR OCEAN,5.002167,0.212692,2.495125
5758,-118.28,34.17,22.0,2664.0,651.0,1553.0,629.0,3.6354,<1H OCEAN,4.235294,0.244369,2.468998
...,...,...,...,...,...,...,...,...,...,...,...,...
10165,-117.94,33.91,18.0,8836.0,1527.0,3946.0,1451.0,5.6441,<1H OCEAN,6.089593,0.172816,2.719504
1381,-122.11,38.00,9.0,3424.0,583.0,1460.0,543.0,5.7600,NEAR BAY,6.305709,0.170269,2.688766
16904,-122.35,37.58,30.0,5039.0,1564.0,2129.0,1536.0,3.3469,NEAR OCEAN,3.280599,0.310379,1.386068
18139,-122.01,37.34,31.0,3080.0,526.0,1493.0,582.0,6.3052,<1H OCEAN,5.292096,0.170779,2.565292


In [111]:
# Accuracy of predictions: share of validation rows where the thresholded
# prediction equals the actual label.
np.mean(y_val == above_average)

0.8369670542635659

In [112]:
# Boolean predictions, shown for comparison with the 0/1 labels below.
above_average

array([False, False,  True, ...,  True,  True, False])

In [113]:
# Actual labels again, to compare side by side with the predictions.
y_val

array([0, 0, 1, ..., 1, 1, 0])

In [114]:
# Same predictions encoded as 0/1 so they are directly comparable to y_val.
above_average.astype(int)

array([0, 0, 1, ..., 1, 1, 0])

In [117]:
# Per-row comparison table: probability, thresholded prediction, actual label,
# and whether the two agree.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': above_average.astype(int),
    'actual': y_val,
})
df_pred['correct'] = df_pred['prediction'] == df_pred['actual']

In [118]:
# Display the prediction-vs-actual table.
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.077863,0,0,True
1,0.185832,0,0,True
2,0.951404,1,1,True
3,0.495577,0,1,False
4,0.980074,1,1,True
...,...,...,...,...
4123,0.046664,0,0,True
4124,0.987964,1,1,True
4125,0.961663,1,1,True
4126,0.850348,1,1,True


In [121]:
# Map each vectorized feature name to its learned weight.
# Uses `get_feature_names_out()`, the same accessor the small-model cell uses;
# the older `get_feature_names()` is deprecated and removed in scikit-learn 1.2.
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'bedrooms_per_room': 0.223,
 'households': 0.004,
 'housing_median_age': 0.036,
 'latitude': 0.126,
 'longitude': 0.09,
 'median_income': 1.21,
 'ocean_proximity=<1H OCEAN': 0.476,
 'ocean_proximity=INLAND': -1.738,
 'ocean_proximity=ISLAND': 0.023,
 'ocean_proximity=NEAR BAY': 0.275,
 'ocean_proximity=NEAR OCEAN': 0.868,
 'population': -0.002,
 'population_per_household': 0.01,
 'rooms_per_household': -0.021,
 'total_bedrooms': 0.002,
 'total_rooms': -0.0}

In [122]:
# Restrict the model to the four raw count features.
small = ['total_rooms', 'total_bedrooms', 'population', 'households']

# Record dicts for train and validation, built the same way for both partitions.
dicts_train_small, dicts_val_small = (
    partition[small].to_dict(orient='records')
    for partition in (df_train, df_val)
)

# Learn the feature layout from the training records only.
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [123]:
# Vectorize the reduced training records with the layout fitted above.
X_train_small = dv_small.transform(dicts_train_small)

# Same estimator settings as the full model, trained on the four count features.
model_small = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)

In [125]:
# The intercept of the small model is its bias term (w0).
w0 = model_small.intercept_[0]
w0

-0.4393981157473192

In [126]:
# Weights of the small model, one per feature, displayed rounded to 3 decimals.
w = model_small.coef_[0]
np.round(w, 3)

array([ 0.009, -0.002, -0.008,  0.001])

In [128]:
# Pair each feature name of the small vectorizer with its rounded weight.
dict(zip(dv_small.get_feature_names_out(), w.round(3)))

{'households': 0.009,
 'population': -0.002,
 'total_bedrooms': -0.008,
 'total_rooms': 0.001}

In [131]:
# Vectorize validation records with the small layout and take the probability
# of label 1 (second column). Note that this overwrites `y_pred` from the full model.
X_val_small = dv_small.transform(dicts_val_small)
y_pred = model_small.predict_proba(X_val_small)[:, 1]

In [132]:
# Threshold the small model's probabilities at 0.5, then report validation
# accuracy for the full small feature set.
above_average = y_pred >= 0.5
np.mean(y_val == above_average)

0.7095445736434108

In [201]:
# Leave-one-feature-out check: retrain the small model without each feature in
# turn and compare its validation accuracy with the model that uses all four.
small = ['total_rooms', 'total_bedrooms', 'population', 'households']

# Baseline accuracy is recomputed from the fitted small model rather than
# pasted from an earlier cell's output, so it cannot go stale when the data,
# split, or estimator settings change.
orig = ((model_small.predict_proba(X_val_small)[:, 1] >= 0.5) == y_val).mean()

# Iterate from the last feature to the first to keep the original report order.
for feature in reversed(small):
    remaining = [name for name in small if name != feature]
    dicts_train = df_train[remaining].to_dict(orient='records')
    dicts_val = df_val[remaining].to_dict(orient='records')

    # Loop-local names: the notebook-level `dv`, `model`, `X_train`, `X_val`,
    # `y_pred` and `above_average` are left untouched.
    dv_loo = DictVectorizer(sparse=False)
    X_train_loo = dv_loo.fit_transform(dicts_train)
    X_val_loo = dv_loo.transform(dicts_val)

    model_loo = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model_loo.fit(X_train_loo, y_train)

    predicted_loo = model_loo.predict_proba(X_val_loo)[:, 1] >= 0.5
    accuracy = (y_val == predicted_loo).mean()
    print(f'w/o {feature}, accuracy is {accuracy}. difference between original is {round(orig - accuracy, 3)}')
    

w/o households, accuracy is 0.6719961240310077. difference between original is 0.038
w/o population, accuracy is 0.656734496124031. difference between original is 0.053
w/o total_bedrooms, accuracy is 0.6608527131782945. difference between original is 0.049
w/o total_rooms, accuracy is 0.6276647286821705. difference between original is 0.082


In [142]:
# linear regression
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']


In [146]:
from sklearn.linear_model import Ridge

In [170]:
# Dedicated vectorizer for the regression features, so the classifier's `dv`
# is not refitted and overwritten here.
dv_ridge = DictVectorizer(sparse=False)

# Only the categorical `ocean_proximity` column is used as a feature here.
train_dict = df_train[['ocean_proximity']].to_dict(orient='records')
X_train = dv_ridge.fit_transform(train_dict)

val_dict = df_val[['ocean_proximity']].to_dict(orient='records')
X_val = dv_ridge.transform(val_dict)


def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same shape as (or broadcastable to) ``y``.

    Returns
    -------
    float
        ``sqrt(mean((y - y_pred) ** 2))``.

    Both inputs are converted to float arrays first, so plain Python lists are
    accepted and values are compared by position.
    """
    residuals = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

# Sweep the regularisation strength and report validation RMSE for each value.
alphas = [0, 0.01, 0.1, 1, 10]
for a in alphas:
    model_ridge = Ridge(alpha=a, solver="sag", random_state=42).fit(X_train, y_train)

    # Prediction written out by hand as bias + X·w.
    w0, w = model_ridge.intercept_, model_ridge.coef_
    y_pred = w0 + X_val.dot(w)

    print(f"{a:>5} {round(rmse(y_val, y_pred), 6)}")

    0 0.469896
 0.01 0.469895
  0.1 0.469889
    1 0.469835
   10 0.469877
