# Week 3 Homework

In [1]:
import numpy as np
import pandas as pd

### Dataset:

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
fname = 'housing.csv'

In [3]:
# !wget $data -O $fname

### Features:

In [4]:
features = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]

### Data Preparation:

In [5]:
df = pd.read_csv(fname)
df = df[features]
df.head().T

Unnamed: 0,0,1,2,3,4
latitude,37.88,37.86,37.85,37.85,37.85
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY


In [6]:
# For my own sanity
df['ocean_proximity'] = df['ocean_proximity'].str.lower().str.replace(' ', '_')
df['ocean_proximity']

0        near_bay
1        near_bay
2        near_bay
3        near_bay
4        near_bay
           ...   
20635      inland
20636      inland
20637      inland
20638      inland
20639      inland
Name: ocean_proximity, Length: 20640, dtype: object

Filling missing value with 0:

In [7]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Only `total_bedrooms` needs it:

In [8]:
df.total_bedrooms = df.total_bedrooms.fillna(0)

In [9]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

### Question 1:

In [10]:
df.ocean_proximity.value_counts()

<1h_ocean     9136
inland        6551
near_ocean    2658
near_bay      2290
island           5
Name: ocean_proximity, dtype: int64

Most frequent observation is **<1H OCEAN**

### Question 2:

In [11]:
df.median_house_value

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

In [12]:
df['above_average'] = (df['median_house_value'] > df['median_house_value'].mean()).astype(int)
print(df['above_average'].nunique())
df['above_average']

2


0        1
1        1
2        1
3        1
4        1
        ..
20635    0
20636    0
20637    0
20638    0
20639    0
Name: above_average, Length: 20640, dtype: int64

In [13]:
# Getting list of numerical values sans target
from copy import deepcopy
numerical = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'rooms_per_household', 
    'bedrooms_per_room', 
    'population_per_household',
]
numerical

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [14]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

y_train_raw = df_train.median_house_value.values
y_val_raw = df_val.median_house_value.values
y_test_raw = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [15]:
df_train[numerical].corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


Ans:
* **`total_bedrooms` and `households`: 0.979399**
* `total_bedrooms` and `total_rooms`: 0.931546
* `population` and `households`: 0.906841
* `population_per_household` and `total_rooms`: -0.029452

Answer is **`total_bedrooms` and `households`**

### Question 3:

In [16]:
from sklearn.metrics import mutual_info_score

categorical = ['ocean_proximity']

mutual_info_score(df_train[categorical[0]], y_train)

0.10138385763624205

Ans:
- 0.26
- 0
- **0.10**
- 0.16

Answer is 0.10

### Question 4:

In [17]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

# dict vectorizer handles one-hot encoding of ocean_proximity
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [18]:
# Checking that ocean_proximity was handled
dv.get_feature_names_out()

array(['bedrooms_per_room', 'households', 'housing_median_age',
       'latitude', 'longitude', 'median_income',
       'ocean_proximity=<1h_ocean', 'ocean_proximity=inland',
       'ocean_proximity=island', 'ocean_proximity=near_bay',
       'ocean_proximity=near_ocean', 'population',
       'population_per_household', 'rooms_per_household',
       'total_bedrooms', 'total_rooms'], dtype=object)

In [19]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train) 

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [20]:
# Accuracy:
y_pred = model.predict_proba(X_val)[:, 1]

house_decision = (y_pred >= 0.5).astype(int)
house_decision

array([0, 0, 1, ..., 1, 1, 0])

In [21]:
acc = (y_val == house_decision).mean()
round(acc, 2)

0.84

Ans:
- 0.60
- 0.72
- **0.84**
- 0.95

Ans is : 0.84

### Question 5:

In [22]:
features = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'rooms_per_household', 
    'bedrooms_per_room', 
    'population_per_household',
    'ocean_proximity',
]

In [23]:
acc_dict = {}
for feature in features:
    feature_elim = features[:]
    feature_elim.remove(feature)

    dv = DictVectorizer(sparse=False)

    train_dict = df_train[feature_elim].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[feature_elim].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train) 

    y_pred = model.predict_proba(X_val)[:, 1]
    house_decision = (y_pred >= 0.5).astype(int)
    elim_acc = (y_val == house_decision).mean()

    acc_dict[feature] = abs(acc - elim_acc)

    print(f"Feature removed: {feature}, accuracy difference: {round(acc - elim_acc, 8)}")

Feature removed: latitude, accuracy difference: 0.00169574
Feature removed: longitude, accuracy difference: 0.00460271
Feature removed: housing_median_age, accuracy difference: 0.00460271
Feature removed: total_rooms, accuracy difference: -0.00290698
Feature removed: total_bedrooms, accuracy difference: -0.00218023
Feature removed: population, accuracy difference: 0.00920543
Feature removed: households, accuracy difference: 0.00218023
Feature removed: median_income, accuracy difference: 0.0499031
Feature removed: rooms_per_household, accuracy difference: 0.00121124
Feature removed: bedrooms_per_room, accuracy difference: 0.0
Feature removed: population_per_household, accuracy difference: 0.0
Feature removed: ocean_proximity, accuracy difference: 0.01453488


In [24]:
import collections

sorted(acc_dict.items(), key=lambda item: item[1], reverse=False) # Sorts ascending

[('bedrooms_per_room', 0.0),
 ('population_per_household', 0.0),
 ('rooms_per_household', 0.001211240310077466),
 ('latitude', 0.0016957364341084746),
 ('households', 0.0021802325581394832),
 ('total_bedrooms', 0.0021802325581395943),
 ('total_rooms', 0.0029069767441860517),
 ('longitude', 0.004602713178294526),
 ('housing_median_age', 0.004602713178294526),
 ('population', 0.009205426356589164),
 ('ocean_proximity', 0.014534883720930258),
 ('median_income', 0.049903100775193776)]

Ans:
   * `total_rooms`
   * `total_bedrooms` 
   * `population`
   * **`households`**

Ans is **`households`**

### Question 6:

In [33]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

y_val_raw

array([11.47937902, 11.23190118, 12.97363364, ..., 12.74898107,
       12.8682476 , 12.12378283])

In [34]:
y_train_raw = np.log1p(y_train_raw)
y_val_raw = np.log1p(y_val_raw)

In [35]:
dv = DictVectorizer(sparse=False)

# dict vectorizer handles one-hot encoding of ocean_proximity
train_dict = df_train[feature_elim].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[feature_elim].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [36]:
accuracies = {}
alphas = [0, 0.01, 0.1, 1, 10]
for a in alphas:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train_raw)

    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val_raw, y_pred)

    accuracies[a] = rmse

In [40]:
for key, value in sorted(accuracies.items(), key=lambda item: item[1], reverse=True): # Sorts descending
    print(f"alpha: {key}, rmse: {round(value, 3)}")

alpha: 10, rmse: 0.002
alpha: 1, rmse: 0.002
alpha: 0.1, rmse: 0.002
alpha: 0.01, rmse: 0.002
alpha: 0, rmse: 0.002


RMSE does not differ significantly for each alpha. Going with lowest alpha of those listed.

Ans: **0**