In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Dataset

In [2]:
df = pd.read_csv('housing.csv')
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND


In [3]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [4]:
features = ['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity']

In [5]:
df[features].isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
df.total_bedrooms = df.total_bedrooms.fillna(0)

In [7]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [8]:
df['rooms_per_household'] = df.total_rooms.divide(df.households)
df['bedrooms_per_room'] = df.total_bedrooms.divide(df.total_rooms)
df['population_per_household'] = df.population.divide(df.households)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,near_bay,6.984127,0.146591,2.555556
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,near_bay,6.238137,0.155797,2.109842
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,near_bay,8.288136,0.129516,2.80226
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,near_bay,5.817352,0.184458,2.547945
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,near_bay,6.281853,0.172096,2.181467


### Question 1
What is the most frequent observation (mode) for the column ocean_proximity?

In [9]:
df.ocean_proximity.value_counts()

<1h_ocean     9136
inland        6551
near_ocean    2658
near_bay      2290
island           5
Name: ocean_proximity, dtype: int64

In [10]:
df.ocean_proximity.value_counts().index[0]

'<1h_ocean'

### Question 2
  * Create the correlation matrix for the numerical features of your train dataset.
  * In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
  * What are the two features that have the biggest correlation in this dataset?

In [11]:
df.dtypes

longitude                   float64
latitude                    float64
housing_median_age            int64
total_rooms                   int64
total_bedrooms              float64
population                    int64
households                    int64
median_income               float64
median_house_value            int64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [12]:
numerical = ['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household']

In [13]:
df[numerical].corr().round(2)

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.92,0.01,-0.04,-0.07,-0.11,-0.07,-0.08,-0.14,0.11,-0.1,0.0
longitude,-0.92,1.0,-0.11,0.04,0.07,0.1,0.06,-0.02,-0.05,-0.03,0.08,0.0
housing_median_age,0.01,-0.11,1.0,-0.36,-0.32,-0.3,-0.3,-0.12,0.11,-0.15,0.13,0.01
total_rooms,-0.04,0.04,-0.36,1.0,0.92,0.86,0.92,0.2,0.13,0.13,-0.17,-0.02
total_bedrooms,-0.07,0.07,-0.32,0.92,1.0,0.87,0.97,-0.01,0.05,0.0,0.12,-0.03
population,-0.11,0.1,-0.3,0.86,0.87,1.0,0.91,0.0,-0.02,-0.07,0.03,0.07
households,-0.07,0.06,-0.3,0.92,0.97,0.91,1.0,0.01,0.07,-0.08,0.06,-0.03
median_income,-0.08,-0.02,-0.12,0.2,-0.01,0.0,0.01,1.0,0.69,0.33,-0.57,0.02
median_house_value,-0.14,-0.05,0.11,0.13,0.05,-0.02,0.07,0.69,1.0,0.15,-0.24,-0.02
rooms_per_household,0.11,-0.03,-0.15,0.13,0.0,-0.07,-0.08,0.33,0.15,1.0,-0.39,-0.0


from correlation matrix it is seen that the correlation between households and total_bedrooms is maximum with a correlation value of 0.97

### Make median_house_value binary
  * We need to turn the median_house_value variable from numeric into binary.
  * Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [14]:
df['above_average'] = (df['median_house_value'] >= df.median_house_value.mean()).astype(int)

In [15]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,near_bay,6.984127,0.146591,2.555556,1
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,near_bay,6.238137,0.155797,2.109842,1
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,near_bay,8.288136,0.129516,2.802260,1
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,near_bay,5.817352,0.184458,2.547945,1
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,near_bay,6.281853,0.172096,2.181467,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,inland,5.045455,0.224625,2.560606,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,inland,6.114035,0.215208,3.122807,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,inland,5.205543,0.215173,2.325635,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,inland,5.329513,0.219892,2.123209,0


### Split the data
  * Split your data in train/val/test sets, with 60%/20%/20% distribution.
  * Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
  * Make sure that the target value (median_house_value) is not in your dataframe.

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
df_full_train, df_test =  train_test_split(df, test_size = 0.2, random_state = 42)
df_train, df_val =  train_test_split(df_full_train, test_size = 0.25, random_state = 42)

In [18]:
len(df), len(df_full_train), len(df_test), len(df_train), len(df_val)

(20640, 16512, 4128, 12384, 4128)

In [19]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

### Question 3
  * Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
  * What is the value of mutual information?
  * Round it to 2 decimal digits using round(score, 2)

In [20]:
from sklearn.metrics import mutual_info_score

In [21]:
def mutual_info_score_binarizedprice(series):
    return mutual_info_score(series, df_train.above_average)

The only categorical variable we have is ocean_proximity. Hence the mutual information score can be calculated as follows

In [22]:
score = df_train.apply(mutual_info_score_binarizedprice)
round(score,2)



longitude                   0.18
latitude                    0.16
housing_median_age          0.01
total_rooms                 0.27
total_bedrooms              0.09
population                  0.18
households                  0.08
median_income               0.52
ocean_proximity             0.10
rooms_per_household         0.65
bedrooms_per_room           0.65
population_per_household    0.64
above_average               0.68
dtype: float64

Thus the mutual information score for the categorical variable 'ocean_proximity' is 0.1

In [78]:
house_price_mean = y_train.mean()
house_price_mean

206807.7419250646

#### Question 4
  * Now let's train a logistic regression
  * Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
  * Fit the model on the training dataset.
    * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    * model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
  * Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [23]:
from sklearn.feature_extraction import DictVectorizer

In [24]:
dicts_train = df_train.to_dict(orient = 'records')

In [25]:
dv = DictVectorizer(sparse = False)

In [26]:
dv.fit(dicts_train)

In [27]:
dv.get_feature_names()



['above_average',
 'bedrooms_per_room',
 'households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=<1h_ocean',
 'ocean_proximity=inland',
 'ocean_proximity=island',
 'ocean_proximity=near_bay',
 'ocean_proximity=near_ocean',
 'population',
 'population_per_household',
 'rooms_per_household',
 'total_bedrooms',
 'total_rooms']

In [36]:
X_train = dv.fit_transform(dicts_train)

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [39]:
model.fit(X_train, y_train)

In [85]:
model.intercept_[0]

0.0017082953511576749

In [86]:
model.coef_[0].round(3)

array([-0.134, -0.009, -0.034, -0.053,  0.831,  0.255, -0.592, -0.201,
        0.345, -0.   , -0.096, -0.045,  0.002, -0.328, -0.049,  0.045,
       -0.013])

In [87]:
model.predict(X_train)

array([225000, 500001,  66300, ..., 214600, 187500, 162500], dtype=int64)

In [92]:
y_pred = model.predict_proba(X_train)[:, 1]

In [93]:
y_pred.mean()

7.858256345991753e-05

In [83]:
dicts_val = df_val.to_dict(orient = 'records')
X_val = dv.fit_transform(dicts_val)
y_pred = model.predict(X_val)
y_pred.mean()

276188.50339147286

In [82]:
above_avg_decision_pred = (y_pred >= house_price_mean)
above_avg_decision_pred.astype(int).mean()

0.0

In [68]:
y_val

array([ 96700,  75500, 430900, ..., 344200, 387800, 184200], dtype=int64)

In [96]:
above_avg_decision = (y_val >= house_price_mean)

In [98]:
above_avg_decision.astype(int).mean().round(2)

0.41

### Question 5
  * Let's find the least useful feature using the feature elimination technique.
  * Train a model with all these features (using the same parameters as in Q4).
  * Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
  * For each feature, calculate the difference between the original accuracy and the accuracy without the feature.