# Dataset

In [1]:
import pandas as pd
import numpy as np

    

In [2]:
# Read the housing CSV directly from its raw GitHub URL.
HOUSING_CSV_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
df = pd.read_csv(HOUSING_CSV_URL)

# Features

In [3]:
# Columns kept for the analysis (features plus the `median_house_value` target).
cols = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]
df_features = df.loc[:, cols]
df_features

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


# Data preparation

In [14]:
# Replace missing values with 0, then derive three ratio features from the
# filled columns. `assign` appends them in the order listed.
df_features_no_missing = df_features.fillna(0).assign(
    rooms_per_household=lambda d: d['total_rooms'] / d['households'],
    bedrooms_per_room=lambda d: d['total_bedrooms'] / d['total_rooms'],
    population_per_household=lambda d: d['population'] / d['households'],
)
df_features_no_missing

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.802260
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,0.224625,2.560606
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,0.215208,3.122807
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,0.215173,2.325635
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,0.219892,2.123209


# Question 1

In [24]:
# Most frequent `ocean_proximity` category; `mode()` returns a Series, so take its first entry.
ocean_proximity_mode = df_features_no_missing['ocean_proximity'].mode().iloc[0]
ocean_proximity_mode

'<1H OCEAN'

# Split the data

In [25]:
from sklearn.model_selection import train_test_split

In [32]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (= 20% of all rows) for validation. The fixed seed keeps the split reproducible.
df_full_train, df_test = train_test_split(
    df_features_no_missing,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [35]:
# The split keeps the original row labels; renumber each part from 0 and
# discard the old index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [38]:
# Pull the target out of each split as a NumPy array. `pop` returns the column
# and removes it from the frame in place, so the target is no longer among the
# feature columns afterwards.
y_train = df_train.pop('median_house_value').values
y_val = df_val.pop('median_house_value').values
y_test = df_test.pop('median_house_value').values

# Question 2

In [60]:
# Numerical columns to include in the correlation analysis.
numerical = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'rooms_per_household',
    'bedrooms_per_room',
    'population_per_household',
]

df_trail_num = df_train.loc[:, numerical]

# Pairwise correlations as absolute values: we want the strongest
# relationship regardless of whether it is positive or negative.
correlation_matrix = df_trail_num.corr().abs()
correlation_matrix

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
housing_median_age,1.0,0.363522,0.324156,0.292476,0.306119,0.119591,0.181275,0.129456,0.012167
total_rooms,0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,0.194185,0.029452
total_bedrooms,0.324156,0.931546,1.0,0.87734,0.979399,0.009833,0.010381,0.078094,0.034301
population,0.292476,0.853219,0.87734,1.0,0.906841,0.000849,0.07621,0.031592,0.064998
households,0.306119,0.921441,0.979399,0.906841,1.0,0.011925,0.085832,0.058004,0.032522
median_income,0.119591,0.198951,0.009833,0.000849,0.011925,1.0,0.394154,0.616617,0.000454
rooms_per_household,0.181275,0.168926,0.010381,0.07621,0.085832,0.394154,1.0,0.500589,0.001801
bedrooms_per_room,0.129456,0.194185,0.078094,0.031592,0.058004,0.616617,0.500589,1.0,0.002851
population_per_household,0.012167,0.029452,0.034301,0.064998,0.032522,0.000454,0.001801,0.002851,1.0


In [79]:
# Find the pair of *different* features with the highest absolute correlation.
#
# The matrix is symmetric with 1.0 on the diagonal, so only the strict upper
# triangle (k=1) is kept: every pair then appears exactly once and the
# self-correlations are excluded. This avoids de-duplicating by value, which
# would also drop distinct pairs that happen to share a correlation value and
# would pick the wrong row if any off-diagonal entry equalled 1.0.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

corr_matrix_sorted = (
    correlation_matrix.where(upper_triangle)  # everything outside the triangle becomes NaN
    .stack()
    .dropna()  # explicit: not every pandas version drops NaN inside stack()
    .sort_values(ascending=False)
)
df_corr_matrix_sorted = corr_matrix_sorted.to_frame()

# First row = most correlated pair; its name is the (row feature, column feature) tuple.
df_corr_matrix_sorted.iloc[0]

0    0.979399
Name: (households, total_bedrooms), dtype: float64

# Make `median_house_value` binary

In [96]:
# Turn the numeric target into a binary one: 1 when `median_house_value` is at
# or above its mean, 0 otherwise. The mean is taken over every row of
# df_features_no_missing; work on a copy so that frame stays untouched.
df_all_features = df_features_no_missing.copy()

median_house_value_mean = df_all_features['median_house_value'].mean()
df_all_features['above_average'] = (
    df_all_features['median_house_value'].ge(median_house_value_mean).astype(int)
)
df_all_features = df_all_features.drop(columns=['median_house_value'])

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,6.984127,0.146591,2.555556,1
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,6.238137,0.155797,2.109842,1
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,8.288136,0.129516,2.802260,1
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,5.817352,0.184458,2.547945,1
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,6.281853,0.172096,2.181467,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND,5.045455,0.224625,2.560606,0
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND,6.114035,0.215208,3.122807,0
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND,5.205543,0.215173,2.325635,0
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND,5.329513,0.219892,2.123209,0


In [109]:
from sklearn.model_selection import train_test_split

In [110]:
# Same 60/20/20 split as before, now on the frame with the binary target:
# 20% test, then 25% of the remaining 80% for validation, with a fixed seed.
df_full_train, df_test = train_test_split(
    df_all_features,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [111]:
# Renumber each split from 0, discarding the row labels inherited from the full frame.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [112]:
# Binary targets for each split as NumPy arrays. The column is left in the
# frames here and removed in a later step.
y_train, y_val, y_test = (
    part['above_average'].to_numpy() for part in (df_train, df_val, df_test)
)

# Question 3

In [113]:
from sklearn.metrics import mutual_info_score

# Mutual information between the categorical feature and the binary target,
# measured on the training split only.
score = mutual_info_score(df_train['ocean_proximity'], df_train['above_average'])

# Rounded to 3 decimals: rounding to 2 gives 0.1, which is not among the answer options.
round(score, 3)

0.101

# Question 4

In [114]:
# Remove the target column from every split so it cannot be used as a feature.
# This mutates the frames in place, so the cell can only run once per split:
# running it again raises KeyError because the column is already gone.
for part in (df_train, df_val, df_test):
    del part['above_average']

In [115]:
from sklearn.feature_extraction import DictVectorizer

In [127]:
# Convert each split to a list of per-row dicts, the input format DictVectorizer expects.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted mapping
# for validation so both matrices share the same columns.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [128]:
from sklearn.linear_model import LogisticRegression

In [126]:
# Baseline classifier trained on all features; the fixed random_state keeps
# the fit reproducible.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=42,
)

model.fit(X_train, y_train)

In [129]:
from sklearn.metrics import accuracy_score

In [130]:
# Score the trained model on the validation split.
y_pred = model.predict(X_val)

# Keep `accuracy` at full precision and round only what is printed. Storing the
# rounded value would skew any later comparison against this score by up to
# 0.005, which is larger than the effect of removing a single weak feature.
accuracy = accuracy_score(y_val, y_pred)
print(round(accuracy, 2))

0.84


# Question 5

In [151]:
def validation_accuracy(features):
    """Train a logistic regression on `features` and return its validation accuracy.

    Fits a fresh DictVectorizer and LogisticRegression on `df_train[features]`
    / `y_train` and scores on `df_val[features]` / `y_val`. Everything is kept
    local, so the vectorizer, model and matrices from the Question 4 cells are
    not overwritten.

    Parameters
    ----------
    features : list of str
        Column names of `df_train` / `df_val` to train on.

    Returns
    -------
    float
        Unrounded accuracy on the validation split.
    """
    vectorizer = DictVectorizer(sparse=False)
    X_fit = vectorizer.fit_transform(df_train[features].to_dict(orient='records'))
    X_eval = vectorizer.transform(df_val[features].to_dict(orient='records'))

    classifier = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    classifier.fit(X_fit, y_train)

    return accuracy_score(y_val, classifier.predict(X_eval))


features_orig = df_train.columns.to_list()

# Baseline with every feature, computed here at full precision so the
# per-feature differences below are not distorted by rounding.
accuracy_orig = validation_accuracy(features_orig)

# Leave one feature out at a time and record how the accuracy moves.
usefulness_rows = []
for feature in features_orig:
    features = [name for name in features_orig if name != feature]

    accuracy_without_feature = validation_accuracy(features)
    accuracy_diff = accuracy_orig - accuracy_without_feature

    usefulness_rows.append({
        'feature': feature,
        'accuracy_without_feature': accuracy_without_feature,
        'accuracy_diff': accuracy_diff,
        'accuracy_diff_abs': abs(accuracy_diff),
    })

# Build the frame once from the collected rows instead of growing it row by row.
df_feature_usefulness = pd.DataFrame(
    usefulness_rows,
    columns=['feature', 'accuracy_without_feature', 'accuracy_diff', 'accuracy_diff_abs'],
)

# Find the feature with the smallest difference among the candidate features.
# mergesort is stable, so ties are resolved by the original feature order
# rather than arbitrarily.
feature_filter = ['total_rooms', 'total_bedrooms', 'population', 'households']
df_feature_usefulness_ascending = (
    df_feature_usefulness[df_feature_usefulness['feature'].isin(feature_filter)]
    .sort_values(by=['accuracy_diff_abs'], kind='mergesort')
)
df_feature_usefulness_ascending.iloc[0]['feature']

'total_rooms'