In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [12]:
df = pd.read_csv("housing.csv")

In [13]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [14]:
df = df[['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 
        'households','median_income','median_house_value', 'ocean_proximity']]
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [15]:
print(df.columns[df.isnull().any().tolist()].tolist())

['total_bedrooms']


In [16]:
df['total_bedrooms'].fillna(0, inplace=True)

In [17]:
df['rooms_per_household'] = df['total_rooms']/df['households']
df['bedrooms_per_room'] = df['total_bedrooms']/df['total_rooms']
df['population_per_household'] = df['population']/df['households']

In [18]:
df['ocean_proximity'].mode()[0]

'<1H OCEAN'

In [21]:
X = df.drop(columns=['median_house_value'], axis=1).copy()
y = df['median_house_value']

X_full, X_test, y_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.25, random_state=42)

X_train.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [22]:
cols = X_train.select_dtypes(exclude='object').columns.tolist()
df[cols].corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.924664,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,0.106389,-0.104112,0.002366
longitude,-0.924664,1.0,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.02754,0.084836,0.002476
housing_median_age,0.011173,-0.108197,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,-0.153277,0.125396,0.013191
total_rooms,-0.0361,0.044568,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.133798,-0.174583,-0.024581
total_bedrooms,-0.065318,0.068082,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.002717,0.122205,-0.028019
population,-0.108785,0.099773,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.072213,0.031397,0.069863
households,-0.071035,0.05531,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,-0.080598,0.059818,-0.027309
median_income,-0.079809,-0.015176,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.326895,-0.573836,0.018766
rooms_per_household,0.106389,-0.02754,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,1.0,-0.387465,-0.004852
bedrooms_per_room,-0.104112,0.084836,0.125396,-0.174583,0.122205,0.031397,0.059818,-0.573836,-0.387465,1.0,0.003047


In [23]:
def binf(var):
    avg = np.mean(var)
    above_avg = (var > avg).astype('int')
    return above_avg

y_train_bin = binf(y_train)
y_val_bin = binf(y_val)
y_test_bin = binf(y_test)

In [31]:
inf_score = mutual_info_score(X_train['ocean_proximity'], y_train_bin)
inf_score

0.10138385763624205

In [27]:
X_train_dict = X_train.to_dict(orient='records')
X_val_dict = X_val.to_dict(orient='records')
X_test_dict = X_test.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
df_train = dv.fit_transform(X_train_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(df_train, y_train_bin)

In [30]:
df_val = dv.transform(X_val_dict)
y_preds = model.predict_proba(df_val)[:, 1]

accuracy = (y_val_bin == (y_preds>=0.5)).mean()
accuracy

0.8381782945736435

In [32]:
X_train_trans = dv.fit_transform(X_train_dict)
X_val_trans = dv.fit_transform(X_val_dict)
X_test_trans = dv.fit_transform(X_test_dict)

feature = dv.get_feature_names_out().tolist()

df_train = pd.DataFrame(X_train_trans, columns=feature)
df_val = pd.DataFrame(X_val_trans, columns=feature)
df_test = pd.DataFrame(X_test_trans, columns=feature)

In [33]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(df_train, y_train_bin)
y_preds = model.predict(df_val)
accuracy = (y_val_bin == y_preds).mean()
accuracy

0.8381782945736435

In [38]:
def check_diff(col):
    train_data = df_train.drop(col, axis=1)
    test_data = df_val.drop(col, axis=1)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(train_data, y_train_bin)
    predictions = model.predict(test_data)
    score = (y_val_bin == predictions).mean()

    return  abs(accuracy - score)

diff = {}
for column in feature:
    diff[column] = check_diff(column)
    
diff

{'bedrooms_per_room': 0.0021802325581395943,
 'households': 0.003875968992248069,
 'housing_median_age': 0.007509689922480689,
 'latitude': 0.0029069767441860517,
 'longitude': 0.002664728682170603,
 'median_income': 0.05159883720930236,
 'ocean_proximity=<1H OCEAN': 0.002422480620155043,
 'ocean_proximity=INLAND': 0.0016957364341085857,
 'ocean_proximity=ISLAND': 0.0009689922480620172,
 'ocean_proximity=NEAR BAY': 0.002422480620155043,
 'ocean_proximity=NEAR OCEAN': 0.00363372093023262,
 'population': 0.01065891472868219,
 'population_per_household': 0.0016957364341085857,
 'rooms_per_household': 0.001211240310077577,
 'total_bedrooms': 0.0019379844961240345,
 'total_rooms': 0.0004844961240310086}

In [43]:
alpha = [0, 0.01, 0.1, 1, 10]
for a in alpha:
    model = Ridge(alpha=a, solver='sag', random_state=42)
    model.fit(df_train, np.log1p(y_train))
    y_val_preds = model.predict(df_val)
    mse = mean_squared_error(np.log1p(y_val), y_val_preds)
    print("alpha :: %f, RMSE::%.7f" %(a, np.sqrt(mse)))

alpha :: 0.000000, RMSE::0.5240636
alpha :: 0.010000, RMSE::0.5240636
alpha :: 0.100000, RMSE::0.5240636
alpha :: 1.000000, RMSE::0.5240636
alpha :: 10.000000, RMSE::0.5240636
