In [27]:
import pandas as pd
from scipy import stats
import random
from sklearn.ensemble import RandomForestClassifier

In [32]:
housing_df = pd.read_csv("./housing_dataset.csv")
housing_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.74,39.71,16.0,255.0,73.0,85.0,38.0,1.6607,14999.0,INLAND
1,-117.02,36.40,19.0,619.0,239.0,490.0,164.0,2.1000,14999.0,INLAND
2,-117.86,34.24,52.0,803.0,267.0,628.0,225.0,4.1932,14999.0,INLAND
3,-123.17,40.31,36.0,98.0,28.0,18.0,8.0,0.5360,14999.0,INLAND
4,-118.33,34.15,39.0,493.0,168.0,259.0,138.0,2.3667,17500.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
20635,-118.90,34.14,35.0,1503.0,263.0,576.0,216.0,5.1457,500001.0,<1H OCEAN
20636,-118.69,34.18,11.0,1177.0,138.0,415.0,119.0,10.0472,500001.0,<1H OCEAN
20637,-118.80,34.19,4.0,15572.0,2222.0,5495.0,2152.0,8.6499,500001.0,<1H OCEAN
20638,-118.69,34.21,10.0,3663.0,409.0,1179.0,371.0,12.5420,500001.0,<1H OCEAN


In [33]:
housing_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20629.0,20625.0,20433.0,20621.0,20626.0,20632.0,20560.0
mean,-119.569704,35.631861,28.63338,2636.314812,537.870553,1425.78551,499.675555,3.871003,207334.239348
std,2.003532,2.135952,12.584167,2182.184573,421.38507,1132.799376,382.405161,1.899969,115352.171422
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1448.0,296.0,787.0,280.0,2.5634,120600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,410.0,3.53525,180400.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.7437,265225.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [34]:
housing_df.isnull().sum()

longitude               0
latitude                0
housing_median_age     11
total_rooms            15
total_bedrooms        207
population             19
households             14
median_income           8
median_house_value     80
ocean_proximity       154
dtype: int64

In [9]:
Q1 = housing_df["median_house_value"].quantile(0.25)
Q3 = housing_df["median_house_value"].quantile(0.75)
IQR = Q3 - Q1

LB = Q1 - 1.5 * IQR
UB = Q3 + 1.5 * IQR

outliers = housing_df[(housing_df["median_house_value"] < LB) | (housing_df["median_house_value"] > UB)]
outliers

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
19568,-118.37,33.81,33.0,5057.0,790.0,2021.0,748.0,6.8553,482200.0,NEAR OCEAN
19569,-117.19,32.69,35.0,2921.0,438.0,1042.0,415.0,6.3612,482700.0,NEAR OCEAN
19570,-117.87,33.60,34.0,3415.0,779.0,1275.0,718.0,4.4980,482900.0,<1H OCEAN
19571,-118.59,34.14,19.0,1303.0,155.0,450.0,145.0,10.5511,483100.0,<1H OCEAN
19572,-122.18,37.81,30.0,292.0,38.0,126.0,52.0,6.3624,483300.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-118.90,34.14,35.0,1503.0,263.0,576.0,216.0,5.1457,500001.0,<1H OCEAN
20636,-118.69,34.18,11.0,1177.0,138.0,415.0,119.0,10.0472,500001.0,<1H OCEAN
20637,-118.80,34.19,4.0,15572.0,2222.0,5495.0,2152.0,8.6499,500001.0,<1H OCEAN
20638,-118.69,34.21,10.0,3663.0,409.0,1179.0,371.0,12.5420,500001.0,<1H OCEAN


In [18]:
z_scores = stats.zscore(housing_df["median_house_value"])
threshold = 2
outliers_zscore = housing_df[abs(z_scores) > threshold]
outliers_zscore

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity


In [19]:
mean_value = housing_df['total_bedrooms'].mean()
housing_df['total_bedrooms'].fillna(mean_value, inplace=True)




In [20]:
median_value = housing_df['total_bedrooms'].median()
housing_df['total_bedrooms'].fillna(median_value, inplace=True)

In [21]:
# Group by 'ocean_proximity' and impute missing 'total_bedrooms' values using the mode within each group
housing_df['total_bedrooms'] = housing_df.groupby('ocean_proximity')['total_bedrooms'].transform(lambda x: x.fillna(x.mode().iloc[0]))


In [36]:
with_missing = housing_df[housing_df['ocean_proximity'].isnull()]
without_missing = housing_df[~housing_df['ocean_proximity'].isnull()]
with_missing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8028,-118.21,34.04,52.0,846.0,271.0,1153.0,281.0,2.1923,155000.0,
8029,-118.05,33.90,36.0,1047.0,227.0,975.0,239.0,3.1897,155000.0,
8030,-118.18,33.90,25.0,1709.0,442.0,1177.0,410.0,2.4333,155000.0,
8031,-118.18,33.77,39.0,1645.0,547.0,1339.0,499.0,1.5536,155000.0,
8032,-118.66,34.43,9.0,2356.0,469.0,1556.0,386.0,3.7750,155000.0,
...,...,...,...,...,...,...,...,...,...,...
20607,-122.10,37.36,35.0,2063.0,266.0,676.0,252.0,8.5294,500001.0,
20608,-122.09,37.35,37.0,1795.0,285.0,791.0,261.0,7.5794,500001.0,
20609,-122.09,37.35,30.0,1502.0,186.0,501.0,180.0,10.0259,500001.0,
20610,-122.14,37.36,23.0,11294.0,1377.0,3840.0,1367.0,12.1387,500001.0,


In [37]:
with_missing = housing_df[housing_df['ocean_proximity'].isnull()]
without_missing = housing_df[~housing_df['ocean_proximity'].isnull()]

# Train a model to predict the missing values
clf = RandomForestClassifier()
clf.fit(without_missing[['longitude', 'latitude']], without_missing['ocean_proximity'])
predictions = clf.predict(with_missing[['longitude', 'latitude']])

# Fill in missing values with the predicted values
housing_df.loc[housing_df['ocean_proximity'].isnull(), 'ocean_proximity'] = predictions

In [40]:
housing_df.isnull().sum()

longitude               0
latitude                0
housing_median_age     11
total_rooms            15
total_bedrooms        207
population             19
households             14
median_income           8
median_house_value     80
ocean_proximity         0
dtype: int64