In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv("../data/housing.csv")

In [3]:
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [4]:
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

In [5]:
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN    9136
INLAND       6551
Name: count, dtype: int64

In [6]:
df.fillna(0, inplace=True)
df['median_house_value'] = np.log1p(df['median_house_value'])

In [7]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_valid = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [10]:
y_train = df_train.pop('median_house_value')
y_valid = df_valid.pop('median_house_value')
y_test = df_test.pop('median_house_value')

KeyError: 'median_house_value'

In [11]:
dv = DictVectorizer(sparse=False)
train_dicts = df_train.to_dict(orient='records')
valid_dicts = df_valid.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_valid = dv.transform(valid_dicts)
X_test = dv.transform(test_dicts)

In [12]:
from sklearn.tree import DecisionTreeRegressor

In [13]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [14]:
from sklearn.tree import export_text

In [15]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



In [16]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

In [20]:
from sklearn.metrics import mean_squared_error

y_preds = rf.predict(X_valid)

np.sqrt(mean_squared_error(y_valid, y_preds))

0.24491083521701304

In [22]:
for n_estimators in np.arange(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_preds = rf.predict(X_valid)
    print(f"n_estimators={n_estimators}, RMSE={np.sqrt(mean_squared_error(y_valid, y_preds))}")

n_estimators=10, RMSE=0.244910835217013
n_estimators=20, RMSE=0.23838383809695302
n_estimators=30, RMSE=0.23613492096113048
n_estimators=40, RMSE=0.23467126593123264
n_estimators=50, RMSE=0.23443881201129232
n_estimators=60, RMSE=0.23402332047426744
n_estimators=70, RMSE=0.23401263090670457
n_estimators=80, RMSE=0.23431093830996508
n_estimators=90, RMSE=0.23428989607895007
n_estimators=100, RMSE=0.2341775609617507
n_estimators=110, RMSE=0.23420968124204716
n_estimators=120, RMSE=0.23392295616353403
n_estimators=130, RMSE=0.2337861355701311
n_estimators=140, RMSE=0.23361781805268358
n_estimators=150, RMSE=0.23349628282409648
n_estimators=160, RMSE=0.2333340541277322
n_estimators=170, RMSE=0.23332086448035927
n_estimators=180, RMSE=0.23356869005302944
n_estimators=190, RMSE=0.23384749873832764
n_estimators=200, RMSE=0.23381154821605382


In [23]:
scores = []
for max_depth in [10, 15, 20, 25]:
    for n_estimators in np.arange(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_preds = rf.predict(X_valid)
        score = np.sqrt(mean_squared_error(y_valid, y_preds))
        scores.append((max_depth, n_estimators, score))

In [28]:
sc = pd.DataFrame(scores)
sc.columns = ['max_depth', 'n_estimators', 'RMSE']
sc

Unnamed: 0,max_depth,n_estimators,RMSE
0,10,10,0.244911
1,10,20,0.238384
2,10,30,0.236135
3,10,40,0.234671
4,10,50,0.234439
...,...,...,...
75,25,160,0.233334
76,25,170,0.233321
77,25,180,0.233569
78,25,190,0.233847


In [30]:
sc.groupby('max_depth').agg({'RMSE': ['count', 'mean']})

Unnamed: 0_level_0,RMSE,RMSE
Unnamed: 0_level_1,count,mean
max_depth,Unnamed: 1_level_2,Unnamed: 2_level_2
10,20,0.234813
15,20,0.234813
20,20,0.234813
25,20,0.234813


In [31]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_preds = rf.predict(X_valid)
score = np.sqrt(mean_squared_error(y_valid, y_preds))

In [35]:
list(zip(rf.feature_importances_, dv.get_feature_names_out()))

[(0.014464729672647775, 'households'),
 (0.03001847738030148, 'housing_median_age'),
 (0.10249128069337625, 'latitude'),
 (0.08603838292900823, 'longitude'),
 (0.3355403365620536, 'median_income'),
 (0.2188198543307832, 'ocean_proximity=<1H OCEAN'),
 (0.14745955406828115, 'ocean_proximity=INLAND'),
 (0.028272427270634358, 'population'),
 (0.015736633318908736, 'total_bedrooms'),
 (0.021158323774005144, 'total_rooms')]

In [33]:
dv.get_feature_names_out()

array(['households', 'housing_median_age', 'latitude', 'longitude',
       'median_income', 'ocean_proximity=<1H OCEAN',
       'ocean_proximity=INLAND', 'population', 'total_bedrooms',
       'total_rooms'], dtype=object)