In [119]:
import pandas as pd
import numpy as np

import xgboost as xgb

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text

from sklearn.ensemble import RandomForestRegressor

In [141]:
df = pd.read_csv(r'/home/zaib/ml-zoomcamp/housing.csv')

In [142]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


**preparing the dataset**

In [143]:
df = df[(df.ocean_proximity == '<1H OCEAN') | 
        (df.ocean_proximity == 'INLAND')]

df = df.fillna(0)

df.median_house_value = np.log1p(df.median_house_value)

In [144]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [145]:
y_full_train = df_full_train.median_house_value.values
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values


del df_full_train['median_house_value']
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [146]:
full_train_dicts = df_full_train.to_dict(orient='records')
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

dv = DictVectorizer(sparse=True)

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

**question 1**

In [147]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [148]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



==> ocean_proximity

**question 2**

In [149]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

In [150]:
y_pred = rf.predict(X_val)

In [151]:
rmse = np.sqrt((mean_squared_error(y_val, y_pred)))
rmse

0.2449529003059715

==> 0.245

**question 3**

In [152]:
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, 
                               n_jobs=-1)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_val)
    
    rmse = np.sqrt((mean_squared_error(y_val, y_pred)))
    
    print(('%3s -> %.3f') % (n, rmse))

 10 -> 0.245
 20 -> 0.238
 30 -> 0.237
 40 -> 0.235
 50 -> 0.235
 60 -> 0.235
 70 -> 0.234
 80 -> 0.235
 90 -> 0.234
100 -> 0.234
110 -> 0.234
120 -> 0.234
130 -> 0.234
140 -> 0.234
150 -> 0.234
160 -> 0.233
170 -> 0.233
180 -> 0.234
190 -> 0.234
200 -> 0.234


==> n_estimators = 10

**question 4**

In [153]:
scores = []

for d in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, 
                                   random_state=1, 
                                   n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)

        rmse = np.sqrt((mean_squared_error(y_val, y_pred)))

        scores.append((d, n, rmse))
        
columns = ['max_depth', 'n_estimators', 'rmse']
df_scores = pd.DataFrame(scores, columns=columns)

In [154]:
df_scores.sort_values(by='rmse', ascending=False)

Unnamed: 0,max_depth,n_estimators,rmse
0,10,10,0.250682
1,10,20,0.247455
2,10,30,0.246264
20,15,10,0.245727
7,10,80,0.245536
...,...,...,...
73,25,140,0.233635
77,25,180,0.233572
74,25,150,0.233535
75,25,160,0.233395


==> best max_depth = 10

**question 5**

In [155]:
n_estimators=10
max_depth=20

rf = RandomForestRegressor(n_estimators=n_estimators, 
                           max_depth=max_depth, 
                           random_state=1, 
                           n_jobs=-1)


rf.fit(X_train, y_train)

In [156]:
impurities = rf.feature_importances_
impurities

array([0.01495348, 0.03019435, 0.10189142, 0.0861758 , 0.33566849,
       0.29247297, 0.07380644, 0.02682069, 0.01624481, 0.02177155])

In [157]:
list(impurities).index(max(impurities))

4

In [175]:
list(dv.get_feature_names_out())

['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=<1H OCEAN',
 'ocean_proximity=INLAND',
 'population',
 'total_bedrooms',
 'total_rooms']

==> tmedian_income is the most important feature

**question 6**

xgboost is already installed

the comand to do it in the notebook is:

!pip install xgboost 

then one must exectute the following line to use it:

import xgboost as xgb

In [168]:
features = list(dv.get_feature_names_out())
features[5] = features[5].replace('=<', '_less_than_or_equal_to_')

In [169]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [170]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [171]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                  evals=watchlist)

y_pred = model.predict(dval)

[0]	train-rmse:0.44350	val-rmse:0.44250
[1]	train-rmse:0.36599	val-rmse:0.36916
[2]	train-rmse:0.31556	val-rmse:0.32382
[3]	train-rmse:0.28541	val-rmse:0.29965
[4]	train-rmse:0.26573	val-rmse:0.28417
[5]	train-rmse:0.25338	val-rmse:0.27463
[6]	train-rmse:0.24157	val-rmse:0.26770
[7]	train-rmse:0.23486	val-rmse:0.26423
[8]	train-rmse:0.22668	val-rmse:0.25820
[9]	train-rmse:0.21995	val-rmse:0.25444
[10]	train-rmse:0.21444	val-rmse:0.25179
[11]	train-rmse:0.21155	val-rmse:0.25054
[12]	train-rmse:0.20729	val-rmse:0.24791
[13]	train-rmse:0.20317	val-rmse:0.24628
[14]	train-rmse:0.20135	val-rmse:0.24630
[15]	train-rmse:0.19858	val-rmse:0.24522
[16]	train-rmse:0.19580	val-rmse:0.24445
[17]	train-rmse:0.19333	val-rmse:0.24395
[18]	train-rmse:0.19107	val-rmse:0.24284
[19]	train-rmse:0.18735	val-rmse:0.24127
[20]	train-rmse:0.18524	val-rmse:0.23978
[21]	train-rmse:0.18332	val-rmse:0.23963
[22]	train-rmse:0.18189	val-rmse:0.23892
[23]	train-rmse:0.18006	val-rmse:0.23847
[24]	train-rmse:0.17869	va

In [172]:
rmse = np.sqrt((mean_squared_error(y_val, y_pred)))
rmse

0.228623199980106

In [173]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100, 
                  evals=watchlist)

y_pred = model.predict(dval)

[0]	train-rmse:0.52449	val-rmse:0.52045
[1]	train-rmse:0.48736	val-rmse:0.48443
[2]	train-rmse:0.45433	val-rmse:0.45293
[3]	train-rmse:0.42533	val-rmse:0.42550
[4]	train-rmse:0.39987	val-rmse:0.40144
[5]	train-rmse:0.37822	val-rmse:0.38151
[6]	train-rmse:0.35887	val-rmse:0.36393
[7]	train-rmse:0.34177	val-rmse:0.34838
[8]	train-rmse:0.32701	val-rmse:0.33492
[9]	train-rmse:0.31411	val-rmse:0.32333
[10]	train-rmse:0.30326	val-rmse:0.31427
[11]	train-rmse:0.29355	val-rmse:0.30615
[12]	train-rmse:0.28519	val-rmse:0.29922
[13]	train-rmse:0.27760	val-rmse:0.29269
[14]	train-rmse:0.27116	val-rmse:0.28796
[15]	train-rmse:0.26538	val-rmse:0.28380
[16]	train-rmse:0.26012	val-rmse:0.27970
[17]	train-rmse:0.25583	val-rmse:0.27661
[18]	train-rmse:0.25203	val-rmse:0.27388
[19]	train-rmse:0.24818	val-rmse:0.27123
[20]	train-rmse:0.24512	val-rmse:0.26882
[21]	train-rmse:0.24149	val-rmse:0.26669
[22]	train-rmse:0.23895	val-rmse:0.26478
[23]	train-rmse:0.23594	val-rmse:0.26331
[24]	train-rmse:0.23305	va

In [174]:
rmse = np.sqrt((mean_squared_error(y_val, y_pred)))
rmse

0.23208927121609343

==> best rmse with eta = 0.1