In [87]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    GradientBoostingRegressor, 
    RandomForestRegressor
)
import joblib

In [88]:
housing = datasets.fetch_california_housing()

In [89]:
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [90]:
print(housing.target_names)

['MedHouseVal']


In [91]:
x = housing.data
y = housing.target

In [92]:
print("number of rows:", len(x))
print("number of columns:", len(housing.feature_names))

number of rows: 20640
number of columns: 8


In [93]:
print(x[0])
print(y[0])

[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
4.526


In [94]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=432
)

In [95]:
print("number of test samples:", len(x_test))
print("number of train samples:", len(x_train))

number of test samples: 4128
number of train samples: 16512


In [96]:
print(x_train[0])
print(y_train)


[   2.1442       52.            3.94886364    1.03693182  921.
    2.61647727   37.34       -121.88      ]
[1.889 1.173 3.179 ... 1.307 1.574 1.938]


In [97]:
model = LinearRegression()
model.fit(x_train,y_train)

In [98]:
y_pred = model.predict(x_test)
r2 = r2_score(y_test,y_pred)
print("r2 score:", r2)

r2 score: 0.6080229586580347


In [99]:
#baseline
#r2 score: 0.6080229586580347

In [100]:
print("old number of features:", len(x_train[0]))

old number of features: 8


In [101]:
poly = PolynomialFeatures()
x_train = poly.fit_transform(x_train)
x_test = poly.fit_transform(x_test)

In [102]:
print("old number of features:", len(x_train[0]))

old number of features: 45


In [103]:
model.fit(x_train,y_train) #train
y_pred = model.predict(x_test) #test
r2 = r2_score(y_test,y_pred) 

print("new r2 score:", r2)



new r2 score: 0.661024021201


In [104]:
# initialize models
LR = LinearRegression()
GBR = GradientBoostingRegressor()
RFR = RandomForestRegressor()

In [105]:
for model in [LR, GBR, RFR]:
    model.fit(x_train, y_train) # train
    y_pred = model.predict(x_test) # test
    r2 = r2_score(y_test, y_pred) # evaluate
    print("Model:", model)
    print("R2 score:", r2)
    print("-------------")

Model: LinearRegression()
R2 score: 0.661024021201
-------------
Model: GradientBoostingRegressor()
R2 score: 0.7919075332631215
-------------
Model: RandomForestRegressor()
R2 score: 0.8041492479675596
-------------


In [106]:
GBR = HistGradientBoostingRegressor()
RFR = RandomForestRegressor(
    n_jobs=-1
)

In [107]:
for model in [GBR, RFR]:
    model.fit(x_train, y_train) # train
    y_pred = model.predict(x_test) # test
    r2 = r2_score(y_test, y_pred) # evaluate
    print("Model:", model)
    print("R2 score:", r2)
    print("-------------")

Model: HistGradientBoostingRegressor()
R2 score: 0.8364843225412935
-------------
Model: RandomForestRegressor(n_jobs=-1)
R2 score: 0.8039571621176802
-------------


In [108]:
for i in [100, 200, 300, 400, 500]:
    model = HistGradientBoostingRegressor(
        max_iter=i
    )
    model.fit(x_train, y_train) # train
    y_pred = model.predict(x_test) # test
    r2 = r2_score(y_test, y_pred) # evaluate
    print("Number of trees:", i)
    print("R2 score:", r2)
    print("-------------")

Number of trees: 100
R2 score: 0.8336905637852174
-------------
Number of trees: 200
R2 score: 0.8445439941937778
-------------
Number of trees: 300
R2 score: 0.8457681481703487
-------------
Number of trees: 400
R2 score: 0.8473651815773411
-------------
Number of trees: 500
R2 score: 0.8362943644721282
-------------


In [109]:
for j in [0.1, 0.05, 0.001]:   
    for i in [100, 200, 300, 400, 500]:
        model = HistGradientBoostingRegressor(
            max_iter=i,
            learning_rate=j
        )
        model.fit(x_train, y_train) # train
        y_pred = model.predict(x_test) # test
        r2 = r2_score(y_test, y_pred) # evaluate
        print("Learning rate:", j)
        print("Number of trees:", i)
        print("R2 score:", r2)
        print("-------------")

Learning rate: 0.1
Number of trees: 100
R2 score: 0.8374014940900566
-------------
Learning rate: 0.1
Number of trees: 200
R2 score: 0.8435007430722015
-------------
Learning rate: 0.1
Number of trees: 300
R2 score: 0.8436840436778685
-------------
Learning rate: 0.1
Number of trees: 400
R2 score: 0.8432497022526845
-------------
Learning rate: 0.1
Number of trees: 500
R2 score: 0.8486830678074846
-------------
Learning rate: 0.05
Number of trees: 100
R2 score: 0.8233164449937116
-------------
Learning rate: 0.05
Number of trees: 200
R2 score: 0.8371094641855741
-------------
Learning rate: 0.05
Number of trees: 300
R2 score: 0.8444774664755369
-------------
Learning rate: 0.05
Number of trees: 400
R2 score: 0.8456216271913235
-------------
Learning rate: 0.05
Number of trees: 500
R2 score: 0.8467704463804773
-------------
Learning rate: 0.001
Number of trees: 100
R2 score: 0.11997373364862751
-------------
Learning rate: 0.001
Number of trees: 200
R2 score: 0.21851636040992706
-------

### Model saving

Best parameters:

max_iter=500

learning_rate=0.05

In [110]:
model = HistGradientBoostingRegressor(
    max_iter=500,
    learning_rate=0.05
)
model.fit(x_train, y_train)

joblib.dump(model, "my_model.joblib")

['my_model.joblib']

In [111]:
# evaluate existing model
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("existing model score:", r2)

existing model score: 0.8423961701898861


In [112]:
saved_model = joblib.load("my_model.joblib")

y_pred = saved_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("saved model score:", r2)

saved model score: 0.8423961701898861



- try optimizing more than 2 hyperparameters, reaching scores above 86%
- instead of simply printing the R2 scores, try visualising them with Pandas or Matplotlib
- Find even better algorithms that outperform HistGradientBoostingRegressor
- Find a different dataset and analyze it like we've analyzed this one.