# TPOT California housing

- TPOT project: https://github.com/EpistasisLab/tpot/

- California housing dataset description: https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html

## Setup libraries

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import fetch_california_housing

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [6]:
from tpot import TPOTRegressor
from tpot.config import regressor_config_dict



In [7]:
# Shared seed: passed to DataFrame.sample, train_test_split and TPOTRegressor
# in the cells below so that those steps are repeatable.
random_state = 786

## Get data

In [8]:
# Fetch the California housing dataset. as_frame=True is requested so that the
# features and target are exposed as pandas objects (used as such below).
california_housing_data = fetch_california_housing(as_frame=True)

In [9]:
# Feature table of the dataset. This is plain attribute access, so the name
# refers to the same object as california_housing_data.data (no copy is made).
california_housing = california_housing_data.data

In [10]:
# Attach the target as an extra "target" column.
# assign() returns a new DataFrame, so the feature table stored on
# california_housing_data (which california_housing referred to until now) is
# not modified in place, and re-running this cell yields the same result.
california_housing = california_housing.assign(
    target=california_housing_data.target
)

In [11]:
# Count how many columns share each dtype.
california_housing.dtypes.value_counts()

float64    9
dtype: int64

In [12]:
# List the column labels (the features plus the added "target" column).
california_housing.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'target'],
      dtype='object')

Split the data: a random 90% of the rows is kept for training and evaluation, and the remaining 10% is held out as unseen data for predictions.

In [13]:
# Randomly keep 90% of the rows for modelling (seeded for repeatability) ...
housing_data = california_housing.sample(random_state=random_state, frac=0.9)
# ... and set aside every row that was not sampled as unseen data.
data_unseen = california_housing.drop(index=housing_data.index)

In [14]:
# Report the (rows, columns) shape of both partitions.
f"training data: {housing_data.shape}, prediction data: {data_unseen.shape}"

'training data: (18576, 9), prediction data: (2064, 9)'

In [15]:
# Preview the first rows of the modelling partition.
housing_data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
10877,3.9844,38.0,5.403042,1.140684,1236.0,4.69962,33.72,-117.88,1.653
16711,3.5682,10.0,6.100875,1.023907,4660.0,2.717201,35.03,-120.5,2.773
13597,2.2917,44.0,3.514019,0.813084,273.0,2.551402,34.09,-117.28,0.908
17134,5.6234,42.0,5.334225,1.101604,794.0,2.122995,37.46,-122.15,3.796
4392,2.5521,38.0,2.859848,1.011364,2899.0,3.660354,34.08,-118.27,1.575


## Get TPOT data

In [16]:
# Feature matrix only: the frame without its "target" column.
# The result is displayed, not stored; housing_data itself is unchanged.
housing_data.drop(columns="target")

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
10877,3.9844,38.0,5.403042,1.140684,1236.0,4.699620,33.72,-117.88
16711,3.5682,10.0,6.100875,1.023907,4660.0,2.717201,35.03,-120.50
13597,2.2917,44.0,3.514019,0.813084,273.0,2.551402,34.09,-117.28
17134,5.6234,42.0,5.334225,1.101604,794.0,2.122995,37.46,-122.15
4392,2.5521,38.0,2.859848,1.011364,2899.0,3.660354,34.08,-118.27
...,...,...,...,...,...,...,...,...
15026,2.6678,34.0,5.076220,1.102134,1592.0,2.426829,32.77,-117.01
11161,2.4609,23.0,4.344000,1.072000,825.0,3.300000,33.83,-118.01
19386,2.4717,37.0,4.748634,1.101093,936.0,2.557377,37.77,-120.85
10903,1.9309,31.0,3.762821,1.100427,1810.0,3.867521,33.75,-117.86


## Use TPOT without LightGBM

In [17]:
# Hold out 25% of housing_data to score the pipeline that TPOT finds.
X_train, X_test, y_train, y_test = train_test_split(
    housing_data.drop(columns="target"),
    housing_data["target"],
    random_state=random_state,
    train_size=0.75,
    test_size=0.25,
)

# Evolutionary search using the built-in "TPOT light" configuration.
# Takes about 5 minutes to execute.
tpot = TPOTRegressor(
    config_dict="TPOT light",
    generations=5,
    population_size=50,
    random_state=random_state,
    verbosity=2,
)
tpot.fit(X_train, y_train)

# Print the hold-out score of the best pipeline, then write that pipeline
# out as a standalone Python script.
print(tpot.score(X_test, y_test))
tpot.export("tpot_ca_housing.py")

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.3511771786398378

Generation 2 - Current best internal CV score: -0.35004596761106127

Generation 3 - Current best internal CV score: -0.35004596761106127

Generation 4 - Current best internal CV score: -0.3418461310118981

Generation 5 - Current best internal CV score: -0.3418461310118981

Best pipeline: DecisionTreeRegressor(LassoLarsCV(StandardScaler(input_matrix), normalize=False), max_depth=9, min_samples_leaf=17, min_samples_split=3)
-0.3657954855098522


## Use TPOT with LightGBMRegressor

In [18]:
# Start from an empty search space. Note that this rebinds the name imported
# from tpot.config in the setup cells, so whatever that imported dictionary
# contained is no longer reachable under this name.
regressor_config_dict = {}

In [19]:
# Candidate hyper-parameter values registered under the LightGBM regressor's
# import path; each key maps to the list of values that may be tried.
regressor_config_dict["lightgbm.LGBMRegressor"] = dict(
    boosting_type=["gbdt", "dart"],
    min_child_samples=[1, 5, 7, 10, 15, 20, 35, 50, 100, 200, 500, 1000],
    num_leaves=[
        2, 4, 7, 10, 15, 20, 25, 30, 35, 40,
        50, 65, 80, 100, 125, 150, 200, 250, 500,
    ],
    colsample_bytree=[0.7, 0.9, 1.0],
    subsample=[0.7, 0.9, 1.0],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[
        5, 20, 35, 50, 75, 100, 150,
        200, 350, 500, 750, 1000, 1500, 2000,
    ],
)

In [20]:
# Seeded 75/25 split of housing_data, with the "target" column as the label.
X_train, X_test, y_train, y_test = train_test_split(
    housing_data.drop(columns="target"),
    housing_data["target"],
    random_state=random_state,
    train_size=0.75,
    test_size=0.25,
)

**Warning:** this search is very slow — fitting takes more than 1 hour.

In [21]:
# Small search (2 generations, population of 20) driven by the custom
# configuration dictionary defined above instead of a built-in preset.
tpot = TPOTRegressor(
    config_dict=regressor_config_dict,
    generations=2,
    population_size=20,
    random_state=random_state,
    verbosity=2,
)

In [22]:
# Left disabled: the note above reports that this search takes more than an
# hour. Uncomment to run it.
# tpot.fit(X_train, y_train)

In [23]:
# Left disabled together with the fit call; both lines need a fitted `tpot`.
# print(tpot.score(X_test, y_test))
# tpot.export("tpot_ca_housing2.py")

Use the pipeline that TPOT generated, which is built around LGBMRegressor

In [24]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

In [25]:
# Alias, not a copy: tpot_data is the same DataFrame object as housing_data.
tpot_data = housing_data

In [26]:
# Separate the predictors from the target, then make a seeded train/test split.
# No sizes are passed, so train_test_split's default proportions apply.
# The notebook-wide `random_state` variable is used instead of repeating the
# literal 786, so changing the seed in one place affects this split as well.
features = tpot_data.drop("target", axis=1)
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data["target"], random_state=random_state)

In [27]:
# Check the (rows, columns) shape of the training and testing feature sets.
training_features.shape, testing_features.shape

((13932, 8), (4644, 8))

In [28]:
# Average CV score on the training set was: -0.18957608561214184
# Two-step pipeline:
#   1. a union of two FunctionTransformer(copy) steps, i.e. two pass-through
#      copies of the input placed side by side;
#   2. a LightGBM regressor using DART boosting with the hyper-parameters below.
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    LGBMRegressor(
        boosting_type="dart",
        colsample_bytree=0.7,
        learning_rate=0.1,
        min_child_samples=10,
        n_estimators=1000,
        num_leaves=25,
        subsample=0.7,
    ),
)
# Fix random state for all the steps in exported pipeline. The notebook-wide
# `random_state` variable is used rather than a second hard-coded 786, so the
# seed is controlled from a single place.
set_param_recursive(exported_pipeline.steps, "random_state", random_state)

In [29]:
# Train on the training partition and predict the held-out features.
# Pipeline.fit returns the fitted pipeline itself, so the calls are chained.
results = exported_pipeline.fit(training_features, training_target).predict(
    testing_features
)

In [30]:
# Display the predicted target values for the testing features.
results

array([1.03853863, 1.70204281, 0.80637984, ..., 1.41110387, 2.4489894 ,
       2.84154706])

In [31]:
# Compare the predictions with the held-out targets using two metrics.
mse, r2 = (
    mean_squared_error(testing_target, results),
    r2_score(testing_target, results),
)

In [32]:
# Report both hold-out metrics on one line.
print(f'mse: {mse}, r2: {r2}')

mse: 0.21549949177407549, r2: 0.8406993781824423
