In [None]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing, load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from autofeat import AutoFeatRegressor

%load_ext autoreload
%autoreload 2

In [None]:
# names of the benchmark datasets that the cells below iterate over; every entry has to be
# a name that load_regression_dataset accepts ("forest_fires" is supported there but not listed)
datasets = ["diabetes", "california_housing", "concrete", "airfoil", "wine_quality"]


# same interface for loading all datasets - adapt the datapath
# to where you've downloaded (and renamed) the datasets
def load_regression_dataset(name, datapath="../my_datasets/regression/"):
    """
    Load one of the benchmark regression datasets.

    Inputs:
        - name: "california_housing", "diabetes", "concrete", "forest_fires",
                "wine_quality" or "airfoil"
        - datapath: folder with the downloaded (and renamed) data files; only read
                    for the file based datasets, not for the two sklearn ones
    Returns:
        - X: features as a float numpy array
        - y: target as a float numpy array
        - units: dict with units of the features (empty except for "airfoil")
    Raises:
        - RuntimeError if name is none of the datasets above
    """

    def read_table(filename, **read_kwargs):
        # every file based dataset sits directly in datapath
        return pd.read_csv(os.path.join(datapath, filename), **read_kwargs)

    units = {}

    if name == "california_housing":
        # california housing dataset shipped with sklearn
        X, y = fetch_california_housing(return_X_y=True)

    elif name == "diabetes":
        # diabetes dataset shipped with sklearn
        X, y = load_diabetes(return_X_y=True)

    elif name == "concrete":
        # https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength
        # inputs (all quantitative):
        #   components 1-7, each in kg in a m3 mixture: Cement, Blast Furnace Slag, Fly Ash,
        #   Water, Superplasticizer, Coarse Aggregate, Fine Aggregate
        #   Age in days (1~365)
        # output: Concrete compressive strength in MPa
        table = read_table("concrete.csv")
        X, y = table.iloc[:, :8].to_numpy(), table.iloc[:, 8].to_numpy()

    elif name == "forest_fires":
        # https://archive.ics.uci.edu/ml/datasets/Forest+Fires
        #  1. X     - x-axis spatial coordinate within the Montesinho park map: 1 to 9
        #  2. Y     - y-axis spatial coordinate within the Montesinho park map: 2 to 9
        #  3. month - month of the year: 'jan' to 'dec'
        #  4. day   - day of the week: 'mon' to 'sun'
        #  5. FFMC  - FFMC index from the FWI system: 18.7 to 96.20
        #  6. DMC   - DMC index from the FWI system: 1.1 to 291.3
        #  7. DC    - DC index from the FWI system: 7.9 to 860.6
        #  8. ISI   - ISI index from the FWI system: 0.0 to 56.10
        #  9. temp  - temperature in Celsius degrees: 2.2 to 33.30
        # 10. RH    - relative humidity in %: 15.0 to 100
        # 11. wind  - wind speed in km/h: 0.40 to 9.40
        # 12. rain  - outside rain in mm/m2: 0.0 to 6.4
        # 13. area  - the burned area of the forest (in ha): 0.00 to 1090.84
        # columns 1-4 are dropped, columns 5-12 are the features, column 13 is the target
        table = read_table("forest_fires.csv")
        X = table.iloc[:, 4:12].to_numpy()
        # the area is very skewed towards 0.0, so it is modeled as log(area + 1)
        # as suggested in the dataset description
        y = np.log(table.iloc[:, 12].to_numpy() + 1)

    elif name == "wine_quality":
        # https://archive.ics.uci.edu/ml/datasets/Wine+Quality
        # inputs (based on physicochemical tests):
        #   1 fixed acidity, 2 volatile acidity, 3 citric acid, 4 residual sugar, 5 chlorides,
        #   6 free sulfur dioxide, 7 total sulfur dioxide, 8 density, 9 pH, 10 sulphates, 11 alcohol
        # output (based on sensory data):
        #   12 quality (score between 0 and 10)
        red = read_table("winequality-red.csv", sep=";")
        white = read_table("winequality-white.csv", sep=";")
        # red wines first, then white wines; the last column of each file is the target
        measurements = np.vstack([red.iloc[:, :-1].to_numpy(), white.iloc[:, :-1].to_numpy()])
        # extra categorical feature: 1 for red, 0 for white
        is_red = np.concatenate([np.ones(len(red)), np.zeros(len(white))])
        X = np.column_stack([measurements, is_red])
        y = np.concatenate([red["quality"].to_numpy(), white["quality"].to_numpy()])

    elif name == "airfoil":
        # https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise
        # inputs:
        #   1. Frequency, in Hertz.
        #   2. Angle of attack, in degrees.
        #   3. Chord length, in meters.
        #   4. Free-stream velocity, in meters per second.
        #   5. Suction side displacement thickness, in meters.
        # output:
        #   6. Scaled sound pressure level, in decibels.
        # NOTE(review): these keys number the features starting at 1 (x001 = frequency);
        # confirm that this matches the column names autofeat assigns to numpy input -
        # if its naming starts at x000 the units would be attached to the wrong features
        units = {"x001": "Hz", "x003": "m", "x004": "m/sec", "x005": "m"}
        table = read_table(
            "airfoil_self_noise.tsv",
            header=None,
            names=["x1", "x2", "x3", "x4", "x5", "y"],
            sep="\t",
        )
        X = table.iloc[:, :5].to_numpy()
        y = table["y"].to_numpy()

    else:
        raise RuntimeError("Unknown dataset %r" % name)

    # always hand back float arrays, whatever dtype the source had
    return np.array(X, dtype=float), np.array(y, dtype=float), units


def test_model(dataset, model, param_grid):
    """
    Tune a model with a cross-validated grid search on one dataset and print its performance.

    Inputs:
        - dataset: name of the dataset, passed to load_regression_dataset
        - model: regression model handed to GridSearchCV; models whose class is named
                 "SVR" are trained and evaluated on standardized features
        - param_grid: parameter grid for GridSearchCV
    Returns:
        - best_estimator_ of the fitted grid search
    """
    # load data
    X, y, _ = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    if model.__class__.__name__ == "SVR":
        # NOTE: the scaler is fit on the whole training split before the grid search
        # creates its folds, so the validation folds are not unseen by the scaler
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", cv=5)
    gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    # predict once per split and reuse the result for both metrics
    pred_train = gsmodel.predict(X_train)
    pred_test = gsmodel.predict(X_test)
    print("MSE on training data:", mean_squared_error(y_train, pred_train))
    print("MSE on test data:", mean_squared_error(y_test, pred_test))
    print("R^2 on training data:", r2_score(y_train, pred_train))
    print("R^2 on test data:", r2_score(y_test, pred_test))
    return gsmodel.best_estimator_


def _print_fit_report(y_train, pred_train, y_test, pred_test, prefix=""):
    """Print MSE and R^2 of the given train/test predictions; every label starts with prefix."""
    print(prefix + "MSE on training data:", mean_squared_error(y_train, pred_train))
    print(prefix + "MSE on test data:", mean_squared_error(y_test, pred_test))
    print(prefix + "R^2 on training data:", r2_score(y_train, pred_train))
    print(prefix + "R^2 on test data:", r2_score(y_test, pred_test))


def _grid_search_report(model, param_grid, X_train, y_train, X_test, y_test, ignore_warnings=False):
    """Fit a 5-fold grid search (scored by neg. MSE) and print best params, best score and the fit report."""
    gsmodel = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", cv=5)
    if ignore_warnings:
        # only the fit is silenced, the predictions below run with the regular warning filters
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gsmodel.fit(X_train, y_train)
    else:
        gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    # predict once per split and reuse the result for both metrics
    _print_fit_report(y_train, gsmodel.predict(X_train), y_test, gsmodel.predict(X_test))


def test_autofeat(dataset, feateng_steps=2):
    """
    Run autofeat on one dataset and print how different models perform on the generated features.

    Printed are the scores of the AutoFeatRegressor itself, followed by a ridge regression
    and a random forest that are tuned via grid search on the transformed features.

    Inputs:
        - dataset: name of the dataset, passed to load_regression_dataset
        - feateng_steps: passed to AutoFeatRegressor as feateng_steps (default: 2)
    """
    # load data
    X, y, units = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # run autofeat
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=feateng_steps, units=units)
    # autofeat is fit on the complete training split; the test split is only transformed
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    print("autofeat new features:", len(afreg.new_feat_cols_))
    _print_fit_report(y_train, afreg.predict(X_train_tr), y_test, afreg.predict(X_test_tr), prefix="autofeat ")
    # train rreg on transformed train split incl cross-validation for parameter selection
    print("# Ridge Regression")
    ridge_grid = {
        "alpha": [
            0.00001,
            0.0001,
            0.001,
            0.01,
            0.1,
            1.0,
            2.5,
            5.0,
            10.0,
            25.0,
            50.0,
            100.0,
            250.0,
            500.0,
            1000.0,
            2500.0,
            5000.0,
            10000.0,
        ]
    }
    _grid_search_report(Ridge(), ridge_grid, X_train_tr, y_train, X_test_tr, y_test, ignore_warnings=True)
    # same for a random forest on the transformed features
    print("# Random Forest")
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    forest_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    _grid_search_report(rforest, forest_grid, X_train_tr, y_train, X_test_tr, y_test)

In [None]:
# sanity check: load every dataset once and print the shape of its feature matrix
for dsname in datasets:
    print("####", dsname)
    X, y, _ = load_regression_dataset(dsname)
    print(X.shape)

In [None]:
# baseline: ridge regression on the original features, the regularization strength
# alpha is selected by the grid search inside test_model
for dsname in datasets:
    print("####", dsname)
    rreg = Ridge()
    # alpha candidates from 1e-5 up to 1e5
    params = {
        "alpha": [
            0.00001, 0.0001, 0.001, 0.01, 0.1,
            1.0, 2.5, 5.0, 10.0, 25.0, 50.0,
            100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0,
            10000.0, 25000.0, 50000.0, 100000.0,
        ]  # fmt: skip
    }
    rreg = test_model(dsname, rreg, params)

In [None]:
# baseline: support vector regression on the original features, the penalty C is
# selected by the grid search inside test_model
for dsname in datasets:
    # california_housing is left out: it has too many data points, so the fit takes too long
    if dsname != "california_housing":
        print("####", dsname)
        svr = SVR(gamma="scale")
        params = {"C": [1.0, 10.0, 25.0, 50.0, 100.0, 250.0]}
        svr = test_model(dsname, svr, params)

In [None]:
# baseline: random forest on the original features, min_samples_leaf is selected
# by the grid search inside test_model
for dsname in datasets:
    print("####", dsname)
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    # float values of min_samples_leaf are treated by sklearn as a fraction of the training samples
    params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    rforest = test_model(dsname, rforest, params)

In [None]:
# autofeat with 1 feature engineering step on every dataset
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=1)

In [None]:
# autofeat with 2 feature engineering steps on every dataset
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=2)

In [None]:
# autofeat with 3 feature engineering steps on every dataset
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=3)