In [1]:
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from autofeat import FeatureSelector, AutoFeatRegression

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
datasets = ["boston", "diabetes", "concrete", "forest_fires", "wine_quality", "airfoil"]

# same interface for loading all datasets - adapt the datapath
# to where you've downloaded (and renamed) the datasets
def load_regression_dataset(name, datapath="../datasets/regression/"):
    # load one of the datasets as X and y (and possibly units)
    units = {}
    if name == "boston":
        # sklearn boston housing dataset
        X, y = load_boston(True)

    elif name == "diabetes":
        # sklearn diabetes dataset
        X, y = load_diabetes(True)

    elif name == "concrete":
        # https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength
        # Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Fine Aggregate (component 7)    -- quantitative -- kg in a m3 mixture -- Input Variable
        # Age -- quantitative -- Day (1~365) -- Input Variable
        # Concrete compressive strength -- quantitative -- MPa -- Output Variable
        df = pd.read_csv(os.path.join(datapath, "concrete.csv"))
        X = df.iloc[:, :8].to_numpy()
        y = df.iloc[:, 8].to_numpy()

    elif name == "forest_fires":
        # https://archive.ics.uci.edu/ml/datasets/Forest+Fires
        # 1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
        # 2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
        # 3. month - month of the year: 'jan' to 'dec'
        # 4. day - day of the week: 'mon' to 'sun'
        # 5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
        # 6. DMC - DMC index from the FWI system: 1.1 to 291.3
        # 7. DC - DC index from the FWI system: 7.9 to 860.6
        # 8. ISI - ISI index from the FWI system: 0.0 to 56.10
        # 9. temp - temperature in Celsius degrees: 2.2 to 33.30
        # 10. RH - relative humidity in %: 15.0 to 100
        # 11. wind - wind speed in km/h: 0.40 to 9.40
        # 12. rain - outside rain in mm/m2 : 0.0 to 6.4
        # 13. area - the burned area of the forest (in ha): 0.00 to 1090.84
        # (this output variable is very skewed towards 0.0, thus it may make sense to model with the logarithm transform).
        # --> first 4 are ignored
        df = pd.read_csv(os.path.join(datapath, "forest_fires.csv"))
        X = df.iloc[:, 4:12].to_numpy()
        y = df.iloc[:, 12].to_numpy()
        # perform transformation as they suggested
        y = np.log(y + 1)

    elif name == "wine_quality":
        # https://archive.ics.uci.edu/ml/datasets/Wine+Quality
        # Input variables (based on physicochemical tests):
        # 1 - fixed acidity
        # 2 - volatile acidity
        # 3 - citric acid
        # 4 - residual sugar
        # 5 - chlorides
        # 6 - free sulfur dioxide
        # 7 - total sulfur dioxide
        # 8 - density
        # 9 - pH
        # 10 - sulphates
        # 11 - alcohol
        # Output variable (based on sensory data):
        # 12 - quality (score between 0 and 10)
        df_red = pd.read_csv(os.path.join(datapath, "winequality-red.csv"), sep=";")
        df_white = pd.read_csv(os.path.join(datapath, "winequality-white.csv"), sep=";")
        # add additional categorical feature for red or white
        X = np.hstack([np.vstack([df_red.iloc[:, :-1].to_numpy(), df_white.iloc[:, :-1].to_numpy()]), np.array([[1]*len(df_red) + [0]*len(df_white)]).T])
        y = np.hstack([df_red["quality"].to_numpy(), df_white["quality"].to_numpy()])

    elif name == "airfoil":
        # https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise
        # This problem has the following inputs:
        # 1. Frequency, in Hertz.
        # 2. Angle of attack, in degrees.
        # 3. Chord length, in meters.
        # 4. Free-stream velocity, in meters per second.
        # 5. Suction side displacement thickness, in meters.
        # The only output is:
        # 6. Scaled sound pressure level, in decibels.
        units = {"x001": "Hz", "x003": "m", "x004": "m/sec", "x005": "m"}
        df = pd.read_csv(os.path.join(datapath, "airfoil_self_noise.tsv"), header=None, names=["x1", "x2", "x3", "x4", "x5", "y"], sep="\t")
        X = df.iloc[:, :5].to_numpy()
        y = df["y"].to_numpy()

    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

def test_model(dataset, model, param_grid):
    # load data
    X, y, _ = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    if model.__class__.__name__ == "SVR":
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5, iid=False)
    gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test)))
    return gsmodel.best_estimator_

def test_autofeat(dataset, feateng_steps=3):
    # load data
    X, y, units = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # run autofeat
    afreg = AutoFeatRegression(verbose=1, feateng_steps=feateng_steps, units=units, featsel_w_thr=0.01)
    # fit autofeat on less data, otherwise ridge reg model with xval will overfit on new features
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    print("autofeat new features:", len(afreg.new_feat_cols_))
    print("autofeat MSE on training data:", mean_squared_error(y_train, afreg.predict(X_train_tr)))
    print("autofeat MSE on test data:", mean_squared_error(y_test, afreg.predict(X_test_tr)))
    print("autofeat R^2 on training data:", r2_score(y_train, afreg.predict(X_train_tr)))
    print("autofeat R^2 on test data:", r2_score(y_test, afreg.predict(X_test_tr)))
    # train rreg on transformed train split incl cross-validation for parameter selection
    rreg = Ridge()
    param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000., 25000., 50000., 100000.]}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gsmodel = GridSearchCV(rreg, param_grid, scoring='neg_mean_squared_error', cv=5, iid=False)
        gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    

In [3]:
for dsname in datasets:
    print("####", dsname)
    X, y, _ = load_regression_dataset(dsname)
    print(X.shape)

#### boston
(506, 13)
#### diabetes
(442, 10)
#### concrete
(1030, 8)
#### forest_fires
(517, 8)
#### wine_quality
(6497, 12)
#### airfoil
(1503, 5)


In [4]:
for dsname in datasets:
    print("####", dsname)
    rreg = Ridge()
    params = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000., 25000., 50000., 100000.]}
    rreg = test_model(dsname, rreg, params)

#### boston
best params: {'alpha': 1e-05}
best score: -25.427148426837693
MSE on training data: 22.4278718761592
MSE on test data: 20.5580503052922
R^2 on training data: 0.7361592384229154
R^2 on test data: 0.7484031841564716
#### diabetes
best params: {'alpha': 0.01}
best score: -3043.14487668777
MSE on training data: 2817.5756461735427
MSE on test data: 3119.632550355442
R^2 on training data: 0.541317737800587
R^2 on test data: 0.38300930348673157
#### concrete
best params: {'alpha': 10000.0}
best score: -110.34480414200297
MSE on training data: 107.00865107837934
MSE on test data: 110.56229503996865
R^2 on training data: 0.6245955930727385
R^2 on test data: 0.5643057266127824
#### forest_fires
best params: {'alpha': 100000.0}
best score: -1.8969026573257473
MSE on training data: 1.8469604643703812
MSE on test data: 2.328893858372984
R^2 on training data: 0.010801489514856932
R^2 on test data: -0.0401625480215837
#### wine_quality
best params: {'alpha': 0.0001}
best score: -0.5401265

In [5]:
for dsname in datasets:
    print("####", dsname)
    svr = SVR(gamma="scale")
    params = {"C": [1., 10., 25., 50., 100., 250.]}
    svr = test_model(dsname, svr, params)

#### boston
best params: {'C': 100.0}
best score: -13.598043246310898
MSE on training data: 3.4469342608444284
MSE on test data: 9.636188588435925
R^2 on training data: 0.9594503764998732
R^2 on test data: 0.8820688572255264
#### diabetes
best params: {'C': 10.0}
best score: -3057.7070015538065
MSE on training data: 2577.0841482085507
MSE on test data: 3437.6800004513143
R^2 on training data: 0.5804681274187382
R^2 on test data: 0.32010692168648935
#### concrete
best params: {'C': 100.0}
best score: -37.08377959823637
MSE on training data: 18.997096173790347
MSE on test data: 30.152373071635388
R^2 on training data: 0.9333549806432162
R^2 on test data: 0.8811781514521082
#### forest_fires
best params: {'C': 1.0}
best score: -2.186927498252301
MSE on training data: 1.8539927476295148
MSE on test data: 3.12614582192515
R^2 on training data: 0.007035126206362374
R^2 on test data: -0.396242165322382
#### wine_quality
best params: {'C': 10.0}
best score: -0.4640006945371349
MSE on training 

In [6]:
for dsname in datasets:
    print("####", dsname)
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    rforest = test_model(dsname, rforest, params)

#### boston
best params: {'min_samples_leaf': 0.0001}
best score: -10.462810946604932
MSE on training data: 1.4186988960396048
MSE on test data: 10.583239343137262
R^2 on training data: 0.9833104719321342
R^2 on test data: 0.8704785093673091
#### diabetes
best params: {'min_samples_leaf': 0.05}
best score: -3336.6571277553485
MSE on training data: 2472.319475907154
MSE on test data: 3268.607555103584
R^2 on training data: 0.5975231076301993
R^2 on test data: 0.35354551553768243
#### concrete
best params: {'min_samples_leaf': 0.0001}
best score: -28.701648337050347
MSE on training data: 4.169688233206215
MSE on test data: 27.527437198114896
R^2 on training data: 0.9853720299949222
R^2 on test data: 0.8915222703733743
#### forest_fires
best params: {'min_samples_leaf': 0.2}
best score: -1.8836034096444383
MSE on training data: 1.828520757317779
MSE on test data: 2.2946977602510548
R^2 on training data: 0.020677462012424486
R^2 on test data: -0.024889417205844477
#### wine_quality
best pa

In [7]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=1)

#### boston
[AutoFeatRegression] The 1 step feature engineering process could generate up to 91 features.
[AutoFeatRegression] With 404 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 64 transformed features from 13 original features - done.
[feateng] Generated a total of 64 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 17 new features selected.
[AutoFeatRegression] Computing 16 new features.
[AutoFeatRegression]    16/   16 new features ...done.
[AutoFeatRegression] Training final regression model.
[AutoFeatRegression] Trained model: largest coefficients:
21.166797229359744
542.796203 * 1/x009
93.728034 * 1/x010
21.157874 * 1/x012
-5.839516 * log(x012)
2.403238 * 1/x007
-1.294779 * 1/x006
0.942274 *

best params: {'alpha': 0.001}
best score: -0.5245556594266705
MSE on training data: 0.5178094322818836
MSE on test data: 0.5235956370832444
R^2 on training data: 0.31501913114489033
R^2 on test data: 0.3352290386367043
#### airfoil
[AutoFeatRegression] Applying the Pi Theorem
[AutoFeatRegression] Pi Theorem 1:  x001 * x003 / x004
[AutoFeatRegression] The 1 step feature engineering process could generate up to 35 features.
[AutoFeatRegression] With 1202 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 21 transformed features from 5 original features - done.
[feateng] Generated a total of 22 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 12 new features selected.
[AutoFeatRegression] Computing 9 new feat

In [8]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=2)

#### boston
[AutoFeatRegression] The 2 step feature engineering process could generate up to 4186 features.
[AutoFeatRegression] With 404 data points this new feature matrix would use about 0.01 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 64 transformed features from 13 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 11659 feature combinations from 2926 original feature tuples - done.
[feateng] Generated a total of 2945 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 18 new features selected.
[AutoFeatRegression] Computing 18 new features.
[AutoFeatRegression]    18/   18 new features ...done.
[AutoFeatRegression] Training final regression model.
[AutoFeatRegression] Trained model: largest coefficients:
13.03

[feateng] Generated 11059 feature combinations from 2775 original feature tuples - done.
[feateng] Generated a total of 2797 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 24 new features selected.
[AutoFeatRegression] Computing 24 new features.
[AutoFeatRegression]    24/   24 new features ...done.
[AutoFeatRegression] Training final regression model.
[AutoFeatRegression] Trained model: largest coefficients:
-5779.468422222276
2116.818547 * exp(x007)/x007
30.853871 * x007
-8.007633 * x004**2*log(x009)
7.103355 * x009**2/x006
-3.747951 * x008**3*log(x007)
-3.414562 * exp(x001)/x000
3.200993 * log(x009)/x005
0.734957 * x001
-0.719951 * log(x001)/x006
0.589573 * x001**2*log(x009)
0.513849 * exp(x011)/x005
0.367517 * log(x009)/x003
-0.290091 * x002**3*log(x003)
0.233455 * x011
-0.137500 * sqrt(x004

In [9]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=3)

#### boston
[AutoFeatRegression] The 3 step feature engineering process could generate up to 102466 features.
[AutoFeatRegression] With 404 data points this new feature matrix would use about 0.17 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 64 transformed features from 13 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 11659 feature combinations from 2926 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 49568 transformed features from 11659 original features - done.
[feateng] Generated a total of 52513 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 19 new features selected.
[AutoFeatRegression] Computing 19 new features.
[AutoFeatRegression]    19/   19 new

best params: {'alpha': 1e-05}
best score: -1.9787928981159926
MSE on training data: 1.6131791121768742
MSE on test data: 3.2417999332340495
R^2 on training data: 0.13601053964352539
R^2 on test data: -0.4478971923111519
#### wine_quality
[AutoFeatRegression] The 3 step feature engineering process could generate up to 87234 features.
[AutoFeatRegression] With 5197 data points this new feature matrix would use about 1.81 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 63 transformed features from 12 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 11059 feature combinations from 2775 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 50887 transformed features from 11059 original features - done.
[feateng] Generated a total of 53684 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[f