In [1]:
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from autofeat import AutoFeatRegressor

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
datasets = ["diabetes", "boston", "concrete", "airfoil", "wine_quality"]

# same interface for loading all datasets - adapt the datapath
# to where you've downloaded (and renamed) the datasets
def load_regression_dataset(name, datapath="../datasets/regression/"):
    # load one of the datasets as X and y (and possibly units)
    units = {}
    if name == "boston":
        # sklearn boston housing dataset
        X, y = load_boston(True)

    elif name == "diabetes":
        # sklearn diabetes dataset
        X, y = load_diabetes(True)

    elif name == "concrete":
        # https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength
        # Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Fine Aggregate (component 7)    -- quantitative -- kg in a m3 mixture -- Input Variable
        # Age -- quantitative -- Day (1~365) -- Input Variable
        # Concrete compressive strength -- quantitative -- MPa -- Output Variable
        df = pd.read_csv(os.path.join(datapath, "concrete.csv"))
        X = df.iloc[:, :8].to_numpy()
        y = df.iloc[:, 8].to_numpy()

    elif name == "forest_fires":
        # https://archive.ics.uci.edu/ml/datasets/Forest+Fires
        # 1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
        # 2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
        # 3. month - month of the year: 'jan' to 'dec'
        # 4. day - day of the week: 'mon' to 'sun'
        # 5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
        # 6. DMC - DMC index from the FWI system: 1.1 to 291.3
        # 7. DC - DC index from the FWI system: 7.9 to 860.6
        # 8. ISI - ISI index from the FWI system: 0.0 to 56.10
        # 9. temp - temperature in Celsius degrees: 2.2 to 33.30
        # 10. RH - relative humidity in %: 15.0 to 100
        # 11. wind - wind speed in km/h: 0.40 to 9.40
        # 12. rain - outside rain in mm/m2 : 0.0 to 6.4
        # 13. area - the burned area of the forest (in ha): 0.00 to 1090.84
        # (this output variable is very skewed towards 0.0, thus it may make sense to model with the logarithm transform).
        # --> first 4 are ignored
        df = pd.read_csv(os.path.join(datapath, "forest_fires.csv"))
        X = df.iloc[:, 4:12].to_numpy()
        y = df.iloc[:, 12].to_numpy()
        # perform transformation as they suggested
        y = np.log(y + 1)

    elif name == "wine_quality":
        # https://archive.ics.uci.edu/ml/datasets/Wine+Quality
        # Input variables (based on physicochemical tests):
        # 1 - fixed acidity
        # 2 - volatile acidity
        # 3 - citric acid
        # 4 - residual sugar
        # 5 - chlorides
        # 6 - free sulfur dioxide
        # 7 - total sulfur dioxide
        # 8 - density
        # 9 - pH
        # 10 - sulphates
        # 11 - alcohol
        # Output variable (based on sensory data):
        # 12 - quality (score between 0 and 10)
        df_red = pd.read_csv(os.path.join(datapath, "winequality-red.csv"), sep=";")
        df_white = pd.read_csv(os.path.join(datapath, "winequality-white.csv"), sep=";")
        # add additional categorical feature for red or white
        X = np.hstack([np.vstack([df_red.iloc[:, :-1].to_numpy(), df_white.iloc[:, :-1].to_numpy()]), np.array([[1]*len(df_red) + [0]*len(df_white)]).T])
        y = np.hstack([df_red["quality"].to_numpy(), df_white["quality"].to_numpy()])

    elif name == "airfoil":
        # https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise
        # This problem has the following inputs:
        # 1. Frequency, in Hertz.
        # 2. Angle of attack, in degrees.
        # 3. Chord length, in meters.
        # 4. Free-stream velocity, in meters per second.
        # 5. Suction side displacement thickness, in meters.
        # The only output is:
        # 6. Scaled sound pressure level, in decibels.
        units = {"x001": "Hz", "x003": "m", "x004": "m/sec", "x005": "m"}
        df = pd.read_csv(os.path.join(datapath, "airfoil_self_noise.tsv"), header=None, names=["x1", "x2", "x3", "x4", "x5", "y"], sep="\t")
        X = df.iloc[:, :5].to_numpy()
        y = df["y"].to_numpy()

    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

def test_model(dataset, model, param_grid):
    # load data
    X, y, _ = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    if model.__class__.__name__ == "SVR":
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test)))
    return gsmodel.best_estimator_

def test_autofeat(dataset, feateng_steps=2):
    # load data
    X, y, units = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # run autofeat
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=feateng_steps, units=units)
    # fit autofeat on less data, otherwise ridge reg model with xval will overfit on new features
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    print("autofeat new features:", len(afreg.new_feat_cols_))
    print("autofeat MSE on training data:", mean_squared_error(y_train, afreg.predict(X_train_tr)))
    print("autofeat MSE on test data:", mean_squared_error(y_test, afreg.predict(X_test_tr)))
    print("autofeat R^2 on training data:", r2_score(y_train, afreg.predict(X_train_tr)))
    print("autofeat R^2 on test data:", r2_score(y_test, afreg.predict(X_test_tr)))
    # train rreg on transformed train split incl cross-validation for parameter selection
    print("# Ridge Regression")
    rreg = Ridge()
    param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000.]}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gsmodel = GridSearchCV(rreg, param_grid, scoring='neg_mean_squared_error', cv=5)
        gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    print("# Random Forest")
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    param_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    gsmodel = GridSearchCV(rforest, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))

In [3]:
for dsname in datasets:
    print("####", dsname)
    X, y, _ = load_regression_dataset(dsname)
    print(X.shape)

#### diabetes
(442, 10)
#### boston
(506, 13)
#### concrete
(1030, 8)
#### airfoil
(1503, 5)
#### wine_quality
(6497, 12)


In [4]:
for dsname in datasets:
    print("####", dsname)
    rreg = Ridge()
    params = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000., 25000., 50000., 100000.]}
    rreg = test_model(dsname, rreg, params)

#### diabetes
best params: {'alpha': 0.01}
best score: -3043.1448766877706
MSE on training data: 2817.5756461735427
MSE on test data: 3119.632550355442
R^2 on training data: 0.541317737800587
R^2 on test data: 0.38300930348673157
#### boston
best params: {'alpha': 1e-05}
best score: -25.427148426837697
MSE on training data: 22.4278718761592
MSE on test data: 20.55805030529219
R^2 on training data: 0.7361592384229154
R^2 on test data: 0.7484031841564716
#### concrete
best params: {'alpha': 10000.0}
best score: -110.34480414200303
MSE on training data: 107.00865107837936
MSE on test data: 110.56229503996859
R^2 on training data: 0.6245955930727385
R^2 on test data: 0.5643057266127827
#### airfoil
best params: {'alpha': 0.001}
best score: -22.960476312553066
MSE on training data: 22.6317043193984
MSE on test data: 24.732769352718226
R^2 on training data: 0.5173357362628234
R^2 on test data: 0.5076580301932745
#### wine_quality
best params: {'alpha': 0.0001}
best score: -0.5401265191014878

In [5]:
for dsname in datasets:
    print("####", dsname)
    svr = SVR(gamma="scale")
    params = {"C": [1., 10., 25., 50., 100., 250.]}
    svr = test_model(dsname, svr, params)

#### diabetes
best params: {'C': 10.0}
best score: -3057.7070015538065
MSE on training data: 2577.0841482085507
MSE on test data: 3437.6800004513143
R^2 on training data: 0.5804681274187382
R^2 on test data: 0.32010692168648935
#### boston
best params: {'C': 100.0}
best score: -13.598043246310898
MSE on training data: 3.4469342608444284
MSE on test data: 9.636188588435925
R^2 on training data: 0.9594503764998732
R^2 on test data: 0.8820688572255264
#### concrete
best params: {'C': 100.0}
best score: -37.08377959823637
MSE on training data: 18.997096173790347
MSE on test data: 30.152373071635388
R^2 on training data: 0.9333549806432162
R^2 on test data: 0.8811781514521082
#### airfoil
best params: {'C': 250.0}
best score: -7.094189398840056
MSE on training data: 5.457762290823167
MSE on test data: 7.477589784074904
R^2 on training data: 0.8836028086716045
R^2 on test data: 0.8511476320667879
#### wine_quality
best params: {'C': 10.0}
best score: -0.4640006945371349
MSE on training data:

In [6]:
for dsname in datasets:
    print("####", dsname)
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    rforest = test_model(dsname, rforest, params)

#### diabetes
best params: {'min_samples_leaf': 0.05}
best score: -3336.6571277553485
MSE on training data: 2472.319475907154
MSE on test data: 3268.607555103584
R^2 on training data: 0.5975231076301993
R^2 on test data: 0.35354551553768243
#### boston
best params: {'min_samples_leaf': 0.0001}
best score: -10.462810946604932
MSE on training data: 1.4186988960396048
MSE on test data: 10.583239343137262
R^2 on training data: 0.9833104719321342
R^2 on test data: 0.8704785093673091
#### concrete
best params: {'min_samples_leaf': 0.0001}
best score: -28.701648337050347
MSE on training data: 4.169688233206215
MSE on test data: 27.527437198114896
R^2 on training data: 0.9853720299949222
R^2 on test data: 0.8915222703733743
#### airfoil
best params: {'min_samples_leaf': 0.0001}
best score: -3.770457737476125
MSE on training data: 0.4389576916890236
MSE on test data: 3.316904702700349
R^2 on training data: 0.9906383899294207
R^2 on test data: 0.9339721576787678
#### wine_quality
best params: {'

In [7]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=1)

#### diabetes
[AutoFeat] The 1 step feature engineering process could generate up to 70 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 45 transformed features from 10 original features - done.
[feateng] Generated altogether 45 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 36 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 8 features after 5 feature selection runs
[featsel] 8 features after correlation filtering
[featsel] 7 features after noise filtering
[AutoFeat] Computing 2 new features.
[AutoFeat]     2/    2 new features ...done.
[AutoFeat] Final dataframe with 12 feature column

best params: {'min_samples_leaf': 0.0001}
best score: -3.906333436423625
MSE on training data: 0.4506220271923523
MSE on test data: 3.345056032207664
R^2 on training data: 0.9903896257255304
R^2 on test data: 0.9334117642660995
#### wine_quality
[AutoFeat] The 1 step feature engineering process could generate up to 84 features.
[AutoFeat] With 5197 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 59 transformed features from 12 original features - done.
[feateng] Generated altogether 59 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 21 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 24 features after 5 feature sele

In [8]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=2)

#### diabetes
[AutoFeat] The 2 step feature engineering process could generate up to 2485 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 45 transformed features from 10 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 5793 feature combinations from 1485 original feature tuples - done.
[feateng] Generated altogether 5950 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 925 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 29 features after 5 feature selection runs
[featsel] 29 features after correlation filtering
[featsel] 8 features after nois

[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 130 features after 5 feature selection runs
[featsel] 59 features after correlation filtering
[featsel] 45 features after noise filtering
[AutoFeat] Computing 41 new features.
[AutoFeat]    41/   41 new features ...done.
[AutoFeat] Final dataframe with 47 feature columns (42 new).
[AutoFeat] Training final regression model.
[AutoFeat] Trained model: largest coefficients:
185.55020101090565
-53080.637721 * x002**3/x000
-45761.736451 * 1/(x000*x003)
10426.352270 * x002/x000
-6656.579959 * x004/x000
-3980.177758 * x002*x004
1108.997024 * x002/x003
-1070.619190 * sqrt(x004)/x003
435.305316 * x004
-157.179235 * 1/(x000*x002)
-119.244913 * x002
64.634588 * x001/x000
27.645977 * x003/x000
22.100156 * sqrt(x001)*x002**3
-16.296350 * 1/(x000*x004)
15.344922 * 1/(x002*x003)
11.012043 * x001**2*x002**3
-10.991819 * sqrt(x000)/x003
-5.249177 * x001**3*x004**3
4.167449 * x003**2*x00

In [9]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=3)

#### diabetes
[AutoFeat] The 3 step feature engineering process could generate up to 60445 features.
[AutoFeat] With 353 data points this new feature matrix would use about 0.09 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 45 transformed features from 10 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 5793 feature combinations from 1485 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 24336 transformed features from 5793 original features - done.
[feateng] Generated altogether 32161 new features in 3 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 14810 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/

[feateng] Step 1: transformation of original features
[feateng] Generated 21 transformed features from 5 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 484 feature combinations from 325 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 1610 transformed features from 484 original features - done.
[feateng] Generated altogether 2355 new features in 3 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 1057 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 113 features after 5 feature selection runs
[featsel] 59 features after correlation filtering
[featsel] 44 features after noise filtering
[AutoFeat] Computing 43 new features.
[AutoFeat]