In [1]:
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from autofeat import FeatureSelector, AutoFeatRegression

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
datasets = ["diabetes", "boston", "concrete", "airfoil", "wine_quality", "forest_fires"]

# same interface for loading all datasets - adapt the datapath
# to where you've downloaded (and renamed) the datasets
def load_regression_dataset(name, datapath="../datasets/regression/"):
    # load one of the datasets as X and y (and possibly units)
    units = {}
    if name == "boston":
        # sklearn boston housing dataset
        X, y = load_boston(True)

    elif name == "diabetes":
        # sklearn diabetes dataset
        X, y = load_diabetes(True)

    elif name == "concrete":
        # https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength
        # Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable
        # Fine Aggregate (component 7)    -- quantitative -- kg in a m3 mixture -- Input Variable
        # Age -- quantitative -- Day (1~365) -- Input Variable
        # Concrete compressive strength -- quantitative -- MPa -- Output Variable
        df = pd.read_csv(os.path.join(datapath, "concrete.csv"))
        X = df.iloc[:, :8].to_numpy()
        y = df.iloc[:, 8].to_numpy()

    elif name == "forest_fires":
        # https://archive.ics.uci.edu/ml/datasets/Forest+Fires
        # 1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
        # 2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
        # 3. month - month of the year: 'jan' to 'dec'
        # 4. day - day of the week: 'mon' to 'sun'
        # 5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
        # 6. DMC - DMC index from the FWI system: 1.1 to 291.3
        # 7. DC - DC index from the FWI system: 7.9 to 860.6
        # 8. ISI - ISI index from the FWI system: 0.0 to 56.10
        # 9. temp - temperature in Celsius degrees: 2.2 to 33.30
        # 10. RH - relative humidity in %: 15.0 to 100
        # 11. wind - wind speed in km/h: 0.40 to 9.40
        # 12. rain - outside rain in mm/m2 : 0.0 to 6.4
        # 13. area - the burned area of the forest (in ha): 0.00 to 1090.84
        # (this output variable is very skewed towards 0.0, thus it may make sense to model with the logarithm transform).
        # --> first 4 are ignored
        df = pd.read_csv(os.path.join(datapath, "forest_fires.csv"))
        X = df.iloc[:, 4:12].to_numpy()
        y = df.iloc[:, 12].to_numpy()
        # perform transformation as they suggested
        y = np.log(y + 1)

    elif name == "wine_quality":
        # https://archive.ics.uci.edu/ml/datasets/Wine+Quality
        # Input variables (based on physicochemical tests):
        # 1 - fixed acidity
        # 2 - volatile acidity
        # 3 - citric acid
        # 4 - residual sugar
        # 5 - chlorides
        # 6 - free sulfur dioxide
        # 7 - total sulfur dioxide
        # 8 - density
        # 9 - pH
        # 10 - sulphates
        # 11 - alcohol
        # Output variable (based on sensory data):
        # 12 - quality (score between 0 and 10)
        df_red = pd.read_csv(os.path.join(datapath, "winequality-red.csv"), sep=";")
        df_white = pd.read_csv(os.path.join(datapath, "winequality-white.csv"), sep=";")
        # add additional categorical feature for red or white
        X = np.hstack([np.vstack([df_red.iloc[:, :-1].to_numpy(), df_white.iloc[:, :-1].to_numpy()]), np.array([[1]*len(df_red) + [0]*len(df_white)]).T])
        y = np.hstack([df_red["quality"].to_numpy(), df_white["quality"].to_numpy()])

    elif name == "airfoil":
        # https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise
        # This problem has the following inputs:
        # 1. Frequency, in Hertz.
        # 2. Angle of attack, in degrees.
        # 3. Chord length, in meters.
        # 4. Free-stream velocity, in meters per second.
        # 5. Suction side displacement thickness, in meters.
        # The only output is:
        # 6. Scaled sound pressure level, in decibels.
        units = {"x001": "Hz", "x003": "m", "x004": "m/sec", "x005": "m"}
        df = pd.read_csv(os.path.join(datapath, "airfoil_self_noise.tsv"), header=None, names=["x1", "x2", "x3", "x4", "x5", "y"], sep="\t")
        X = df.iloc[:, :5].to_numpy()
        y = df["y"].to_numpy()

    else:
        raise RuntimeError("Unknown dataset %r" % name)
    return np.array(X, dtype=float), np.array(y, dtype=float), units

def test_model(dataset, model, param_grid):
    # load data
    X, y, _ = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    if model.__class__.__name__ == "SVR":
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5, iid=False)
    gsmodel.fit(X_train, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test)))
    return gsmodel.best_estimator_

def test_autofeat(dataset, feateng_steps=2):
    # load data
    X, y, units = load_regression_dataset(dataset)
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    # run autofeat
    afreg = AutoFeatRegression(verbose=1, feateng_steps=feateng_steps, units=units, featsel_w_thr=0.01)
    # fit autofeat on less data, otherwise ridge reg model with xval will overfit on new features
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    print("autofeat new features:", len(afreg.new_feat_cols_))
    print("autofeat MSE on training data:", mean_squared_error(y_train, afreg.predict(X_train_tr)))
    print("autofeat MSE on test data:", mean_squared_error(y_test, afreg.predict(X_test_tr)))
    print("autofeat R^2 on training data:", r2_score(y_train, afreg.predict(X_train_tr)))
    print("autofeat R^2 on test data:", r2_score(y_test, afreg.predict(X_test_tr)))
    # train rreg on transformed train split incl cross-validation for parameter selection
    print("# Ridge Regression")
    rreg = Ridge()
    param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000., 25000., 50000., 100000.]}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gsmodel = GridSearchCV(rreg, param_grid, scoring='neg_mean_squared_error', cv=5, iid=False)
        gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))
    print("# Random Forest")
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    param_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    gsmodel = GridSearchCV(rforest, param_grid, scoring='neg_mean_squared_error', cv=5, iid=False)
    gsmodel.fit(X_train_tr, y_train)
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("MSE on training data:", mean_squared_error(y_train, gsmodel.predict(X_train_tr)))
    print("MSE on test data:", mean_squared_error(y_test, gsmodel.predict(X_test_tr)))
    print("R^2 on training data:", r2_score(y_train, gsmodel.predict(X_train_tr)))
    print("R^2 on test data:", r2_score(y_test, gsmodel.predict(X_test_tr)))

In [3]:
for dsname in datasets:
    print("####", dsname)
    X, y, _ = load_regression_dataset(dsname)
    print(X.shape)

#### diabetes
(442, 10)
#### boston
(506, 13)
#### concrete
(1030, 8)
#### airfoil
(1503, 5)
#### wine_quality
(6497, 12)
#### forest_fires
(517, 8)


In [4]:
for dsname in datasets:
    print("####", dsname)
    rreg = Ridge()
    params = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10., 25., 50., 100., 250., 500., 1000., 2500., 5000., 10000., 25000., 50000., 100000.]}
    rreg = test_model(dsname, rreg, params)

#### diabetes
best params: {'alpha': 0.01}
best score: -3043.14487668777
MSE on training data: 2817.5756461735427
MSE on test data: 3119.632550355442
R^2 on training data: 0.541317737800587
R^2 on test data: 0.38300930348673157
#### boston
best params: {'alpha': 1e-05}
best score: -25.427148426837693
MSE on training data: 22.4278718761592
MSE on test data: 20.5580503052922
R^2 on training data: 0.7361592384229154
R^2 on test data: 0.7484031841564716
#### concrete
best params: {'alpha': 10000.0}
best score: -110.34480414200297
MSE on training data: 107.00865107837934
MSE on test data: 110.56229503996865
R^2 on training data: 0.6245955930727385
R^2 on test data: 0.5643057266127824
#### airfoil
best params: {'alpha': 0.001}
best score: -22.960476312553066
MSE on training data: 22.6317043193984
MSE on test data: 24.732769352718233
R^2 on training data: 0.5173357362628234
R^2 on test data: 0.5076580301932743
#### wine_quality
best params: {'alpha': 0.0001}
best score: -0.5401265191014872
MS

In [5]:
for dsname in datasets:
    print("####", dsname)
    svr = SVR(gamma="scale")
    params = {"C": [1., 10., 25., 50., 100., 250.]}
    svr = test_model(dsname, svr, params)

#### diabetes
best params: {'C': 10.0}
best score: -3057.7070015538065
MSE on training data: 2577.0841482085507
MSE on test data: 3437.6800004513143
R^2 on training data: 0.5804681274187382
R^2 on test data: 0.32010692168648935
#### boston
best params: {'C': 100.0}
best score: -13.598043246310898
MSE on training data: 3.4469342608444284
MSE on test data: 9.636188588435925
R^2 on training data: 0.9594503764998732
R^2 on test data: 0.8820688572255264
#### concrete
best params: {'C': 100.0}
best score: -37.08377959823637
MSE on training data: 18.997096173790347
MSE on test data: 30.152373071635388
R^2 on training data: 0.9333549806432162
R^2 on test data: 0.8811781514521082
#### airfoil
best params: {'C': 250.0}
best score: -7.094189398840056
MSE on training data: 5.457762290823167
MSE on test data: 7.477589784074904
R^2 on training data: 0.8836028086716045
R^2 on test data: 0.8511476320667879
#### wine_quality
best params: {'C': 10.0}
best score: -0.4640006945371349
MSE on training data:

In [6]:
for dsname in datasets:
    print("####", dsname)
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    rforest = test_model(dsname, rforest, params)

#### diabetes
best params: {'min_samples_leaf': 0.05}
best score: -3336.6571277553485
MSE on training data: 2472.319475907154
MSE on test data: 3268.607555103584
R^2 on training data: 0.5975231076301993
R^2 on test data: 0.35354551553768243
#### boston
best params: {'min_samples_leaf': 0.0001}
best score: -10.462810946604932
MSE on training data: 1.4186988960396048
MSE on test data: 10.583239343137262
R^2 on training data: 0.9833104719321342
R^2 on test data: 0.8704785093673091
#### concrete
best params: {'min_samples_leaf': 0.0001}
best score: -28.701648337050347
MSE on training data: 4.169688233206215
MSE on test data: 27.527437198114896
R^2 on training data: 0.9853720299949222
R^2 on test data: 0.8915222703733743
#### airfoil
best params: {'min_samples_leaf': 0.0001}
best score: -3.770457737476125
MSE on training data: 0.4389576916890236
MSE on test data: 3.316904702700349
R^2 on training data: 0.9906383899294207
R^2 on test data: 0.9339721576787678
#### wine_quality
best params: {'

In [7]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=1)

#### diabetes
[AutoFeatRegression] The 1 step feature engineering process could generate up to 70 features.
[AutoFeatRegression] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 50 transformed features from 10 original features - done.
[feateng] Generated a total of 50 additional features
[featsel] Scaling data...done.
[featsel] 39/60 features after univariate filtering
[featsel] Feature selection run 1/3
[featsel] Feature selection run 2/3
[featsel] Feature selection run 3/3
[featsel] 20 features after 3 feature selection runs
[featsel] 16 features after correlation filtering
[featsel] 8 features after noise filtering
[featsel] 13 final features selected (including 10 original keep features).
[AutoFeatRegression] Computing 3 new features.
[AutoFeatRegression]     3/    3 new features ...done.
[AutoFeatRegression] Training final regression model.
[AutoFeatRegression] Trained model: l

best params: {'alpha': 1e-05}
best score: -22.31511878239506
MSE on training data: 21.65038669763258
MSE on test data: 24.040480113582436
R^2 on training data: 0.5382642063734873
R^2 on test data: 0.5214390606477008
# Random Forest
best params: {'min_samples_leaf': 0.0001}
best score: -3.8499058438152183
MSE on training data: 0.44608849616581564
MSE on test data: 3.4021282771618115
R^2 on training data: 0.9904863119222114
R^2 on test data: 0.9322756577063652
#### wine_quality
[AutoFeatRegression] The 1 step feature engineering process could generate up to 84 features.
[AutoFeatRegression] With 5197 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 63 transformed features from 12 original features - done.
[feateng] Generated a total of 63 additional features
[featsel] Scaling data...done.
[featsel] 60/75 features after univariate filtering
[featsel] Feature selection run 1/3
[featsel] Feature s

In [8]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=2)

#### diabetes
[AutoFeatRegression] The 2 step feature engineering process could generate up to 2485 features.
[AutoFeatRegression] With 353 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 50 transformed features from 10 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 7041 feature combinations from 1770 original feature tuples - done.
[feateng] Generated a total of 1781 additional features
[featsel] Scaling data...done.
[featsel] 918/1791 features after univariate filtering
[featsel] Feature selection run 1/3
[featsel] Feature selection run 2/3
[featsel] Feature selection run 3/3
[featsel] 17 features after 3 feature selection runs
[featsel] 17 features after correlation filtering
[featsel] 6 features after noise filtering
[featsel] 16 final features selected (including 10 original keep features).
[AutoFeatRegression] Computing 6 new features.
[Au

[AutoFeatRegression]    39/   39 new features ...done.
[AutoFeatRegression] Training final regression model.
[AutoFeatRegression] Trained model: largest coefficients:
158.2080194630043
18856460.524305 * x004**5
-15382381.199017 * x004**3/x000
-150973.482290 * x004**2/x003
25919.792241 * sqrt(x004)/x000
-20776.979268 * x002**2*x004
-13645.343423 * x002**2/x000
10221.623300 * x002**3/x003
1606.320052 * log(x002)/x000
-1129.838461 * sqrt(x004)/x003
214.235439 * x004
83.982768 * sqrt(x000)*x004**3
34.157947 * sqrt(x001)/x003
-27.762247 * x002
16.290260 * x001**2*x002**3
-9.221254 * 1/(x000*x004)
-5.651801 * sqrt(x000)/x003
-5.034130 * sqrt(x001)*x002**2
4.463691 * 1/(x002*x003)
-1.112450 * x001
-0.755724 * sqrt(x000)*sqrt(x002)
0.458290 * sqrt(x000)*sqrt(x004)
-0.422113 * x001**3*x004**2
-0.333056 * x001**3/x000
0.218247 * x002**2/x004
0.168858 * 1/(x003*x004)
-0.154612 * x002/x004
-0.073704 * sqrt(x003)/x002
0.068368 * x003
0.050428 * x001**3/x003
-0.027407 * sqrt(x000)*x001
0.014329 * sq

In [9]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=3)

#### diabetes
[AutoFeatRegression] The 3 step feature engineering process could generate up to 60445 features.
[AutoFeatRegression] With 353 data points this new feature matrix would use about 0.09 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 50 transformed features from 10 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 7041 feature combinations from 1770 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 31880 transformed features from 7041 original features - done.
[feateng] Generated a total of 33661 additional features
[featsel] Scaling data...done.
[featsel] 17580/33671 features after univariate filtering
[featsel] Feature selection run 1/3
[featsel] Feature selection run 2/3
[featsel] Feature selection run 3/3
[featsel] 34 features after 3 feature selection runs
[featsel] 31 features after correlation filtering
[featsel] 7 features after noise

best params: {'min_samples_leaf': 0.0001}
best score: -25.13205992325954
MSE on training data: 3.9922058201135795
MSE on test data: 22.78519350316074
R^2 on training data: 0.9859946682522559
R^2 on test data: 0.910210091751822
#### airfoil
[AutoFeatRegression] Applying the Pi Theorem
[AutoFeatRegression] Pi Theorem 1:  x001 * x003 / x004
[AutoFeatRegression] The 3 step feature engineering process could generate up to 14910 features.
[AutoFeatRegression] With 1202 data points this new feature matrix would use about 0.07 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 21 transformed features from 5 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 509 feature combinations from 325 original feature tuples - done.
[feateng] Step 3: transformation of new features
[feateng] Generated 1906 transformed features from 509 original features - done.
[feateng] Generated a total of 2239 additional features
[featsel] S

best params: {'min_samples_leaf': 0.2}
best score: -1.8836034096444383
MSE on training data: 1.828520757317779
MSE on test data: 2.2946977602510548
R^2 on training data: 0.020677462012424486
R^2 on test data: -0.024889417205844477
