# Autofeat 

## Automated Feature Engineering and Selection using penalised regression

In [18]:
!pip install autofeat

Collecting autofeat
  Downloading https://files.pythonhosted.org/packages/d0/64/c2945c9af39530b09275a0a825c1fe532eca133a673c1768274eff5899a6/autofeat-1.0.0-py3-none-any.whl
Collecting pint
  Downloading https://files.pythonhosted.org/packages/9d/db/7a2204b03c22069839958df5723eb2718d50c33052e0da84c9a83de14ea4/Pint-0.11-py2.py3-none-any.whl (186kB)
Installing collected packages: pint, autofeat
Successfully installed autofeat-1.0.0 pint-0.11


In [31]:
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from autofeat import AutoFeatClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
from sklearn.datasets import load_wine
from dabl import plot
from dabl.utils import data_df_from_bunch

In [26]:
datasets = ["wine"]

In [28]:
def load_classification_dataset(name):
    
    wine_bunch = load_wine()
    wine_df = data_df_from_bunch(wine_bunch)
    
    # load one of the datasets as X and y (and possibly units)
    units = {}
    
    if name == "wine":
        # sklearn wine dataset
        X, y = load_wine(True)
        
    else:
       raise RuntimeError("Unknown dataset %r" % name)
                   
    return np.array(X, dtype=float), np.array(y, dtype=float), units

In [44]:
def test_model(dataset, model, param_grid):
    
    # load data
    X, y, _ = load_classification_dataset(dataset)
    
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    
    # DA change to standardize if not RF rather than just for SVC
    
    if not (model.__class__.__name__ == "rforest"):
        sscaler = StandardScaler()
        X_train = sscaler.fit_transform(X_train)
        X_test = sscaler.transform(X_test)
    
    # train model on train split incl cross-validation for parameter selection
    gsmodel = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5, iid=False)
    gsmodel.fit(X_train, y_train)
    
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("Acc. on training data:", accuracy_score(y_train, gsmodel.predict(X_train)))
    print("Acc. on test data:", accuracy_score(y_test, gsmodel.predict(X_test)))
    
    return gsmodel.best_estimator_

In [45]:
def test_autofeat(dataset, feateng_steps=2):
    
    # load data
    X, y, units = load_classification_dataset(dataset)
    
    # split in training and test parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    
    # run autofeat
    afreg = AutoFeatClassifier(verbose=1, feateng_steps=feateng_steps, units=units)
    
    # fit autofeat on less data, otherwise reg model with xval will overfit on new features
    X_train_tr = afreg.fit_transform(X_train, y_train)
    X_test_tr = afreg.transform(X_test)
    
    print("autofeat new features:", len(afreg.new_feat_cols_))
    print("autofeat Acc. on training data:", accuracy_score(y_train, afreg.predict(X_train_tr)))
    print("autofeat Acc. on test data:", accuracy_score(y_test, afreg.predict(X_test_tr)))
          
    # train logistic regression on transformed train split incl cross-validation for parameter selection
        
    print("# Logistic Regression")
    lreg = LogisticRegression(class_weight="balanced")
    param_grid = {"C": np.logspace(-4, 4, 10)}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gsmodel = GridSearchCV(lreg, param_grid, cv=5)
        gsmodel.fit(X_train_tr, y_train)        
    
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("Acc. on training data:", accuracy_score(y_train, gsmodel.predict(X_train_tr)))
    print("Acc. on test data:", accuracy_score(y_test, gsmodel.predict(X_test_tr)))
    
     # train random forest on transformed train split incl cross-validation for parameter selection
    
    print("# Random Forest")
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    param_grid = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    gsmodel = GridSearchCV(rforest, param_grid, scoring='neg_mean_squared_error', cv=5, iid=False)
    gsmodel.fit(X_train_tr, y_train)
    
    print("best params:", gsmodel.best_params_)
    print("best score:", gsmodel.best_score_)
    print("Acc. on training data:", accuracy_score(y_train, gsmodel.predict(X_train_tr)))
    print("Acc. on test data:", accuracy_score(y_test, gsmodel.predict(X_test_tr)))

In [29]:
for dsname in datasets:
    print("####", dsname)
    X, y, _ = load_classification_dataset(dsname)
    print(X.shape)

#### wine
(178, 13)


In [37]:
for dsname in datasets:
    print("####", dsname)
    lreg = LogisticRegression(class_weight="balanced")
    params = {"C": np.logspace(-4, 4, 10)}
    lreg = test_model(dsname, lreg, params)

#### wine
best params: {'C': 0.3593813663804626}
best score: -0.02142857142857143
MSE on training data: 0.0
MSE on test data: 0.027777777777777776
R^2 on training data: 1.0
R^2 on test data: 0.9595959595959596




In [38]:
for dsname in datasets:
    print("####", dsname)
    svc = SVC(gamma="scale", class_weight="balanced")
    params = {"C": [1., 10., 25., 50., 100., 250.]}
    svc = test_model(dsname, svc, params)

#### wine
best params: {'C': 10.0}
best score: -0.02142857142857143
MSE on training data: 0.0
MSE on test data: 0.027777777777777776
R^2 on training data: 1.0
R^2 on test data: 0.9595959595959596




In [39]:
for dsname in datasets:
    print("####", dsname)
    rforest = RandomForestRegressor(n_estimators=100, random_state=13)
    params = {"min_samples_leaf": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2]}
    rforest = test_model(dsname, rforest, params)

#### wine




best params: {'min_samples_leaf': 0.0001}
best score: -0.04227036945812808
MSE on training data: 0.005019014084507043
MSE on test data: 0.05403333333333333
R^2 on training data: 0.9912605008635579
R^2 on test data: 0.9214060606060607


In [42]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=1)

#### wine
[AutoFeat] The 1 step feature engineering process could generate up to 91 features.
[AutoFeat] With 142 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 73 transformed features from 13 original features - done.
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 17 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 10 features after 5 feature selection runs
[featsel] 10 features after noise filtering
[AutoFeat] Computing 3 new features.
[AutoFeat]     3/    3 new features ...done.
[AutoFeat] Final dataframe with 16 feature columns (3 new).
[AutoFeat] Training final classification model.
[AutoFeat] Trained model: largest coefficients:
[-



best params: {'min_samples_leaf': 0.0001}
best score: -0.041212487684729064
MSE on training data: 0.0048429577464788745
MSE on test data: 0.05978888888888889
R^2 on training data: 0.9915670639032815
R^2 on test data: 0.9130343434343434


In [None]:
for dsname in datasets:
    print("####", dsname)
    test_autofeat(dsname, feateng_steps=2)

In [None]:
# use feature selector method to extract features...