In [None]:
from fast_automl.baseline import BaselineRegressor

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# example uses the bundled diabetes regression dataset instead.
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Features and target as plain arrays.
X, y = load_diabetes(return_X_y=True)
# Default hold-out split; no random_state is given, so the split (and the
# resulting score) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
reg = BaselineRegressor().fit(X_train, y_train)
# Last expression of the cell: the held-out score is shown as the cell output.
reg.score(X_test, y_test)

In [None]:
from fast_automl.baseline import BaselineClassifier

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load the digits dataset as (features, labels) arrays.
X, y = load_digits(return_X_y=True)
# Hold-out split stratified on the labels; no random_state is set, so the
# split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
# Fit the baseline on the training part only.
clf = BaselineClassifier().fit(X_train, y_train)
# Last expression of the cell: the held-out score is shown as the cell output.
clf.score(X_test, y_test)

In [None]:
from fast_automl.linear_model import ConstrainedLinearRegression

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn. The constraint value below may
# be tuned to this dataset - TODO confirm before swapping in another one.
X, y = load_boston(return_X_y=True)
# shuffle=True is train_test_split's default; no random_state is set, so the
# split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
reg = ConstrainedLinearRegression(constraint=.8).fit(X_train, y_train)
# Held-out score of the constrained model.
print(reg.score(X_test, y_test))
# Sum of the fitted coefficients; presumably this should equal the constraint
# passed above (0.8) - confirm against fast_automl.linear_model.
print(reg.coef_.sum())

In [None]:
from fast_automl.linear_model import Ridge

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# example uses the bundled diabetes regression dataset instead.
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Features and target as plain arrays.
X, y = load_diabetes(return_X_y=True)
# shuffle=True is train_test_split's default; no random_state is set, so the
# split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
# Ridge here is the fast_automl.linear_model class imported at the top of the
# cell, not sklearn.linear_model.Ridge.
reg = Ridge().fit(X_train, y_train)
# Last expression of the cell: the held-out score is shown as the cell output.
reg.score(X_test, y_test)

In [None]:
from fast_automl.utils import ColumnSelector

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Toy frame with two features. The target below depends on x1 alone
# (y = 1.5 + 0.5 * x1), so x0 carries no extra information.
X = pd.DataFrame({
    'x0': [-1, -2, 1, 2],
    'x1': [-1, -1, 1, 1]
})
y = np.array([1, 1, 2, 2])

# Pipeline: ColumnSelector(['x1']) followed by an ordinary least-squares fit.
# Presumably the selector passes only column 'x1' on to the regressor -
# confirm against fast_automl.utils.
reg = make_pipeline(
    ColumnSelector(['x1']),
    LinearRegression()
).fit(X, y)
# Score on the same data the pipeline was fitted on (in-sample).
reg.score(X, y)

In [None]:
from fast_automl.utils import ColumnRemover

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Same toy data as the ColumnSelector example: the target depends on x1 alone
# (y = 1.5 + 0.5 * x1).
X = pd.DataFrame({
    'x0': [-1, -2, 1, 2],
    'x1': [-1, -1, 1, 1]
})
y = np.array([1, 1, 2, 2])

# Pipeline: ColumnRemover(['x0']) followed by an ordinary least-squares fit.
# Presumably the remover drops column 'x0' and passes the rest on - confirm
# against fast_automl.utils.
reg = make_pipeline(
    ColumnRemover(['x0']),
    LinearRegression()
).fit(X, y)
# Score on the same data the pipeline was fitted on (in-sample).
reg.score(X, y)

In [None]:
from fast_automl.cv_estimators import RandomForestClassifierCV

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score, train_test_split

# Load the digits dataset as (features, labels) arrays.
X, y = load_digits(return_X_y=True)
# Stratified hold-out split; no random_state is set, so results vary per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True)
# Note that n_jobs is passed to fit() here, not to the constructor.
clf = RandomForestClassifierCV().fit(X_train, y_train, n_jobs=-1)
# Re-evaluate the selected estimator on the training split with
# cross_val_score's default CV settings and report the mean fold score.
print('Cross val score: {:.4f}'.format(cross_val_score(clf.best_estimator_, X_train, y_train).mean()))
# Score of the fitted search object on the held-out split.
print('Test score: {:.4f}'.format(clf.score(X_test, y_test)))

In [None]:
from fast_automl.ensemble import ClassifierWeighter

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from copy import deepcopy

# Load the breast-cancer dataset and make a stratified hold-out split
# (no random_state, so results vary per run).
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True)

# Base classifiers fitted on the full training split. probability=True is
# needed for SVC to expose predict_proba, which is used below.
svc = SVC(probability=True).fit(X_train, y_train)
knn = KNeighborsClassifier().fit(X_train, y_train)

# The splitter owns a RandomState instance. Each cross_val_predict call below
# receives its own deep copy, presumably so that both classifiers are
# evaluated on identical folds.
cv = StratifiedKFold(random_state=np.random.RandomState(), shuffle=True)
# Out-of-fold class probabilities for each base classifier, stacked as
# (n_classifiers, n_samples, n_classes) and transposed to
# (n_samples, n_classes, n_classifiers).
X_meta = np.array([
    cross_val_predict(clf, X_train, y_train, cv=deepcopy(cv), method='predict_proba')
    for clf in (svc, knn)
]).transpose(1, 2, 0)
# Fit the weighter on the out-of-fold probabilities.
weighter = ClassifierWeighter().fit(X_meta, y_train)

# Test-set probabilities from the fully fitted base classifiers, arranged in
# the same (n_samples, n_classes, n_classifiers) layout as X_meta.
X_meta_test = np.array([
    clf.predict_proba(X_test) for clf in (svc, knn)
]).transpose(1, 2, 0)
# Fitted coefficients of the weighter (presumably one weight per base
# classifier - confirm against fast_automl.ensemble).
print(weighter.coef_)
# Held-out scores of the individual classifiers, for comparison.
print(svc.score(X_test, y_test))
print(knn.score(X_test, y_test))
# Last expression of the cell: held-out score of the weighted combination.
weighter.score(X_meta_test, y_test)

In [None]:
from fast_automl.ensemble import RFEVotingClassifierCV

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Load the breast-cancer dataset and make a stratified hold-out split
# (no random_state, so results vary per run).
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True)

# Candidate estimators are given as (name, estimator) pairs.
# probability=True makes SVC expose predict_proba.
clf = RFEVotingClassifierCV([
    ('rf', RandomForestClassifier()),
    ('knn', KNeighborsClassifier()),
    ('svm', SVC(probability=True))
]).fit(X_train, y_train)
# Re-evaluate the selected ensemble on the training split with
# cross_val_score's default CV settings and report the mean fold score.
print('CV score: {:.4f}'.format(cross_val_score(clf.best_estimator_, X_train, y_train).mean()))
# Score of the fitted object on the held-out split.
print('Test score: {:.4f}'.format(clf.score(X_test, y_test)))

In [None]:
from fast_automl.ensemble import RFEVotingRegressorCV

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# example uses the bundled diabetes regression dataset instead.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Features and target as plain arrays, then a hold-out split
# (no random_state, so results vary per run).
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

# Candidate estimators are given as (name, estimator) pairs.
reg = RFEVotingRegressorCV([
    ('rf', RandomForestRegressor()),
    ('knn', KNeighborsRegressor()),
    ('svm', SVR())
]).fit(X_train, y_train)
# Re-evaluate the selected ensemble on the training split with
# cross_val_score's default CV settings and report the mean fold score.
print('CV score: {:.4f}'.format(cross_val_score(reg.best_estimator_, X_train, y_train).mean()))
# Score of the fitted object on the held-out split.
print('Test score: {:.4f}'.format(reg.score(X_test, y_test)))

In [None]:
from fast_automl.ensemble import StepwiseVotingClassifierCV

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Load the iris dataset and make a stratified hold-out split
# (no random_state, so results vary per run).
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True)

# Candidate estimators are given as (name, estimator) pairs.
# probability=True makes SVC expose predict_proba.
clf = StepwiseVotingClassifierCV([
    ('rf', RandomForestClassifier()),
    ('knn', KNeighborsClassifier()),
    ('svm', SVC(probability=True))
]).fit(X_train, y_train)
# Re-evaluate the selected ensemble on the training split with
# cross_val_score's default CV settings and report the mean fold score.
print('CV score: {:.4f}'.format(cross_val_score(clf.best_estimator_, X_train, y_train).mean()))
# Score of the fitted object on the held-out split.
print('Test score: {:.4f}'.format(clf.score(X_test, y_test)))

In [None]:
from fast_automl.ensemble import StepwiseVotingRegressorCV

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load the diabetes regression dataset and make a hold-out split
# (no random_state, so results vary per run).
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

# Candidate estimators are given as (name, estimator) pairs.
reg = StepwiseVotingRegressorCV([
    ('rf', RandomForestRegressor()),
    ('knn', KNeighborsRegressor()),
    ('svm', SVR())
]).fit(X_train, y_train)
# Re-evaluate the selected ensemble on the training split with
# cross_val_score's default CV settings and report the mean fold score.
print('CV score: {:.4f}'.format(cross_val_score(reg.best_estimator_, X_train, y_train).mean()))
# Score of the fitted object on the held-out split.
print('Test score: {:.4f}'.format(reg.score(X_test, y_test)))

In [None]:
%%time

from fast_automl.automl import AutoClassifier

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score, train_test_split

# Load the digits dataset and make a stratified hold-out split
# (no random_state, so results vary per run).
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y)

# clf, X_test and y_test defined here are reused by the roc_auc_score cell
# that follows.
clf = AutoClassifier(ensemble_method='stepwise', n_jobs=-1, verbose=True).fit(X_train, y_train)
# Re-evaluate the selected estimator on the training split with
# cross_val_score's default CV settings and report the mean fold score.
print('CV score: {:.4f}'.format(cross_val_score(clf.best_estimator_, X_train, y_train).mean()))
# Score of the fitted object on the held-out split.
print('Test score: {:.4f}'.format(clf.score(X_test, y_test)))

In [None]:
from fast_automl.metrics import roc_auc_score

# Relies on clf, X_test and y_test left in the kernel by the AutoClassifier
# cell above, so that cell must be run first. This is fast_automl's
# roc_auc_score (imported just above), called with the full predict_proba
# output rather than a single probability column.
roc_auc_score(y_test, clf.predict_proba(X_test))

In [None]:
%%time

from fast_automl.automl import AutoRegressor

from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score, train_test_split

# Load the diabetes regression dataset and make a hold-out split
# (no random_state, so results vary per run).
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

# Fit the automated regressor on the training split using all available cores.
reg = AutoRegressor(n_jobs=-1, verbose=True).fit(X_train, y_train)
# Re-evaluate the selected estimator on the training split with
# cross_val_score's default CV settings and report the mean fold score.
print('CV score: {:.4f}'.format(cross_val_score(reg.best_estimator_, X_train, y_train).mean()))
# Score of the fitted object on the held-out split.
print('Test score: {:.4f}'.format(reg.score(X_test, y_test)))

In [None]:
from fast_automl.metrics import check_scoring

import numpy as np
import pandas as pd
from scipy.stats import t as t_distribution
from sklearn.base import clone, is_classifier
from sklearn.model_selection import check_cv
from joblib import Parallel, delayed

def _compute_pairwise_diff(estimators, X, y, repetitions, cv, scoring=None, n_jobs=None):
    def fold_score(estimator, train_idx, test_idx):
        if isinstance(X, pd.DataFrame):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        else:
            X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        estimator.fit(X_train, y_train)
        return scoring(estimator, X_test, y_test)
    
    scoring = check_scoring(scoring, classifier=is_classifier(estimators[0]))
    cv = check_cv(cv, y=y, classifier=is_classifier(estimators[0]))
    if hasattr(cv, 'shuffle'):
        cv.shuffle = True
    scores = Parallel(n_jobs=n_jobs)(
        delayed(fold_score)(
            clone(est), train_idx, test_idx
        )
        for _ in range(repetitions)
        for train_idx, test_idx in cv.split(X)
        for name, est in estimators
    )
    scores = np.array(scores).reshape(-1, len(estimators))
    return np.array([
        scores[:,i]-scores[:,j]
        for i in range(len(estimators)) 
        for j in range(i+1, len(estimators))
    ]).T

def _collect_dataframe(estimators, pairwise_diff_mean, pairwise_diff_std, t_stats, p_vals):
    df = pd.DataFrame([
        {
            'Estimator1': estimators[i][0],
            'Estimator2': estimators[j][0]
        }
        for i in range(len(estimators))
        for j in range(i+1, len(estimators))
    ])
    df['PerformanceDifference'] = pairwise_diff_mean
    df['Std'] = pairwise_diff_std
    df['t-stat'] = t_stats
    df['p-value'] = p_vals
    return df

def corrected_repeated_kfold_cv_test(
        estimators, X, y, repetitions=10, cv=10, scoring=None, n_jobs=None
    ):
    # corrected repeated k-fold cv test
    cv = check_cv(cv, y=y, classifier=is_classifier(estimators[0]))
    pairwise_diff = _compute_pairwise_diff(estimators, X, y, repetitions, cv, scoring, n_jobs)
    # Nadeau and Bengio correction
    # https://www.cs.waikato.ac.nz/~eibe/pubs/bouckaert_and_frank.pdf
    pairwise_diff_var = pairwise_diff.var(ddof=1, axis=0)
    k = cv.get_n_splits(X)
    pairwise_diff_var *= 1./(k * repetitions) + 1./(k-1)
    # compute statistics
    pairwise_diff_mean = pairwise_diff.mean(axis=0)
    pairwise_diff_std = np.sqrt(pairwise_diff_var)
    t_stats = pairwise_diff_mean / pairwise_diff_std
    df = k * repetitions - 1
    p_vals = [t_distribution.sf(abs(t_stat), df)*2 for t_stat in t_stats]
    return _collect_dataframe(estimators, pairwise_diff_mean, pairwise_diff_std, t_stats, p_vals)

In [None]:
%%time

from fast_automl.test import corrected_repeated_kfold_cv_test

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# example uses the bundled diabetes regression dataset instead.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

X, y = load_diabetes(return_X_y=True)
# Two identically configured random forests are compared alongside ridge and
# SVR. They get distinct labels so every row of the result table identifies
# its pair unambiguously.
corrected_repeated_kfold_cv_test(
    [
        ('rf1', RandomForestRegressor()),
        ('rf2', RandomForestRegressor()),
        ('ridge', Ridge()),
        ('svm', SVR())
    ],
    X, y, n_jobs=-1
)

In [None]:
%%time

from fast_automl.test import r_by_k_cv_test

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# example uses the bundled diabetes regression dataset instead.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

X, y = load_diabetes(return_X_y=True)
# Estimators are passed as (name, estimator) pairs; the call's return value
# is the last expression of the cell and is shown as its output.
r_by_k_cv_test(
    [
        ('rf', RandomForestRegressor()),
        ('ridge', Ridge()),
        ('svm', SVR())
    ],
    X, y, n_jobs=-1
)

In [3]:
from fast_automl.utils import BoundRegressor

import numpy as np
from sklearn.linear_model import LinearRegression

# Tiny hand-made dataset: two training rows with two features each.
X_train = np.array([
    [1, 2],
    [7, 8]
])
# The second test row has a second feature (1000) far outside the range seen
# in training (2 to 8), which forces the linear model to extrapolate.
X_test = np.array([
    [3, 4],
    [5, 1000]
])
y_train = np.array([1.5, 7.5])
y_test = np.array([3.5, 5.5])

# Unbounded baseline: a plain LinearRegression. Its recorded test score below
# (about -123503) shows how badly the extrapolated prediction misses.
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

-123503.49999999988

In [4]:
# Reuses X_train, X_test, y_train and y_test from the previous cell and
# rebinds reg to the wrapped model. The recorded score (-1.0) matches R^2 for
# the predictions [3.5, 7.5] shown in the next cell.
reg = BoundRegressor(LinearRegression()).fit(X_train, y_train)
reg.score(X_test, y_test)

-1.0

In [6]:
# Uses reg (the BoundRegressor) and X_test from the cells above. The recorded
# second prediction (7.5) equals the largest training target, so predictions
# are presumably clipped to the training target range - confirm against
# fast_automl.utils.
reg.predict(X_test)

array([3.5, 7.5])