# Auto-ML using FLAML

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from ucimlrepo import fetch_ucirepo
from sklearn.datasets import load_digits, load_breast_cancer

from ydata_profiling import ProfileReport
from flaml import AutoML, automl
from flaml.automl.data import get_output_from_log
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

from helpers.base_imports import *

## Gather datasets

In [3]:
# Dictionary of datasets: name -> (features, target)
datasets = {}

# UCI datasets
for name, id in [("abalone", 1), ("iris", 53), ("wine", 109)]:
    data = fetch_ucirepo(id=id).data
    df = pd.concat([data.features, data.targets], axis=1)
    datasets[name] = df

# sklearn.datasets: Digits
digits = load_digits(as_frame=True)
df_digits = digits.frame  # Already includes 'target' column
datasets["digits"] = df_digits

# sklearn.datasets: Breast Cancer
bc = load_breast_cancer(as_frame=True)
df_bc = bc.frame  # Already includes 'target' column
datasets["breast_cancer"] = df_bc

## Setup FLAML run

Configurable parameters are task (classification/regression), metric (accuracy, f1, etc.), and time limit/budget (in seconds), estimator_list (rf, xgboost, etc), log_file (to save the results), and n_splits (number of folds for cross-validation). The default is 5-fold cross-validation.

In [8]:
def run_flaml_automl(
    df, target_column, task="classification", time_budget=15, dataset_name="dataset"
):
    # 1. Split into features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # 2. Train/test split (80/20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # 3. Create FLAML AutoML instance
    automl = AutoML()

    log_path = f"data/{dataset_name}_flaml.log"

    # 4. Fit AutoML model with configuration
    automl.fit(
        X_train=X_train,
        y_train=y_train,
        task=task,
        time_budget=time_budget,
        log_file_name=log_path,  # Enable logging so we can extract val_loss
    )

    # 5. Evaluate on test set
    y_pred = automl.predict(X_test)
    if task == "classification":
        score = accuracy_score(y_test, y_pred)
        print(f"[{dataset_name}] Accuracy: {score:.4f}")
    elif task == "regression":
        score = r2_score(y_test, y_pred)
        print(f"[{dataset_name}] R² Score: {score:.4f}")

    print(f"[{dataset_name}] Best estimator: {automl.best_estimator}")
    print(f"[{dataset_name}] Best config: {automl.best_config}")
    print(f"[{dataset_name}] Best validation loss: {automl.best_loss:.4f}")

    rows = []
    with open(log_path, "r") as f:
        for line in f:
            rec = json.loads(line)
            rows.append(
                {
                    "iteration": rec.get("record_id"),
                    "val_loss": rec.get("validation_loss"),
                    "learner": rec.get("learner", "unknown"),
                    "runtime": rec.get("trial_time"),  # or .get("wall_clock_time")
                    "sample_size": rec.get("sample_size"),
                    "hyperparams": rec.get("config", {}),
                }
            )

    df_results = (
        pd.DataFrame(rows)
        .sort_values("val_loss")
        .loc[
            :,
            [
                "iteration",
                "val_loss",
                "learner",
                "runtime",
                "sample_size",
                "hyperparams",
            ],
        ]
    )

    # 7. Save
    df_results.to_csv(f"data/automl_{dataset_name}.csv", index=False)
    df_results.to_html(f"data/automl_{dataset_name}.html", index=False)

    print(
        f"[{dataset_name}] Trials with val_loss, learner, runtime, sample_size and hyperparams saved."
    )
    return df_results

### Run on datasets

In [9]:
# Run FLAML on abalone dataset

# based on the eda the abalone dataset is best suited for regression since the target column is continuous
abalone_res = run_flaml_automl(
    df=datasets["abalone"],
    target_column="Rings",
    task="regression",
    time_budget=15,
    dataset_name="abalone",
)
abalone_res

[flaml.automl.logger: 05-07 17:17:22] {1728} INFO - task = regression
[flaml.automl.logger: 05-07 17:17:22] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 05-07 17:17:22] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 05-07 17:17:22] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd']
[flaml.automl.logger: 05-07 17:17:22] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 05-07 17:17:22] {2393} INFO - Estimated sufficient time budget=416s. Estimated necessary time budget=3s.
[flaml.automl.logger: 05-07 17:17:22] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.7882,	best estimator lgbm's best error=0.7882
[flaml.automl.logger: 05-07 17:17:22] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 05-07 17:17:22] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.7882,	best estimator lgbm's best error=0.7882
[flaml.automl.logger: 05-07 17:17:22] {2258} IN

Unnamed: 0,iteration,val_loss,learner,runtime,sample_size,hyperparams
10,10.0,0.445999,rf,0.281269,3132.0,"{'n_estimators': 34, 'max_features': 0.6599465..."
9,9.0,0.450047,lgbm,0.342309,3132.0,"{'n_estimators': 79, 'num_leaves': 4, 'min_chi..."
8,8.0,0.451225,lgbm,0.654422,3132.0,"{'n_estimators': 54, 'num_leaves': 13, 'min_ch..."
7,7.0,0.455054,lgbm,0.113279,3132.0,"{'n_estimators': 21, 'num_leaves': 4, 'min_chi..."
6,6.0,0.470249,rf,0.196618,3132.0,"{'n_estimators': 6, 'max_features': 0.77544484..."
5,5.0,0.484117,rf,0.189885,3132.0,"{'n_estimators': 7, 'max_features': 0.83996488..."
4,4.0,0.494909,lgbm,0.069021,3132.0,"{'n_estimators': 9, 'num_leaves': 4, 'min_chil..."
3,3.0,0.618633,rf,0.174744,3132.0,"{'n_estimators': 4, 'max_features': 1.0, 'max_..."
2,2.0,0.629099,lgbm,0.039539,3132.0,"{'n_estimators': 4, 'num_leaves': 4, 'min_chil..."
1,1.0,0.706939,sgd,0.244366,3132.0,"{'penalty': 'l2', 'alpha': 0.0001, 'l1_ratio':..."


In [10]:
# Run FLAML on breast cancer dataset

# based on the eda the breast cancer dataset is best suited for classification since the target column is categorical

bc_results = run_flaml_automl(
    df=datasets["breast_cancer"],
    target_column="target",
    task="classification",
    time_budget=15,
    dataset_name="breast_cancer",
)
bc_results

[flaml.automl.logger: 05-07 17:18:28] {1728} INFO - task = classification
[flaml.automl.logger: 05-07 17:18:28] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 05-07 17:18:28] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 05-07 17:18:28] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 05-07 17:18:28] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 05-07 17:18:28] {2393} INFO - Estimated sufficient time budget=399s. Estimated necessary time budget=9s.
[flaml.automl.logger: 05-07 17:18:28] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.0361,	best estimator lgbm's best error=0.0361
[flaml.automl.logger: 05-07 17:18:28] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 05-07 17:18:28] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.0262,	best estimator lgbm's best error=0.0262
[flaml.automl.logger: 05-07 17

Unnamed: 0,iteration,val_loss,learner,runtime,sample_size,hyperparams
7,7.0,0.006834,lgbm,0.103078,426.0,"{'n_estimators': 11, 'num_leaves': 8, 'min_chi..."
6,6.0,0.007835,xgb_limitdepth,0.148194,426.0,"{'n_estimators': 12, 'max_depth': 6, 'min_chil..."
5,5.0,0.010476,xgboost,0.075668,426.0,"{'n_estimators': 4, 'max_leaves': 4, 'min_chil..."
4,4.0,0.01377,xgboost,0.075475,426.0,"{'n_estimators': 4, 'max_leaves': 4, 'min_chil..."
3,3.0,0.019136,lgbm,0.041527,426.0,"{'n_estimators': 4, 'num_leaves': 4, 'min_chil..."
2,2.0,0.01918,lgbm,0.045569,426.0,"{'n_estimators': 7, 'num_leaves': 4, 'min_chil..."
1,1.0,0.026202,lgbm,0.036277,426.0,"{'n_estimators': 4, 'num_leaves': 4, 'min_chil..."
0,0.0,0.036109,lgbm,0.03942,426.0,"{'n_estimators': 4, 'num_leaves': 4, 'min_chil..."
8,,,unknown,,,{}
