documentation: https://github.com/microsoft/FLAML

### Set the logging level
You can configure the logging level to suppress unnecessary outputs to keep the logs cleaner.

In [10]:
import logging
import warnings
 
logging.getLogger('synapse.ml').setLevel(logging.CRITICAL)
logging.getLogger('mlflow.utils').setLevel(logging.CRITICAL)
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)

### Set up MLflow experiment tracking
<!-- MLflow is an open source platform that is deeply integrated into the Data Science experience in Fabric and allows to easily track and compare the performance of different models and experiments without the need for manual tracking. -->

In [13]:


import mlflow

# Set the MLflow experiment to "automl-tutorial" and enable automatic logging
mlflow.set_experiment("automl-tutorial")

2024/10/04 17:41:45 INFO mlflow.tracking.fluent: Experiment with name 'automl-tutorial' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/derne/OneDrive%20-%20The%20Pennsylvania%20State%20University/Programming/Git/AutoML%20App/AutoML%20tut/Flaml/mlruns/348856216409950599', creation_time=1728078105707, experiment_id='348856216409950599', last_update_time=1728078105707, lifecycle_stage='active', name='automl-tutorial', tags={}>

Loading datam

In [1]:
# load data
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
X = df.drop('logS', axis=1)
y = df['logS']
df.head()



Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion,logS
0,2.5954,167.85,0.0,0.0,-2.18
1,2.3765,133.405,0.0,0.0,-2.0
2,2.5938,167.85,1.0,0.0,-1.74
3,2.0289,133.405,1.0,0.0,-1.48
4,2.9189,187.375,1.0,0.0,-3.04


Data splitting

In [2]:
from sklearn.model_selection import train_test_split

# Assuming you have your features in X and target variable in y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
X_train, X_test, y_train, y_test 

(      MolLogP    MolWt  NumRotatableBonds  AromaticProportion
 643   3.61260  296.410                0.0            0.272727
 722   6.48760  226.448               13.0            0.000000
 570   3.15380  284.746                1.0            0.600000
 66    0.77880   74.123                2.0            0.000000
 1002  1.82140  102.177                3.0            0.000000
 ...       ...      ...                ...                 ...
 1044  2.82960  253.305                1.0            0.631579
 1095  2.10750  218.322                3.0            0.000000
 1130  4.14820  335.282                7.0            0.260870
 860   4.25720  275.179                4.0            0.352941
 1126  6.25676  368.369                6.0            0.692308
 
 [915 rows x 4 columns],
       MolLogP    MolWt  NumRotatableBonds  AromaticProportion
 218   1.55740  102.177                3.0            0.000000
 809   5.20590  345.653                4.0            0.571429
 501   4.18090  339.218     

Building a Classification Model with Flaml

In [6]:
from flaml import AutoML
automl = AutoML()
# automl.fit(X_train, y_train, task="classification")

In [14]:
settings = {
    "time_budget": 120,  # total running time in seconds
    "metric": 'accuracy',  # check the documentation for options of metrics (https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#optimization-metric)
    "task": 'classification',  # task type
    "seed": 42,    # random seed
}
    

In [15]:
'''The main flaml automl API'''
# with mlflow.start_run(run_name="flight_delays_baseline"):
automl.fit(X_train=X_train, y_train=y_train, **settings)
     

[flaml.automl.logger: 10-04 17:42:29] {1728} INFO - task = classification
[flaml.automl.logger: 10-04 17:42:29] {1739} INFO - Evaluation method: cv


[flaml.automl.logger: 10-04 17:42:35] {1838} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 10-04 17:42:35] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 10-04 17:42:35] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-04 17:42:38] {2393} INFO - Estimated sufficient time budget=34389s. Estimated necessary time budget=796s.
[flaml.automl.logger: 10-04 17:42:38] {2442} INFO -  at 9.5s,	estimator lgbm's best error=0.2451,	best estimator lgbm's best error=0.2451
[flaml.automl.logger: 10-04 17:42:38] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-04 17:42:42] {2442} INFO -  at 13.9s,	estimator lgbm's best error=0.2451,	best estimator lgbm's best error=0.2451
[flaml.automl.logger: 10-04 17:42:42] {2258} INFO - iteration 2, current learner sgd
[flaml.automl.logger: 10-04 17:43:45] {2442} INFO -  at 76.5s,	estimator sgd's best err

Best model and metric

In [16]:
'''retrieve best config and best learner'''
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 49, 'num_leaves': 4, 'min_child_samples': 12, 'learning_rate': 0.08570310532994721, 'log_max_bin': 6, 'colsample_bytree': 0.8871559629536413, 'reg_alpha': 0.02960826033957992, 'reg_lambda': 0.512318588788703}
Best accuracy on validation data: 0.8555
Training duration of best run: 9.012 s


Model saving and prediction

In [17]:
model_path = f"runs:/{automl.best_run_id}/model"

# Register the model to the MLflow registry
registered_model = mlflow.register_model(model_uri=model_path, name="flight_delays_baseline")

# Print the registered model's name and version
print(f"Model '{registered_model.name}' version {registered_model.version} registered successfully.")
     

Successfully registered model 'flight_delays_baseline'.


MlflowException: Run 'None' not found

Predict with saved model

In [18]:
loaded_model = mlflow.sklearn.load_model(f"models:/{registered_model.name}/{registered_model.version}")

y_pred = loaded_model.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
y_pred_proba = automl.predict_proba(X_test)[:,1]

NameError: name 'registered_model' is not defined

In [None]:
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test.astype(float)))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(float)))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(float)))
     

In [5]:
#  restrict the learners and use FLAML as a fast hyperparameter tuning tool for XGBoost, LightGBM, Random Forest etc
automl.fit(X_train, y_train, task="classification", estimator_list=["lgbm"])

[flaml.automl.logger: 10-04 17:15:55] {1728} INFO - task = classification
[flaml.automl.logger: 10-04 17:15:55] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-04 17:16:02] {1838} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 10-04 17:16:02] {1955} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 10-04 17:16:34] {2685} INFO - retrain lgbm for 31.6s
[flaml.automl.logger: 10-04 17:16:34] {2688} INFO - retrained model: LGBMClassifier(colsample_bytree=0.6103565306428956,
               learning_rate=0.10182098014295998, max_bin=31,
               min_child_samples=21, n_estimators=1, n_jobs=-1, num_leaves=225,
               reg_alpha=0.0009765625, reg_lambda=40.413729576022625,
               verbose=-1)
[flaml.automl.logger: 10-04 17:16:34] {1985} INFO - fit succeeded
[flaml.automl.logger: 10-04 17:16:34] {1986} INFO - Time taken to find the best model: 0


In [None]:
# run generic hyperparameter tuning for a custom function.

from flaml import tune
tune.run(evaluation_function, config={…}, low_cost_partial_config={…}, time_budget_s=3600)