In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

import mlflow


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Run `mlflow ui` in the working directory to browse the logged runs.

In [3]:
# Group every run from this notebook under one named experiment
# (MLflow creates it on first use if it does not exist yet).
EXPERIMENT_NAME = "IN-STK 5000"
mlflow.set_experiment(EXPERIMENT_NAME)

2023/10/02 13:32:16 INFO mlflow.tracking.fluent: Experiment with name 'IN-STK 5000' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/DIRH/code/uio/IN-STK5000-Autumn-2023-Materials/nbhome/mlruns/288397854828014395', creation_time=1696246336696, experiment_id='288397854828014395', last_update_time=1696246336696, lifecycle_stage='active', name='IN-STK 5000', tags={}>

In [4]:
# Load scikit-learn's bundled digits dataset; the cells below read its
# 'data' (features) and 'target' (labels) entries.
digits = load_digits()

In [5]:
# Hold out 20% of the samples for testing. test_size is given as a fraction:
# an integer would be interpreted as an absolute number of samples, which
# leaves far too few test images for test_accuracy to be comparable across runs.
# A fixed random_state makes every re-run (and therefore every logged MLflow
# run) use the same split.
RANDOM_STATE = 42
Xtrain, Xtest, ytrain, ytest = train_test_split(
    digits['data'],
    digits['target'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [6]:
# Turn on MLflow autologging so that the sklearn fits below are recorded
# without explicit logging calls.
mlflow.autolog()

2023/10/02 13:34:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [7]:
# Point of no return: from here on, going back and re-running earlier cells will make mlflow stop logging.

In [8]:
# First tracked run: a random forest with default hyperparameters.
# Nothing is logged by hand here; the fit happens inside an MLflow run.
with mlflow.start_run():
    forest = RandomForestClassifier()
    forest.fit(Xtrain, ytrain)

In [10]:
def log_accuracy(model):
    """Score ``model`` on the held-out split and log it as ``test_accuracy``.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.

    Notes
    -----
    Reads the notebook-level ``Xtest`` and ``ytest`` variables and records
    the result through ``mlflow.log_metric``. Returns ``None``.
    """
    predictions = model.predict(Xtest)
    score = accuracy_score(ytest, predictions)
    mlflow.log_metric('test_accuracy', score)

In [11]:
# Same default random forest as before, but this run also records the
# accuracy on the held-out test split.
with mlflow.start_run():
    forest = RandomForestClassifier()
    forest.fit(Xtrain, ytrain)
    log_accuracy(forest)

In [12]:
# Boosted ensemble for comparison: AdaBoost over depth-3 decision trees.
# NOTE: despite the name, `gbc` holds an AdaBoostClassifier.
with mlflow.start_run():
    weak_learner = DecisionTreeClassifier(max_depth=3)
    gbc = AdaBoostClassifier(weak_learner)
    gbc.fit(Xtrain, ytrain)
    log_accuracy(gbc)

In [13]:
# Manual sweep over the forest size: one MLflow run per candidate value,
# each logging its own test accuracy.
N_ESTIMATORS_CANDIDATES = (20, 50, 100, 150)
for n_est in N_ESTIMATORS_CANDIDATES:
    with mlflow.start_run():
        forest = RandomForestClassifier(n_estimators=n_est)
        forest.fit(Xtrain, ytrain)
        log_accuracy(forest)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

In [15]:
# Grid search over the forest's max_depth inside a single MLflow run.
with mlflow.start_run():
    # Feature selection (variance filter, then k-best) feeding a random forest;
    # every step uses its default settings.
    steps = [
        ("variance_th", VarianceThreshold()),
        ("k_best", SelectKBest()),
        ("random_forest", RandomForestClassifier()),
    ]
    model = Pipeline(steps)
    # The "<step name>__<parameter>" key targets the forest step of the pipeline.
    param = {'random_forest__max_depth': (20, 30, 40)}
    cv = GridSearchCV(model, param_grid=param)
    cv.fit(Xtrain, ytrain)

2023/10/02 13:56:29 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


In [16]:
# Second grid search, same pipeline, this time varying the number of trees.
with mlflow.start_run():
    model = Pipeline(
        steps=[
            ("variance_th", VarianceThreshold()),
            ("k_best", SelectKBest()),
            ("random_forest", RandomForestClassifier()),
        ]
    )
    # The "<step name>__<parameter>" key targets the forest step of the pipeline.
    param = {'random_forest__n_estimators': (20, 30, 40)}
    # fit() returns the fitted search object, so `cv` ends up as the fitted GridSearchCV.
    cv = GridSearchCV(estimator=model, param_grid=param).fit(Xtrain, ytrain)

2023/10/02 13:57:36 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
