In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

def create_model(random_state=None):
    """Build the (unfitted) grid search over the preprocessing + model pipeline.

    The pipeline bundles a column-selecting preprocessor with a
    ``RandomForestClassifier`` so the model is always applied to features that
    were prepared the same way as during training.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``RandomForestClassifier``. ``None`` keeps the previous
        unseeded behaviour; pass an int to make the search reproducible.

    Returns
    -------
    GridSearchCV
        Unfitted 3-fold search over ``max_depth`` and ``n_estimators``. Several
        metrics are recorded, but the final refit is chosen by F1.
    """
    # Local import: keeps this function usable without touching other cells' imports.
    from sklearn.base import BaseEstimator, TransformerMixin

    class PreprocessDF(BaseEstimator, TransformerMixin):
        """Stateless transformer that selects the model's feature columns.

        Inheriting from ``BaseEstimator``/``TransformerMixin`` provides
        ``get_params``/``set_params`` and ``fit_transform`` so the step can be
        cloned by ``GridSearchCV`` through the regular estimator API.
        """

        def __init__(self):
            # Fixes both which columns are used and their order.
            self.needed_columns = [
                "bedrooms",
                "bathrooms",
                "cleaning_fee",
                "available_days",
                "blocked_days",
                "occupancy_rate",
                "reservation_days",
                "adr_usd",
                "number_of_reservation",
                "num_neighbours",
                "dist_from_bc",
            ]

        def fit(self, df, y=None):
            """Nothing to learn; present to satisfy the transformer protocol."""
            return self

        def transform(self, input_df, y=None):
            """Return the needed columns of ``input_df`` cast to float32.

            Column selection followed by ``astype`` already yields a new
            frame, so the caller's DataFrame is never modified and no
            up-front copy of the whole input is required.
            """
            return input_df[self.needed_columns].astype('float32')

    # Keeping both steps in one pipeline guarantees that the model and the
    # preprocessing it needs always travel together.
    model = Pipeline(steps=[
        ('preprocess', PreprocessDF()),
        ('classifier', RandomForestClassifier(random_state=random_state)),
    ])

    search_params = {
        'classifier__criterion': ['gini'],
        'classifier__max_depth': [20, 30],
        'classifier__n_estimators': [10, 80],
    }
    # The best model is selected with f1; the other metrics are only monitored.
    clf = GridSearchCV(
        model,
        search_params,
        scoring=['f1', 'accuracy', 'balanced_accuracy', 'precision', 'recall', 'roc_auc'],
        refit='f1',
        cv=3,
    )
    return clf

In [5]:
import os

import pandas as pd

# Location of the training set. The default is the original development path;
# set the TRAINING_DATASET_PATH environment variable to run the notebook on
# another machine without editing this cell.
TRAINING_DATASET_PATH = os.environ.get(
    "TRAINING_DATASET_PATH",
    "/home/gianmaria/repos/airbnb-bc/src/feature_store/feature_repo/data/training_dataset.parquet",
)

training_df = pd.read_parquet(TRAINING_DATASET_PATH)
# Unfitted grid search; it is fitted later inside the MLflow run.
clf = create_model()

In [6]:
import warnings
import tempfile
import os
import mlflow
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.model_selection import GridSearchCV

# Reuse the experiment when it already exists, otherwise create it.
experiment_name = 'drivers'
existing_exp = mlflow.get_experiment_by_name(experiment_name)
if not existing_exp:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = existing_exp.experiment_id

# ISO timestamp without fractional seconds; ':' is replaced by '.' so the
# value can also be used inside a file name.
timestamp = datetime.now().isoformat().split(".")[0].replace(":", ".")
with mlflow.start_run(experiment_id=experiment_id, run_name=timestamp) as run:
    # The full frame (target included) is passed as X; the pipeline's
    # preprocessing step keeps only the feature columns it needs.
    clf.fit(training_df, training_df['target'])
    cv_results = clf.cv_results_
    best_index = clf.best_index_

    # Log mean and std of every test metric for the best parameter combination.
    for score_name in [score for score in cv_results if "mean_test" in score]:
        # Swap only the leading "mean_test" marker so that a scorer whose own
        # name contains "mean" still maps to the right std_test_* key.
        std_name = score_name.replace("mean_test", "std_test", 1)
        mlflow.log_metric(score_name, cv_results[score_name][best_index])
        mlflow.log_metric(std_name, cv_results[std_name][best_index])

    filename = "%s-%s-cv_results.csv" % ('RandomForest', timestamp)
    # The context manager creates the directory and removes it (with the CSV)
    # once the artifact has been logged, so nothing is left behind on disk.
    with tempfile.TemporaryDirectory() as tempdir:
        csv = os.path.join(tempdir, filename)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            pd.DataFrame(cv_results).to_csv(csv, index=False)

        mlflow.log_artifact(csv, "cv_results")

In [36]:
import pandas as pd
import mlflow
from mlflow.entities import ViewType

def get_best_model(experiment_name, metric='mean_test_f1',
                   tracking_uri="http://127.0.0.1:5000", return_model=False):
    """Rank the runs of an experiment and load the best available model.

    Runs are sorted by ``metrics.<metric>`` in descending order. Starting from
    the best run, the ``best_estimator`` artifact is loaded; runs for which
    loading fails are reported and skipped, and the search stops at the first
    model that loads.

    Parameters
    ----------
    experiment_name : str
        Name of the MLflow experiment to search.
    metric : str, default='mean_test_f1'
        Metric (without the ``metrics.`` prefix) used for ranking.
    tracking_uri : str, default="http://127.0.0.1:5000"
        MLflow tracking server to query.
    return_model : bool, default=False
        When true, return ``(runs_df, model)`` instead of only ``runs_df``.
        ``model`` is ``None`` if no run had a loadable model.

    Returns
    -------
    pandas.DataFrame or tuple
        The sorted runs table, optionally paired with the loaded model.

    Raises
    ------
    KeyError
        If the experiment has no ``metrics.<metric>`` column to sort by.
    """
    mlflow.set_tracking_uri(tracking_uri)
    runs_df = mlflow.search_runs(experiment_names=[experiment_name], run_view_type=ViewType.ALL)
    runs_df = runs_df.sort_values(by=[f'metrics.{metric}'], ascending=False)

    model = None
    # Not every run carries a model artifact, so walk down the ranking until
    # one loads instead of downloading the artifacts of every single run.
    for run_id in runs_df['run_id']:
        try:
            model = mlflow.sklearn.load_model(f"runs:/{run_id}/best_estimator")
        except Exception:
            # Best effort: skip this run. KeyboardInterrupt/SystemExit are
            # deliberately not caught so the user can still abort the cell.
            print("cannot find the model")
            continue
        break

    if return_model:
        return runs_df, model
    return runs_df

In [37]:
import mlflow 
client = mlflow.tracking.MlflowClient()

# search_runs expects experiment IDs, not names, so resolve the name first.
experiment = client.get_experiment_by_name("test1")
if experiment is None:
    raise LookupError("MLflow experiment 'test1' does not exist")

# Ask the server for only the single run with the highest mean test F1.
runs = client.search_runs(
    [experiment.experiment_id],
    "",
    order_by=["metrics.mean_test_f1 DESC"],
    max_results=1,
)
if not runs:
    raise LookupError("MLflow experiment 'test1' has no runs")
best_run = runs[0]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

cannot find the model


Index(['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time',
       'end_time', 'metrics.mean_test_accuracy', 'metrics.mean_test_roc_auc',
       'metrics.std_test_recall', 'metrics.mean_test_f1',
       'metrics.std_test_precision', 'metrics.mean_test_precision',
       'metrics.mean_test_balanced_accuracy',
       'metrics.std_test_balanced_accuracy', 'metrics.std_test_f1',
       'metrics.mean_test_recall', 'metrics.std_test_accuracy',
       'metrics.std_test_roc_auc', 'tags.mlflow.source.name',
       'tags.mlflow.source.type', 'tags.mlflow.runName', 'tags.mlflow.user',
       'tags.mlflow.source.git.commit'],
      dtype='object')

In [33]:
# Load the `best_estimator` artifact of one specific, hard-coded run.
mlflow.sklearn.load_model("runs:/55e01656e01846dd8dff0a44dcbbfb95/best_estimator")

import mlflow.pyfunc

# Load a fixed version of a registered model through the generic pyfunc flavour.
model_name, model_version = "sk-learn-random-forest-clf-model", 1

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{model_version}",
)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
# Preview the first five rows of the ranked runs table for experiment 'test1'.
get_best_model(experiment_name='test1').head(n=5)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.mean_test_accuracy,metrics.mean_test_roc_auc,metrics.std_test_recall,metrics.mean_test_f1,...,metrics.std_test_balanced_accuracy,metrics.std_test_f1,metrics.mean_test_recall,metrics.std_test_accuracy,metrics.std_test_roc_auc,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.runName,tags.mlflow.user,tags.mlflow.source.git.commit
0,55e01656e01846dd8dff0a44dcbbfb95,318192827015530035,FINISHED,mlflow-artifacts:/318192827015530035/55e01656e...,2024-05-24 12:16:53.940000+00:00,2024-05-24 12:16:58.058000+00:00,0.968307,0.849688,0.00373,0.983876,...,0.01606,0.001518,0.995582,0.002908,0.041247,/home/gianmaria/repos/airbnb-bc/src/train_mode...,LOCAL,2024-05-24T14.16.53,gianmaria,fc1138078a17591d349fc8ada54cf4401a16a8f3
