In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn


mlflow.set_tracking_uri('http://54.195.173.154:5000/')
mlflow.set_experiment('mlflow_dvc')

X = pd.read_csv('features.csv', usecols=['f1','f2','f3','f4'])
Y = pd.read_csv('label.csv', usecols=['label'])

with mlflow.start_run(run_name="experiment") as run: 
    # tracking run parameters
    mlflow.log_param("compute", 'local')
    mlflow.log_param("dataset", 'example')
    mlflow.log_param("dataset_version", '2.0')
    mlflow.log_param("dataset_path", 's3://score-journey-boluo/data/')
    mlflow.log_param("algo", 'random forest example')
    
    # tracking any additional hyperparameters for reproducibility
    n_estimators = 5
    mlflow.log_param("n_estimators", n_estimators)

    # train the model
    rf = RandomForestRegressor(n_estimators=n_estimators)
    rf.fit(X, Y)
    Y_pred = rf.predict(X)

    # automatically save the model artifact to the S3 bucket for later deployment
    mlflow.sklearn.log_model(rf, "rf-baseline-model")

    # log model performance using any metric
    mse = mean_squared_error(Y, Y_pred)
    mlflow.log_metric("mse", mse)
    
    mlflow.end_run()

2022/11/25 15:55:47 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_dvc' does not exist. Creating a new experiment.
  rf.fit(X, Y)


In [2]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-2.0.1-py3-none-any.whl (16.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting querystring-parser<2
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting pyarrow<11,>=4.0.0
  Downloading pyarrow-10.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.9/35.9 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting shap<1,>=0.40
  Downloading shap-0.41.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.4/572.4 kB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
Collecting sqlparse<1,>=0.4.0
  Using cached sqlparse-0.4.3-py3-none-any.whl (42 kB)
Collecting docker<7,>=4.0.0
  Using cached docker-6.0.1-py3-none-any.whl (147 kB)
Collecting 

Successfully installed Mako-1.2.4 alembic-1.8.1 databricks-cli-0.17.3 docker-6.0.1 gunicorn-20.1.0 mlflow-2.0.1 oauthlib-3.2.2 protobuf-4.21.9 pyarrow-10.0.1 querystring-parser-1.2.4 shap-0.41.0 slicer-0.0.7 sqlparse-0.4.3
