In [0]:
#!pip install --upgrade mlflow
#!pip install lightgbm
#!pip install databricks-cli --upgrade
import os
import shutil

import mlflow
import mlflow.sklearn

import numpy as np
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score,GridSearchCV

# Direct every MLflow tracking call made by this notebook to the "databricks" tracking URI.
mlflow.set_tracking_uri("databricks")

In [0]:
# Select the experiment used by the runs started in the cells below.
# The path here is a placeholder: replace it with your own workspace path.
mlflow.set_experiment("/Users/your_mail_address/demo")

Out[33]: <Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3545320327633234', creation_time=1686156183293, experiment_id='3545320327633234', last_update_time=1686390690111, lifecycle_stage='active', name='/Users/gamzakman@gmail.com/demo', tags={'mlflow.experiment.sourceName': '/Users/gamzakman@gmail.com/demo',
 'mlflow.experimentType': 'NOTEBOOK',
 'mlflow.ownerEmail': 'gamzakman@gmail.com',
 'mlflow.ownerId': '980833599660639'}>

In [0]:
# Where the source table lives and the format it is stored in
from pyspark.sql import functions as F
file_location = "/user/hive/warehouse/demo_model_variable"
file_type = "parquet"

# Reader settings that are only meaningful for CSV input
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The three options set below target CSV files; for other file types they are ignored.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

display(df)

day,distance,hour,temp,clouds,pressure,humidity,wind,rain,price
6.0,0.44,9,38.46,0.29,1022.25,0.76,7.68,0.0,5.0
0.0,1.08,6,44.85,0.89,1000.7,0.95,1.53,0.0,3.0
4.0,0.72,2,36.31,0.0,1012.37,0.68,6.85,0.0,3.5
1.0,3.24,3,44.18,0.99,1001.5,0.89,12.4,0.02575,3.5
1.0,3.24,3,44.25,1.0,1000.81,0.9,13.7,0.02575,3.5
2.0,1.76,10,33.13,0.14,991.19,0.84,5.66,0.0,5.0
2.0,1.76,10,32.7,0.16,991.4,0.85,6.39,0.0,5.0
2.0,1.76,10,33.17,0.13,991.18,0.84,5.6,0.0,5.0
2.0,1.76,10,33.2,0.13,991.17,0.84,5.56,0.0,5.0
2.0,1.76,10,33.2,0.13,991.17,0.84,5.56,0.0,5.0


In [0]:
# Collect the Spark DataFrame into a pandas DataFrame for scikit-learn style modelling.
merged_df = df.toPandas()

# 'price' is the regression target; every remaining column is used as a feature.
y = merged_df['price']
X = merged_df.drop(columns=['price'])

In [0]:
# LGBM MODEL
# Fit a LightGBM regressor on the full data, estimate its error with 5-fold
# cross-validation, and record parameters, metric and model in one MLflow run.
mlflow.sklearn.autolog()
learning_rate_ = 0.05
n_estimators_ = 200


with mlflow.start_run(run_name='LgbmModel'):
    # Build, configure and train the estimator in a single chained expression.
    lgbm_model = (
        LGBMRegressor(random_state=46)
        .set_params(learning_rate=learning_rate_, n_estimators=n_estimators_)
        .fit(X, y)
    )

    # The scorer yields negated MSE per fold: flip the sign, take the root of
    # each fold's value, then average the per-fold RMSEs.
    fold_neg_mse = cross_val_score(
        lgbm_model, X, y, cv=5, scoring='neg_mean_squared_error'
    )
    rmse = np.mean(np.sqrt(-fold_neg_mse))

    # Record the hyperparameters and the cross-validated error explicitly.
    for param_name, param_value in (
        ("learning_rate", learning_rate_),
        ("n_estimators", n_estimators_),
    ):
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric('RMSE', rmse)

    # Ensure the scratch directory exists, upload its contents as run
    # artifacts, then remove it.
    # NOTE(review): nothing in this cell writes into the directory, so unless
    # another cell populates it the upload is empty -- confirm this is intended.
    artifact_dir = "outputs"
    if not os.path.exists(artifact_dir):
        os.makedirs(artifact_dir)
    mlflow.log_artifacts(artifact_dir)
    shutil.rmtree(artifact_dir)

    # Store the fitted estimator under the 'LgbmModel' artifact path.
    mlflow.sklearn.log_model(lgbm_model, 'LgbmModel')





In [0]:
# RANDOMFOREST MODEL
# Fit a random-forest regressor on the full data, estimate its error with
# 5-fold cross-validation, and record parameters, metric and model in one
# MLflow run.
mlflow.sklearn.autolog()
n_estimators_ = 300
max_depth_ = 8


with mlflow.start_run(run_name='RandomForestModel'):
    # Build, configure and train the estimator in a single chained expression.
    rf_model = (
        RandomForestRegressor(random_state=46)
        .set_params(n_estimators=n_estimators_, max_depth=max_depth_)
        .fit(X, y)
    )

    # The scorer yields negated MSE per fold: flip the sign, take the root of
    # each fold's value, then average the per-fold RMSEs.
    fold_neg_mse = cross_val_score(
        rf_model, X, y, cv=5, scoring='neg_mean_squared_error'
    )
    rmse = np.mean(np.sqrt(-fold_neg_mse))

    # Record the hyperparameters and the cross-validated error explicitly.
    for param_name, param_value in (
        ("n_estimators", n_estimators_),
        ("max_depth", max_depth_),
    ):
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric('RMSE', rmse)

    # Ensure the scratch directory exists, upload its contents as run
    # artifacts, then remove it.
    # NOTE(review): nothing in this cell writes into the directory, so unless
    # another cell populates it the upload is empty -- confirm this is intended.
    artifact_dir = "outputs"
    if not os.path.exists(artifact_dir):
        os.makedirs(artifact_dir)
    mlflow.log_artifacts(artifact_dir)
    shutil.rmtree(artifact_dir)

    # Store the fitted estimator under the 'RandomForestModel' artifact path.
    mlflow.sklearn.log_model(rf_model, 'RandomForestModel')



