# Description

After you identify a set of tables that may be used as the input for your model, the next step is to train your model.

Databricks has a managed MLflow solution that helps manage the life cycle of your ML models.
MLflow helps you with:
- **Experiment tracking**: Run experiments with any ML library, framework, or language, and automatically keep track of parameters, metrics, code, and models from each experiment.
- **Model management**: Use one central place to discover and share ML models, and collaborate on moving them from experimentation to online testing and production.
- **Model deployment**: Quickly deploy production models for batch inference on Apache Spark™ or as REST APIs using built-in integrations.

# Boilerplate

## Dependencies

In [0]:
import mlflow
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup, FeatureFunction
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Parameters

In [0]:
# Register the notebook's text widgets, each with an empty default value.
# Entries are (widget name, label shown in the widget bar).
for _widget_name, _widget_label in (
    ("catalog_name", "00 - Catalog Name"),
    ("schema_name", "01 - Schema Name"),
):
    dbutils.widgets.text(_widget_name, "", _widget_label)

In [0]:
# Read every widget value into a dict keyed by widget name.
params = dbutils.widgets.getAll()

# Fail fast on unset widgets. An explicit exception is raised instead of using
# `assert` (assertions are skipped when Python runs with -O), and every empty
# parameter is reported at once rather than only the first one found.
empty_params = [key for key, value in params.items() if value == ""]
if empty_params:
  raise ValueError("; ".join(f"Parameter {key} is empty" for key in empty_params))

# Expose each widget value as a notebook-level variable (e.g. `catalog_name`),
# which the later cells reference directly. `globals()` states that intent
# explicitly instead of relying on `locals()` aliasing the global namespace.
globals().update(params)

# Main

## Build Spine

In [0]:
# Build the labelled spine: one row per customer and billing month, labelled by
# how that month's invoice moved relative to the same customer's previous one
# ('Increase' for >= +10%, 'Decrease' for <= -10%, otherwise 'Same').
#
# The denominator is wrapped in NULLIF so a previous invoice of 0 yields a NULL
# pct_diff (labelled 'Same') instead of raising DIVIDE_BY_ZERO when ANSI SQL
# mode is enabled; with ANSI mode off the result is unchanged.
#
# NOTE(review): the WHERE clause only drops the globally earliest billing month.
# A customer whose first invoice is later than that has a NULL lm_invoice and is
# therefore labelled 'Same' -- confirm whether those rows should be excluded.
df_spine = spark.sql(f"""
  WITH a1 AS (
    SELECT
      customer_id
      , billing_month
      , total_invoice
      , LAG(total_invoice) OVER (PARTITION BY customer_id ORDER BY billing_month) lm_invoice
    FROM {catalog_name}.{schema_name}.invoices
  ), a2 AS (
    SELECT
      *
      , (total_invoice - lm_invoice) / NULLIF(lm_invoice, 0) AS pct_diff
    FROM a1
  )

  SELECT
    customer_id
    , billing_month
    , CASE
        WHEN pct_diff >= 0.1 THEN 'Increase'
        WHEN pct_diff <= -0.1 THEN 'Decrease'
        ELSE 'Same'
        END invoice_change
  FROM a2
  WHERE 1=1
    AND billing_month > (SELECT min(billing_month) FROM a2)
  """)

# Split the spine on its latest billing month: rows strictly before it form the
# training spine, rows in the latest month itself form the scoring spine.
# `{df_spine}` is bound by spark.sql to the DataFrame passed as a keyword argument.
_training_spine_query = """
  SELECT
    *
  FROM {df_spine}
  WHERE billing_month < (SELECT max(billing_month) FROM {df_spine})"""

_scoring_spine_query = """
  SELECT
    *
  FROM {df_spine}
  WHERE billing_month = (SELECT max(billing_month) FROM {df_spine})"""

df_training_spine = spark.sql(_training_spine_query, df_spine=df_spine)
df_scoring_spine = spark.sql(_scoring_spine_query, df_spine=df_spine)

# Preview the first ten rows of each split.
for _preview_df in (df_training_spine, df_scoring_spine):
  _preview_df.limit(10).display()

## Build Training Set

In [0]:
# Feature Engineering client used by the cells below.
fe = FeatureEngineeringClient()

# Fully qualified name of the customer feature table.
feature_table = f"{catalog_name}.{schema_name}.customer_features"

# Both date-derived features call the same UDF. It must live in Unity Catalog,
# hence the three-level (catalog.schema.function) name.
_day_difference_udf = f"{catalog_name}.{schema_name}.day_difference"

# (column bound to the UDF's `from_date` argument, name of the output feature)
_day_difference_features = (
    ("birth_date", "days_old"),
    ("customer_since", "days_as_customer"),
)

feature_lookups = [
    # Prematerialized features, looked up by customer_id with billing_month as
    # the timestamp lookup key.
    FeatureLookup(
        table_name=f"{catalog_name}.{schema_name}.customer_weekly_features",
        lookup_key="customer_id",
        timestamp_lookup_key="billing_month",
    ),
    # Demographic attributes, looked up by customer_id only.
    FeatureLookup(
        table_name=f"{catalog_name}.{schema_name}.customer_demographics",
        lookup_key="customer_id",
    ),
    # On-demand features: day difference between each source date and billing_month.
    *(
        FeatureFunction(
            udf_name=_day_difference_udf,
            input_bindings={"from_date": _from_column, "to_date": "billing_month"},
            output_name=_output_name,
        )
        for _from_column, _output_name in _day_difference_features
    ),
]

In [0]:
# Assemble the training set from the *training* spine only. The full `df_spine`
# also contains the latest billing month, which is reserved for
# `df_scoring_spine`; training on it would leak the scoring period into the model.
training_set = fe.create_training_set(
    df=df_training_spine,
    feature_lookups=feature_lookups,
    label="invoice_change",
    # Drop lookup keys and the raw date columns that only feed the feature
    # functions, plus `location`, so they are not used as model inputs.
    exclude_columns=["birth_date", "customer_since", "customer_id", "billing_month", "location"]
)

# Materialize the joined features as a Spark DataFrame, then convert to pandas
# for scikit-learn.
training_df = training_set.load_df()
training_pdf = training_df.toPandas()

## Train Model

### Single Model

In [0]:
# Fit a baseline random forest with scikit-learn's default hyperparameters.
# X holds every column except the label; y is the label column.
_label_column = "invoice_change"
X = training_pdf.drop(columns=[_label_column])
y = training_pdf[_label_column]

# fit() returns the fitted estimator itself, so construction and training chain.
model = RandomForestClassifier().fit(X, y)

### Grid Hyperparameter Search

In [0]:
# Candidate values per hyperparameter; the search evaluates every combination.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator that the search clones for each candidate.
rf = RandomForestClassifier()

# Exhaustive search with 5-fold cross-validation, using all available cores
# (n_jobs=-1) and per-fit progress messages (verbose=2).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X, y)

# Keep the winning estimator and the hyperparameters that produced it.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [0]:
#mlflow.start_run(run_name="RandomForest_GridSearch")
#mlflow.log_params(best_params)
#mlflow.log_metric("accuracy", grid_search.best_score_)
#mlflow.sklearn.log_model(best_model, "model")
#mlflow.end_run()

## Log Model

In [0]:
# Log the tuned model through the Feature Engineering client, passing the
# training set it was built from, and register it under a fixed name.
# NOTE(review): the registered name is one-level; if the model registry is
# Unity Catalog a three-level `catalog.schema.model` name may be required -- confirm.
_log_model_args = {
  "model": best_model,
  "artifact_path": "model",
  "flavor": mlflow.sklearn,
  "training_set": training_set,
  "registered_model_name": "invoice_increase_model",
}
fe.log_model(**_log_model_args)