### Train Model and log performance on test dataset in MLflow

#### Input
* train_X.parquet
* train_Y.parquet
* test_X.parquet
* test_Y.parquet

#### Output
* params
* metrics
* model

In [1]:
import os
import subprocess

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import explained_variance_score, mean_squared_error

# params -- everything a reader might want to tune lives here.

# Seed and training fraction handed to the SplitData step.
random_seed, percent_train = 123, 0.8

# Method handed to the CreateEncoder step; one of {"mean", "median"}.
method = "mean"

# DecisionTreeRegressor settings.
# criterion is one of {"mse", "friedman_mse", "mae"}.
# NOTE(review): newer scikit-learn releases rename "mse"/"mae" -- confirm the
# installed version before upgrading.
criterion, max_depth = "mse", 1000

In [2]:
# file and directory info
# The project root defaults to the original author's checkout; set the
# DS_PROJECT_DIR environment variable to run the notebook from another
# machine without editing this cell.
project_dir = os.environ.get(
    "DS_PROJECT_DIR",
    '/Users/chou/Desktop/mlflow_dvc_cookiecutter/DSProjectTemplate',
)
# Sub-folders are appended to project_dir by plain string concatenation, so
# the leading and trailing slashes are significant.
interim_folder = "/data/interim/"
processed_folder = "/data/processed/"
src_folder = "/src/"

# functions
def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(explained_variance, mse)`` as computed by scikit-learn's
        ``explained_variance_score`` and ``mean_squared_error``.
    """
    return (
        explained_variance_score(actual, pred),
        mean_squared_error(actual, pred),
    )

# Run the upstream pipeline scripts in order.
# Each step is launched with an argument list (no shell), so the script path
# is passed through verbatim, and with check=True so that a non-zero exit
# raises subprocess.CalledProcessError instead of being silently ignored --
# otherwise a failed step would go unnoticed and the following steps would
# run on whatever files already exist on disk.

# LoadData
script_path = project_dir + src_folder + "01_LoadData_Python.py"
subprocess.run(["python", script_path], check=True)

# SplitData
# The numeric arguments keep their original "%f" formatting
# (e.g. "123.000000", "0.800000") so the R script receives the same argv.
script_path = project_dir + src_folder + "02_SplitData_R.R"
subprocess.run(
    ["Rscript", script_path, "%f" % random_seed, "%f" % percent_train],
    check=True,
)

# CreateEncoder
script_path = project_dir + src_folder + "03_CreateEncoder_R.R"
subprocess.run(["Rscript", script_path, "%s" % method], check=True)

# Feature
script_path = project_dir + src_folder + "04_Feature_R.R"
subprocess.run(["Rscript", script_path], check=True)

# Train
# input: feature/target tables read from the interim data folder
train_X = pd.read_parquet(project_dir + interim_folder + 'train_X.parquet')
train_Y = pd.read_parquet(project_dir + interim_folder + 'train_Y.parquet')
test_X = pd.read_parquet(project_dir + interim_folder + 'test_X.parquet')
test_Y = pd.read_parquet(project_dir + interim_folder + 'test_Y.parquet')
# Keep only the first column of the test target, as a 1-D NumPy array.
test_Y = test_Y.iloc[:,0].to_numpy()

# random_state is fixed so that repeated runs fit the same tree:
# scikit-learn permutes the candidate features at every split, so ties
# between equally good splits are otherwise broken differently on each run.
model = tree.DecisionTreeRegressor(
    criterion = criterion,
    max_depth = max_depth,
    random_state = random_seed,
)
model = model.fit(train_X, train_Y)

# Evaluate on the held-out test set.
test_pred = model.predict(test_X)
(exp_var, mse) = eval_metrics(test_Y, test_pred)

print("DecisionTreeRegressor model (criterion=%s, max_depth=%f):" % (criterion, max_depth))
print("  Explained Variance Score : %s" % exp_var)
print("  MSE: %s" % mse)

DecisionTreeRegressor model (criterion=mse, max_depth=1000.000000):
  Explained Variance Score : 0.02820683238069932
  MSE: 0.7612244897959184


In [3]:
# Display the test-set metrics as a (explained_variance, mse) tuple.
exp_var, mse

(0.02820683238069932, 0.7612244897959184)