In [1]:
import mlflow
from mlflow.models.signature import infer_signature

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostRegressor

from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Where MLflow stores its runs. The value below is for running MLflow locally
# (a SQLite file). If the MLflow server was deployed with docker-compose, use
# "http://localhost:7777" instead.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "yield_prediction"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location=('/Users/mufin/Documents/data-science-collection/Topic 12 - '
 'MLFlow/notebooks/mlruns/1'), creation_time=1684586236073, experiment_id='1', last_update_time=1684586236073, lifecycle_stage='active', name='yield_prediction', tags={}>

In [3]:
# Read the training set, discard the `id` column, and derive the `fruit_seed`
# feature as the product of `fruitset` and `seeds`.
train = (
    pd.read_csv('../data/train.csv')
    .drop(columns=['id'])
    .assign(fruit_seed=lambda frame: frame["fruitset"] * frame["seeds"])
)
train.head()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield,fruit_seed
0,25.0,0.5,0.25,0.75,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146,13.796231
1,25.0,0.5,0.25,0.5,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201,15.063846
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.7776,21.200199
3,12.5,0.25,0.25,0.63,0.5,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.7759,22.337712
4,25.0,0.5,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417,23.467926


In [4]:
# Shared 10-fold splitter; shuffled with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Separate the `yield` target from the feature columns.
y = train['yield'].copy()
X = train.drop(columns=['yield'])

# Features are left unscaled; an earlier StandardScaler experiment was disabled.

# Every distinct target value in ascending order (passed to the cross-validation
# helper so predictions can be snapped onto observed target values).
unique_targets = np.sort(y.unique())

In [5]:
def find_closest_value(sorted_list, values):
    """Snap each entry of ``values`` to the nearest entry of ``sorted_list``.

    Parameters
    ----------
    sorted_list : array-like
        Candidate values in ascending order (``np.searchsorted`` requires a
        sorted input).
    values : array-like
        Query values to snap. Lists, NumPy arrays and pandas Series are all
        accepted; they are read positionally.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``values`` whose entries are drawn from
        ``sorted_list``. Queries below the first candidate map to the first
        candidate, queries above the last map to the last, and an exact tie
        between two neighbours resolves to the larger neighbour.

    Notes
    -----
    The result takes its dtype from ``sorted_list`` rather than from
    ``values``, so integer queries no longer truncate fractional candidates.
    """
    candidates = np.asarray(sorted_list)
    queries = np.asarray(values)

    # Position of the first candidate that is >= each query.
    insert_at = np.searchsorted(candidates, queries, side='left')

    # Clamp both neighbour indices into range; at either end of the candidate
    # list the two neighbours collapse onto the same boundary element.
    last = len(candidates) - 1
    lower = candidates[np.clip(insert_at - 1, 0, last)]
    upper = candidates[np.clip(insert_at, 0, last)]

    # Strict "<" keeps the tie-break on the upper neighbour.
    return np.where(queries - lower < upper - queries, lower, upper)

In [6]:
def cross_val_score(model, unique_targets=None, cv = cv, label = ''):
    """Cross-validate ``model`` on the global ``train`` frame and report MAE.

    The features are every column of ``train`` except ``yield``, which is the
    target. For each split produced by ``cv`` the model is fitted on the
    training part and scored with mean absolute error on both parts.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so on return it holds the fit from the final fold.
    unique_targets : array-like, optional
        Sorted target values. When given, predictions are snapped to the
        nearest of these values with ``find_closest_value`` before scoring.
    cv : splitter
        Object exposing ``split(X, y)``; defaults to the module-level ``cv``.
    label : str
        Free text appended to the printed summary line.

    Returns
    -------
    tuple of float
        ``(val_mae_mean, val_mae_std, train_mae_mean, train_mae_std)`` computed
        across folds.
    """
    X = train.copy()
    y = X.pop('yield')

    # Per-fold mean absolute errors.
    train_mae, val_mae = [], []

    # Fit on each training split, then score MAE on the training and validation splits.
    for train_idx, val_idx in cv.split(X, y):
        X_fit, y_fit = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(X_fit, y_fit)

        train_preds = model.predict(X_fit)
        val_preds = model.predict(X_val)

        if unique_targets is not None:
            train_preds = find_closest_value(unique_targets, train_preds)
            val_preds = find_closest_value(unique_targets, val_preds)

        train_mae.append(mean_absolute_error(y_fit, train_preds))
        val_mae.append(mean_absolute_error(y_val, val_preds))

    print(f'Val MAE: {np.mean(val_mae):.5f} ± {np.std(val_mae):.5f} | Train MAE: {np.mean(train_mae):.5f} ± {np.std(train_mae):.5f} | {label}')

    return np.mean(val_mae), np.std(val_mae), np.mean(train_mae), np.std(train_mae)

In [7]:
# Input/output schema inferred from the training frame; it is attached to every
# model logged to MLflow further down.
signature = infer_signature(
    model_input=train.drop(columns=['yield']),
    model_output=train['yield'],
)

In [8]:
# Sweep the random-forest tree depth, recording one MLflow run per setting.
for max_depth in [5, 7, 10, 15, None]:
    with mlflow.start_run(run_name=f'rf_max_depth_{max_depth}'):
        rf_params = {
            'n_estimators': 50,
            'max_depth': max_depth,
            'min_samples_leaf': 20,
            'max_features': None,
            'random_state': 42
        }

        mlflow.set_tag("model_name", "Random Forest")
        mlflow.log_params(rf_params)

        rf = RandomForestRegressor(**rf_params)
        # The depth is part of the label so each printed score line can be
        # matched to the run that produced it.
        cv_val_mean, cv_val_std, cv_train_mean, cv_train_std = cross_val_score(
            rf, unique_targets, cv, label=f'rf (max_depth={max_depth})'
        )

        mlflow.log_metrics({
            "val_mae": cv_val_mean,
            "val_std": cv_val_std,
            "train_mean": cv_train_mean,
            "train_std": cv_train_std,
        })
        # cross_val_score refits `rf` on every fold, so the model logged here
        # is the fit from the last fold's training split.
        mlflow.sklearn.log_model(rf, "sk_models", signature=signature)

Val MAE: 357.25408 ± 11.27634 | Train MAE: 351.82351 ± 0.93820 | rf




Val MAE: 352.31000 ± 11.00260 | Train MAE: 338.20675 ± 0.92843 | rf
Val MAE: 352.00899 ± 10.84233 | Train MAE: 324.31472 ± 0.98148 | rf
Val MAE: 352.54551 ± 11.00847 | Train MAE: 318.72795 ± 0.91843 | rf
Val MAE: 352.53632 ± 10.98340 | Train MAE: 318.58366 ± 0.90571 | rf


In [9]:
# Single MLflow run for a CatBoost regressor trained and scored on MAE.
with mlflow.start_run(run_name='cb'):
    cb_params = dict(
        n_estimators=250,
        depth=10,
        learning_rate=0.09,
        random_strength=0.2,
        grow_policy='Lossguide',
        bootstrap_type='Bayesian',
        eval_metric='MAE',
        loss_function='MAE',
        random_state=42,
        silent=True,
    )

    mlflow.set_tag("model_name", "CatBoost")
    mlflow.log_params(cb_params)

    cb = CatBoostRegressor(**cb_params)
    cv_val_mean, cv_val_std, cv_train_mean, cv_train_std = cross_val_score(
        cb, unique_targets, cv, label='cb'
    )

    # Record the cross-validation summary, then the model itself.
    mlflow.log_metrics({
        "val_mae": cv_val_mean,
        "val_std": cv_val_std,
        "train_mean": cv_train_mean,
        "train_std": cv_train_std,
    })
    mlflow.catboost.log_model(cb, "sk_models", signature=signature)

Val MAE: 341.84799 ± 10.74362 | Train MAE: 307.16137 ± 1.16816 | cb


In [10]:
# Single MLflow run for a voting ensemble of a random forest and CatBoost.
with mlflow.start_run(run_name='mixed'):
    rf_params = {
        'n_estimators': 50,
        'max_depth': 10,
        'min_samples_leaf': 20,
        'max_features': None,
        'random_state': 42
    }
    
    cb_params = {
        'n_estimators': 250,
        'depth': 10,
        'learning_rate': 0.09,
        'random_strength': 0.2,
        'grow_policy': 'Lossguide',
        'bootstrap_type': 'Bayesian',
        'eval_metric': 'MAE',
        'loss_function': 'MAE',
        'random_state': 42,
        'silent': True
    }

    models = [
        ('rf', RandomForestRegressor(**rf_params)),
        ('cb', CatBoostRegressor(**cb_params)),
    ]
    voter = VotingRegressor(models)

    mlflow.set_tag("model_name", "Mixed")

    # Both dicts define `n_estimators` and `random_state`, so merging them into
    # one flat dict would overwrite one estimator's values with the other's.
    # Prefix each key with its estimator name (the `<name>__<param>` form)
    # so every hyperparameter is logged.
    mixed_params = {f'rf__{key}': value for key, value in rf_params.items()}
    mixed_params.update({f'cb__{key}': value for key, value in cb_params.items()})
    mlflow.log_params(mixed_params)

    cv_val_mean, cv_val_std, cv_train_mean, cv_train_std = cross_val_score(
        voter, unique_targets, cv, label='mixed'
    )

    mlflow.log_metrics({
        "val_mae": cv_val_mean,
        "val_std": cv_val_std,
        "train_mean": cv_train_mean,
        "train_std": cv_train_std,
    })
    # cross_val_score refits `voter` on every fold, so the model logged here
    # is the fit from the last fold's training split.
    mlflow.sklearn.log_model(voter, "sk_models", signature=signature)

Val MAE: 343.63727 ± 10.82868 | Train MAE: 313.35962 ± 1.08404 | cb
