In [0]:
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from pyspark.sql.functions import *


### Installing the requirements

In [1]:
!pip install xgboost    

In [11]:
!pip install mlflow 

## Register a model via MLflow

In [1]:
import numpy as np
from xgboost import XGBRegressor
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
import math
from sklearn.metrics import r2_score
from mlflow import MlflowClient


def RRMSE(true, pred):
    """Relative root-mean-squared error of ``pred`` against ``true``.

    Evaluates ``sqrt(sum((true - pred)**2) / sum(pred**2))``. Note that the
    normalising term is the sum of squared *predictions*, not of the true
    values.
    """
    residual_energy = np.sum(np.square(true - pred))
    prediction_energy = np.sum(np.square(pred))

    return np.sqrt(residual_energy / prediction_energy)



def model_register(data_collector, model_name):
    """Tune, train, log and register an XGBoost regressor with MLflow.

    Steps performed inside a single MLflow run named ``model_name`` (in the
    ``"ML_Exp"`` experiment):

    1. Grid-search ``max_depth``, ``learning_rate`` and ``n_estimators`` with
       a 5-split ``TimeSeriesSplit`` cross-validation.
    2. Fit a fresh ``XGBRegressor`` with the best parameter combination.
    3. Log training-set metrics (rmse, rrmse, mape, mae, r2).
    4. Log the model and register it under ``model_name``.

    Parameters
    ----------
    data_collector : mapping
        Must provide the training features under ``'X_train'`` and the
        training target under ``'y_train'``.
    model_name : str
        Used as the run name, the artifact sub-path and the registered
        model name.

    Returns
    -------
    None
        The function only has side effects (MLflow logging and printing).
    """
    # Local import: neither name is imported by any cell of this notebook,
    # so referencing them without this would raise NameError.
    from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

    experiment_name = "ML_Exp"
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=model_name):

        mlflow.autolog()

        tsp = TimeSeriesSplit(n_splits=5)
        client = MlflowClient()

        X_train = data_collector['X_train']
        y_train = data_collector['y_train']

        # XGBoost's tree-depth parameter is `max_depth`; a key named `depth`
        # is not a recognised parameter, so the depth would never be tuned.
        parameters = {
            'max_depth': [8, 10],
            'learning_rate': [0.01, 0.1],
            'n_estimators': [100, 500, 1000],
        }
        grid = GridSearchCV(estimator=XGBRegressor(), param_grid=parameters,
                            cv=tsp, n_jobs=-1)
        grid.fit(X_train, y_train)

        # Train the final model on the full training set with the winning
        # hyper-parameters.
        model = XGBRegressor(**grid.best_params_)
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)

        # All metrics below are computed on the training data.
        rmse = math.sqrt(mean_squared_error(y_train, train_pred))
        rrmse = RRMSE(y_train, train_pred)
        mae = mean_absolute_error(y_train, train_pred)
        r2 = r2_score(y_train, train_pred)
        mape = mean_absolute_percentage_error(y_train, train_pred)

        mlflow.log_metric('rmse', rmse)
        mlflow.log_metric('rrmse', rrmse)
        mlflow.log_metric('mape', mape)
        # mae and r2 are computed alongside the others, so record them too.
        mlflow.log_metric('mae', mae)
        mlflow.log_metric('r2', r2)

        mlflow.sklearn.log_model(model,
                                 artifact_path="XGBoost_models/%s" % model_name,
                                 registered_model_name="%s" % model_name)

        vf = client.get_latest_versions(model_name, stages=["None"])
        print("The version of {} is {}".format(model_name, vf[0].version))
        print('model %s has been registered ' % model_name, '\n')


        

In [2]:

def prepare_Xy(df):
    """Split a DataFrame into features ``X`` and the ``'target'`` column ``y``.

    The input frame is left untouched, so the function can be called more
    than once on the same DataFrame. (Removing the column in place with
    ``pop`` would make a second call fail with ``KeyError``.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column named ``'target'``.

    Returns
    -------
    (X, y)
        ``X`` is a new DataFrame with every column except ``'target'``;
        ``y`` is the ``'target'`` column.
    """
    y = df['target']
    X = df.drop(columns=['target'])

    return X, y


## Get the latest version of the model and the last update date (year-month-day)

In [0]:
from collections import defaultdict
import numpy as np
import time
from mlflow.tracking.client import MlflowClient
from mlflow.entities.model_registry.model_version_status import ModelVersionStatus
import mlflow
from collections import defaultdict
import numpy as np

def fetch_model_last_update(model_name):
    """Return the date extracted from the latest model version's log history.

    Looks up the latest version of ``model_name`` in stage ``"None"``, loads
    the run that produced it, and parses a ``YYYY-MM-DD`` prefix out of the
    run's ``mlflow.log-model.history`` tag.

    Parameters
    ----------
    model_name : str
        Name of the registered model.

    Returns
    -------
    datetime.date
        The parsed date.

    Raises
    ------
    IndexError
        If no version is returned for the model.
    """
    # Local import so this cell does not depend on a later cell having
    # imported `date` first.
    from datetime import date

    client = MlflowClient()

    vf = client.get_latest_versions(model_name, stages=["None"])

    run_id = vf[0].run_id
    run = mlflow.get_run(run_id)
    # Bind the tag value: the parsing below reads it through `DT`.
    DT = run.data.tags['mlflow.log-model.history']

    # Positional parse: third-from-last piece when splitting on '":"', first
    # 10 characters.
    # NOTE(review): assumes that piece starts with a YYYY-MM-DD timestamp;
    # this depends on the key order of the tag's JSON — confirm.
    DT_split = DT.split('":"')[-3][:10]
    REG_DATE = DT_split.split('-')

    print('Model:', model_name, '  last training date:', REG_DATE[0]+'-'+REG_DATE[1]+'-'+REG_DATE[2])

    model_last_update = date(int(REG_DATE[0]), int(REG_DATE[1]), int(REG_DATE[2]))

    return model_last_update



## Load the latest version of the model and predict on it

In [10]:
from mlflow import MlflowClient
from datetime import date, datetime
import warnings
warnings.simplefilter(action='ignore')
import time, threading



def test_prediction( df_test, model_name ):
    """Score the latest registered version of ``model_name`` on ``df_test``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Evaluation data; it is split into features and target by
        ``prepare_Xy``.
    model_name : str
        Name of the registered model. Its latest version in stage ``"None"``
        is loaded through ``mlflow.pyfunc``.

    Returns
    -------
    (mape, r2)
        Mean absolute percentage error and R² of the predictions.
    """
    client = MlflowClient()

    # Look the version up by `model_name`; no other model identifier is
    # available in this function's scope.
    vf = client.get_latest_versions(model_name, stages=["None"])

    model_test = mlflow.pyfunc.load_model(model_uri="models:/%s/%s" % (model_name, vf[0].version))

    X_test, y_test = prepare_Xy(df_test)

    y_pred = np.array(model_test.predict(X_test))

    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mape, r2

## Detect model drift

In [9]:
import mlflow
import numpy as np
from xgboost import XGBRegressor
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
import math
from sklearn.metrics import r2_score
from mlflow import MlflowClient


def model_exist(model_name):
    """Report whether ``model_name`` can be looked up in the model registry.

    Parameters
    ----------
    model_name : str
        Name of the registered model to look for.

    Returns
    -------
    bool
        ``True`` if the registry lookup succeeds, ``False`` if MLflow raises
        an ``MlflowException`` for it.
    """
    from mlflow.exceptions import MlflowException

    client = MlflowClient()

    try:
        print('existence check ...')
        client.get_latest_versions(model_name, stages=["None"])

    # Catch only MLflow's own error. A bare `except` would also swallow
    # programming errors such as NameError and report every model as missing.
    except MlflowException:
        print(model_name, "***** Model does not exist ****** ", '\n')

        return False

    return True



def update_data():
     """Execute the ``./make_data_ready`` notebook via the ``%run`` magic.

     Presumably this refreshes the dataset that ``fetch_new_data`` re-reads
     afterwards — confirm against ``make_data_ready``.

     NOTE(review): some platforms require ``%run`` to be the only content of
     a cell; confirm that the magic actually executes from inside a function
     on the target platform.
     """
     %run ./make_data_ready
     


def fetch_new_data():
    """Refresh the dataset and return the rows that changed.

    Reads the CSV data at ``/dbfs/mnt/data`` with Spark, calls
    ``update_data()``, reads the same location again and compares the two
    snapshots as pandas DataFrames.

    Returns
    -------
    (new_records, updated_dataset)
        ``updated_dataset`` is the snapshot read after the refresh.
        ``new_records`` holds the rows that appear in only one of the two
        snapshots. If the snapshots are equal, the old snapshot is returned
        in its place.
    """
    data_path = "/dbfs/mnt/data"
    read_options = dict(header='True', inferSchema='True', delimiter=',')

    # Convert to pandas BEFORE the refresh. Spark DataFrames are evaluated
    # lazily, so an unmaterialised "old" frame could end up reflecting the
    # refreshed files.
    old_dataset = spark.read.options(**read_options).csv(data_path).toPandas()

    print('old data has been loaded ...')

    update_data()

    updated_dataset = spark.read.options(**read_options).csv(data_path).toPandas()

    print('updated data has been loaded  ...')

    if not updated_dataset.equals(old_dataset):

        # keep=False drops every row that occurs more than once in the
        # concatenation, leaving only rows unique to one snapshot. Both
        # operands must be pandas frames for pd.concat to work.
        new_records = pd.concat([updated_dataset, old_dataset]).drop_duplicates(keep=False)

        print('new data records ready ...')

        return new_records, updated_dataset

    return old_dataset, updated_dataset



def model_drift_detector(df, model_name, mape_threshold=0.3, r2_threshold=0.85):
    """Evaluate the registered model on fresh data and retrain it on drift.

    Fetches the refreshed dataset, scores the latest registered version of
    ``model_name`` on the new records, and re-registers a newly trained model
    when the scores cross the thresholds.

    Parameters
    ----------
    df : any
        Not used by this function; kept so existing calls keep working.
    model_name : str
        Name of the registered model to evaluate and, if needed, retrain.
    mape_threshold : float, default 0.3
        Drift is flagged when the test MAPE is greater than this value.
    r2_threshold : float, default 0.85
        Drift is flagged when the test R² is lower than this value.

    Returns
    -------
    None
    """
    # fetch_new_data takes no arguments; it reads the data itself.
    new_records, updated_dataset = fetch_new_data()

    test_mape, test_r2 = test_prediction(new_records, model_name)

    print('Test MAPE:', test_mape, '  Test R2:', test_r2)

    if test_mape > mape_threshold or test_r2 < r2_threshold:

        print('Drift detected: model should be retrained...')

        # model_register reads 'X_train' / 'y_train' from its first argument,
        # so split the refreshed frame into that shape first.
        X_train, y_train = prepare_Xy(updated_dataset)
        model_register({'X_train': X_train, 'y_train': y_train}, model_name)

    else:

        print('No drift detected ...', '\n')

        
      
     


    
# Run one drift check.
# NOTE(review): `df` and `model_name` are not defined in the visible cells;
# confirm they are set before this cell is executed.
model_drift_detector(df, model_name)
    

