## Model monitoring: data drift detection

**Author:** Andrew Kruchko

**Contributors:** Thodoris Petropoulos

**Label:** Model Monitoring

**Scope**: The scope of this notebook is to provide instructions on how to detect data drift and replace the model in the deployment.

**Background**: Model performance depends on the data used for predictions. Data drift can lead to unreliable predictions, which is why it should be monitored and, once detected, acted on by replacing the model.

**Considerations**: The endpoint "deployments/{deployment_id}/featureDrift/" is not yet publicly documented and is only visible in this internal build of the docs. Its interface may still change in breaking ways. It is scheduled for release in version v2.21.

**Requirements:** Python 3.7; DataRobot API version 2.20

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import datarobot as dr
import yaml
import time
import json
import requests

from datetime import datetime

pd.options.display.max_columns = 100

#### Connect to DataRobot and Define Settings

In [2]:
# Initialise the DataRobot Python client from the YAML config file
# (the same file is read again below to obtain the raw token for REST calls).
dr.Client(config_path="../drconfig.yaml")

<datarobot.rest.RESTClientObject at 0x1101c7490>

In [3]:
# Load the raw credentials so the REST helpers in this notebook can
# authenticate on their own, independently of the Python client.
with open("../drconfig.yaml", 'r') as config_file:
    creds = yaml.safe_load(config_file)
token, base_url = creds['token'], creds['base_url']

# Settings shared by every project created in this notebook
project_name = '10K_Lending_Club_Loans'
target = 'is_bad'
metric = 'LogLoss'

#### 1. create data sources

In [4]:
def dr_rest_call(url, req_func, payload=None):
    """
    Call a DataRobot REST API endpoint with token authentication.

    url: endpoint path, appended verbatim to the global `base_url`
    req_func: `requests`-style callable (e.g. `requests.get`, `requests.post`)
    payload: optional dict; sent as query-string parameters for GET requests
        and as a JSON body for every other method
    return: whatever `req_func` returns (a `requests.Response` for `requests.*`)
    """
    headers = {'Authorization': f'Token {token}',
               'Content-Type': 'application/json;charset=UTF-8'}
    full_url = f'{base_url}{url}'

    # GET arguments belong in the query string; a JSON body attached to a GET
    # is typically ignored by the server, so e.g. `limit` would silently be lost.
    if getattr(req_func, '__name__', '') == 'get':
        return req_func(full_url, headers=headers, params=payload)
    return req_func(full_url, headers=headers, json=payload)

##### 1.1 find the Microsoft SQL Server JDBC driver among the predefined drivers

In [5]:
# Fetch the predefined external data drivers and decode the JSON listing
drivers = dr_rest_call('externalDataDrivers', requests.get).json()

# Keep [name, version, id] for every Microsoft SQL Server driver, in listing order
drivers_sql_serv = [
    [drv['canonicalName'], drv['version'], drv['id']]
    for drv in drivers['data']
    if 'Microsoft SQL Server' in drv['canonicalName']
]

In [6]:
# let's use the latest
# NOTE(review): taking the last entry assumes the API lists drivers oldest-first — confirm
if not drivers_sql_serv:
    # Fail with a clear message instead of an opaque IndexError
    raise RuntimeError('No Microsoft SQL Server JDBC driver found among the predefined drivers')
driver_sql_serv_id = drivers_sql_serv[-1][-1]

##### 1.2 create a data store

In [7]:
# JDBC connection fields: server address and database name come from the config
jdbc_fields = [{'name': 'address', 'value': creds['db_address']},
               {'name': 'databaseName', 'value': creds['db_name']}]

# Data store definition bound to the SQL Server driver chosen above
data = {'type': 'jdbc',
        'canonicalName': 'sql_server_lc',
        'params': {'driverId': driver_sql_serv_id, 'jdbcFields': jdbc_fields}}

data_store_resp = dr_rest_call('externalDataStores', requests.post, payload=data)

In [8]:
# get data store id
# Surface the API's own error message if the creation was rejected
# (presumably e.g. when the canonical name is already taken — verify),
# instead of failing later with KeyError: 'id'.
if not data_store_resp.ok:
    raise RuntimeError(
        f'Data store creation failed ({data_store_resp.status_code}): {data_store_resp.text}')
data_store = data_store_resp.json()
data_store_id = data_store['id']

##### 1.3 create data sources

In [9]:
# create a data source based on query
# The query keeps only rows with annual_inc <= 90000.
query = """
select *
from drdemodb1.cfds_demo.Credit_Analysis_Lending_Club
where annual_inc <= 90000
"""
# (commented-out alternative filter) where addr_state not in ('CA', 'FL')
data = {'type': 'jdbc',
        'canonicalName': 'Lending_Club_query',
        'params': {'dataStoreId': data_store_id,
                   'query': query}}

data_src_query_resp = dr_rest_call('externalDataSources', requests.post, payload=data)

# Report a rejected request with the API's message rather than KeyError: 'id'
if not data_src_query_resp.ok:
    raise RuntimeError(
        f'Data source creation failed ({data_src_query_resp.status_code}): {data_src_query_resp.text}')

data_src_query = data_src_query_resp.json()
data_src_query_id = data_src_query['id']

In [10]:
# create a data source based on table
# Unlike the query-based source, this one points at the whole table.
data = {'type': 'jdbc',
        'canonicalName': 'Lending_Club_table',
        'params': {'dataStoreId': data_store_id,
                   'schema': 'cfds_demo',
                   'table': 'Credit_Analysis_Lending_Club'}}

data_src_table_resp = dr_rest_call('externalDataSources', requests.post, payload=data)

# Report a rejected request with the API's message rather than KeyError: 'id'
if not data_src_table_resp.ok:
    raise RuntimeError(
        f'Data source creation failed ({data_src_table_resp.status_code}): {data_src_table_resp.text}')

data_src_table = data_src_table_resp.json()
data_src_table_id = data_src_table['id']

#### 2. modeling

In [11]:
def wait_for_proj_id(resp, max_wait=3600, poll_interval=10):
    """
    wait for the project creation
    return the project id

    resp: response of the project-creation POST; the URL in its `Location`
        header is polled until the returned JSON contains an `id`
    max_wait: maximum number of seconds to keep polling before giving up
    poll_interval: number of seconds to sleep between two polls
    raises RuntimeError: if there is no `Location` header to poll, a poll
        returns an HTTP error, or the reported status is ERROR / ABORTED
    raises TimeoutError: if no project id shows up within `max_wait` seconds
    """
    status_url = resp.headers.get('Location')
    if status_url is None:
        # Nothing to poll: the creation request itself was not accepted
        raise RuntimeError(
            f'Project creation was not accepted ({resp.status_code}): {resp.text}')

    headers = {'Authorization': f'Token {token}',
               'Content-Type': 'application/json;charset=UTF-8'}
    deadline = time.monotonic() + max_wait

    while True:
        resp_stat = requests.get(status_url, headers=headers)
        if not resp_stat.ok:
            raise RuntimeError(
                f'Polling project status failed ({resp_stat.status_code}): {resp_stat.text}')
        resp_stat = resp_stat.json()

        proj_id = resp_stat.get('id')
        if proj_id is not None:
            return proj_id

        # Stop on a terminal failure status instead of polling forever
        if str(resp_stat.get('status', '')).upper() in ('ERROR', 'ABORTED'):
            raise RuntimeError(f'Project creation failed: {resp_stat}')

        if time.monotonic() >= deadline:
            raise TimeoutError(
                f'Project was not created within {max_wait} seconds')

        time.sleep(poll_interval)

In [12]:
# Start a project from the query-based data source; the database
# credentials are passed along with the request.
data = dict(projectName=f'{project_name}_query',
            dataSourceId=data_src_query_id,
            user=creds['db_user'],
            password=creds['db_pass'])

project_resp = dr_rest_call('projects', requests.post, payload=data)

# Wait until the creation has produced a project id
project_id = wait_for_proj_id(project_resp)

In [13]:
# find the project through python API
# Fetch it directly by id: listing every project and filtering client-side is
# wasteful and ends in an IndexError when the id is not in the listing.
project = dr.Project.get(project_id)

In [14]:
# set target and run autopilot
# Uses the `target` and `metric` settings defined at the top of the notebook and
# starts Autopilot in QUICK mode.
# NOTE(review): worker_count=-1 is documented by the client as "request the
# maximum number of workers available" — confirm for the client version in use.
project.set_target(target=target,
                   mode=dr.enums.AUTOPILOT_MODE.QUICK,
                   metric=metric,
                   worker_count=-1)

Project(10K_Lending_Club_Loans_query)

In [None]:
# Block until Autopilot is done, so the recommendation requested in the next cell exists
project.wait_for_autopilot()

In [16]:
# Retrieve the model behind the project's recommendation
model = dr.ModelRecommendation.get(project.id).get_model()

In [17]:
# Cross-validation scores of the recommended model
for metric_name in ('AUC', 'LogLoss'):
    print(f'{metric_name}:', model.metrics[metric_name]['crossValidation'])

AUC: 0.683502
LogLoss: 0.37276000000000004


#### 3. deployment

In [18]:
def predict_deployment(data, datarobot_key, deployment_url, deployment_id):
    """
    Score `data` against a deployment through the prediction API.

    data: request body, sent as-is (the Content-Type header declares JSON in UTF-8,
        so the charset should match the contents)
    datarobot_key: value for the `datarobot-key` header
    deployment_url: base URL of the prediction server
    deployment_id: id of the deployment to score against
    return: the decoded JSON body of the response
    """
    endpoint = f'{deployment_url}/predApi/v1.0/deployments/{deployment_id}/predictions'
    request_headers = {
        'Content-Type': 'application/json; charset=UTF-8',
        'datarobot-key': datarobot_key,
    }

    # Basic auth with the username/token pair taken from the global `creds`
    response = requests.post(endpoint,
                             auth=(creds['username'], creds['token']),
                             data=data,
                             headers=request_headers)
    return response.json()

In [19]:
# deploy the model
# Creates a deployment for the recommended model on the prediction server whose
# id is stored under 'pred_serv_id' in the config file.
deployment = dr.Deployment.create_from_learning_model(model_id=model.id, 
                                                      label=f'{project_name}_clf_depl',
                                                      default_prediction_server_id=creds['pred_serv_id'])

In [20]:
# Enable feature drift tracking on the deployment; the featureDrift endpoint queried later relies on it
deployment.update_drift_tracking_settings(feature_drift_enabled=True)

In [21]:
# Collect what the prediction request needs: the DataRobot key and URL of the
# deployment's default prediction server, plus the deployment id
pred_server = deployment.default_prediction_server
dr_key, depl_url = pred_server['datarobot-key'], pred_server['url']
depl_id = deployment.id

In [22]:
# Load the scoring dataset and report its dimensions
df_scoring = pd.read_csv('data/10K_Lending_Club_Loans_annual_inc_above_90k.csv')
print(df_scoring.shape)

# Serialise the rows as a JSON array of records for the prediction API
scoring_records = df_scoring.to_dict(orient='records')
data_to_pred = json.dumps(scoring_records)

(1907, 34)


In [23]:
# Score the dataset, printing a timestamp before and after the request
print(datetime.now())
preds_raw = predict_deployment(data_to_pred, dr_key, depl_url, depl_id)
print(datetime.now())

2020-05-12 12:10:39.445159
2020-05-12 12:10:44.139796


In [26]:
# Warning
# The endpoint used below is not yet documented publicly and is only visible
# in this internal build of the docs.
# It is subject to further possible breaking interface changes.
# It is labeled for release in version: v2.21

# Ask the deployment for its feature drift statistics
data = {'limit': 10}
drift_resp = dr_rest_call(f'deployments/{deployment.id}/featureDrift/', requests.get, data)
drift_check = drift_resp.json()

In [27]:
# Keep only the features whose impact and drift score both exceed 0.5
drift_results = [
    feature
    for feature in drift_check['data']
    if feature['featureImpact'] > 0.5 and feature['driftScore'] > 0.5
]

In [28]:
# Show the features that passed both thresholds; a non-empty list triggers the replacement below
drift_results

[{'featureImpact': 0.602255298259249,
  'sampleSize': 1907,
  'name': 'annual_inc',
  'baselineSampleSize': 6474,
  'driftScore': 14.165824381181023}]

#### 4. model replacement

In [29]:
# Retrain on the table-based data source and swap the deployed model,
# but only when at least one feature was flagged as drifted.
if len(drift_results) > 0:
    # create a project based on the data source
    data = {'projectName': f'{project_name}_table',
            'dataSourceId': data_src_table_id,
            'user': creds['db_user'],
            'password': creds['db_pass']}

    project_tbl_resp = dr_rest_call('projects', requests.post, payload=data)

    project_tbl_id = wait_for_proj_id(project_tbl_resp)

    # find the project through python API
    # (fetched directly by id instead of listing and filtering every project)
    project_tbl = dr.Project.get(project_tbl_id)

    # set target and run autopilot
    project_tbl.set_target(target=target,
                           mode=dr.enums.AUTOPILOT_MODE.QUICK,
                           metric=metric,
                           worker_count=-1)

    project_tbl.wait_for_autopilot()

    model_tbl = dr.ModelRecommendation.get(project_tbl.id).get_model()

    # recommended models results
    print('AUC:', model_tbl.metrics['AUC']['crossValidation'])
    print('LogLoss:', model_tbl.metrics['LogLoss']['crossValidation'])

    status, message, checks = deployment.validate_replacement_model(new_model_id=model_tbl.id)
    print(status)

    # Do not attempt the replacement when validation failed; per the client
    # documentation only 'passing' and 'warning' allow a replacement.
    if status == 'failing':
        print(checks)
        raise RuntimeError(f'Model {model_tbl.id} cannot replace the deployed model: {message}')

    print('current model: ', deployment.model['id'], deployment.model['type'])

    deployment.replace_model(model_tbl.id, dr.enums.MODEL_REPLACEMENT_REASON.DATA_DRIFT)
    print('replaced model:', deployment.model['id'], deployment.model['type'])

AUC: 0.6976519999999999
LogLoss: 0.35642199999999996
passing
current model:  *******f6 Elastic-Net Classifier (L2 / Binomial Deviance)
replaced model: *******70 Elastic-Net Classifier (L2 / Binomial Deviance)
