In [2]:
import datetime
import os
from pathlib import Path

import pandas as pd
from alibi_detect.cd import TabularDrift
from alibi_detect.saving import save_detector
from joblib import load

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Load model 

In [3]:
# The project root is taken to be the parent of the current working directory.
proj_path = Path.cwd().parent

# Load the serialized classifier pipeline from <project root>/models.
model_path = proj_path.joinpath('models', 'clf-model.joblib')
model = load(model_path)

### Load train and test data

In [4]:
# Both pickled feature frames live in <project root>/data/processed.
processed_dir = proj_path/'data'/'processed'
X_train = pd.read_pickle(processed_dir/'X_train.pkl')
X_test = pd.read_pickle(processed_dir/'X_test.pkl')

In [5]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
3197,684,33,4,140700.61,1,1,0,103557.93
1915,725,44,10,0.0,1,0,1,93777.61
1298,614,19,5,97445.49,2,1,0,122823.34
426,515,65,7,92113.61,1,1,1,142548.33
831,599,28,4,126833.79,2,1,0,60843.09


In [6]:
# Feature names, in training-column order; reused to select the same columns from new data.
feat_cols = list(X_train.columns)
feat_cols

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

### Load data from new geography (Germany) that the model has never seen

In [7]:
# Raw records for the new geography, read from <project root>/data/more_data.
germany_csv = proj_path.joinpath('data', 'more_data', 'Churn_Modelling_Germany.csv')
df_germany = pd.read_csv(germany_csv)
df_germany.shape

(2509, 13)

In [8]:
# Keep only the columns the model was trained on, in the same order as X_train.
X_germany = df_germany[feat_cols]

### Train drift detection model

In [9]:
# The drift detector must see the data processed exactly as it was during training,
# so reuse the fitted pipeline without its last step (every step except the final one).
# NOTE(review): presumably the last step is the classifier — confirm against the training code.
preprocessor = model[:-1]

In [10]:
# Map the index of every categorical column to None so the detector infers the
# category values from the reference data; those columns are then compared with
# a Chi-Squared test, all remaining columns with a Kolmogorov-Smirnov test.
#
# The detector runs `preprocess_fn` before testing, so the indices must refer to
# the *transformed* columns. The raw names in `feat_cols` never start with
# 'cat__', which would always leave this mapping empty.
# NOTE(review): assumes every preprocessing step implements
# get_feature_names_out() (scikit-learn >= 1.0) and that categorical outputs are
# prefixed with 'cat__' — confirm against the training pipeline.
transformed_cols = preprocessor.get_feature_names_out()
categories_per_feature = {i: None for i, k in enumerate(transformed_cols) if k.startswith('cat__')}

# X_train is the reference distribution that later batches are compared against.
cd = TabularDrift(X_train,
                  p_val=.05,  # significance level, before the multiple-testing correction
                  preprocess_fn=preprocessor.transform,
                  categories_per_feature=categories_per_feature)

### Will there be drift if data was unseen by the model (test data), but it comes from the same geographies (France and Spain)?

In [11]:
# 'is_drift' is 0 or 1, so it doubles as an index into the answer labels.
labels = ['No!', 'Yes!']
preds = cd.predict(X_test)
drift_flag = preds['data']['is_drift']
print('Drift? {}'.format(labels[drift_flag]))

Drift? No!


In [14]:
# Show the full prediction result for the test set (drift flag, distances, p-values, threshold, metadata).
preds

{'data': {'is_drift': 0,
  'distance': array([2.0789672e-02, 2.0576468e-02, 1.0771311e-02, 1.9697266e-02,
         1.3676674e-02, 2.1381967e-02, 1.4028044e-05, 2.9303135e-02],
        dtype=float32),
  'p_val': array([0.6699497 , 0.682378  , 0.9988385 , 0.73309815, 0.9761697 ,
         0.6353601 , 1.        , 0.24971026], dtype=float32),
  'threshold': 0.00625},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.10.4',
  'detector_type': 'drift'}}

In [15]:
# Per-feature p-values for the test set.
preds['data']['p_val']

array([0.6699497 , 0.682378  , 0.9988385 , 0.73309815, 0.9761697 ,
       0.6353601 , 1.        , 0.24971026], dtype=float32)

In [16]:
# Column names of the training frame, for reading the per-feature arrays above.
# NOTE(review): the arrays are in training-column order only if the preprocessor keeps column order — confirm.
X_train.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

### Will there be drift if data comes from a different geography (Germany) than what the model was trained on?

In [17]:
# Same check as for the test set, now on the Germany data; 'is_drift' (0 or 1) indexes the labels.
labels = ['No!', 'Yes!']
preds = cd.predict(X_germany)
drift_flag = preds['data']['is_drift']
print('Drift? {}'.format(labels[drift_flag]))

Drift? Yes!


In [18]:
# Show the full prediction result for the Germany data (drift flag, distances, p-values, threshold, metadata).
preds

{'data': {'is_drift': 1,
  'distance': array([0.02026084, 0.06799088, 0.0161218 , 0.48564753, 0.0363293 ,
         0.01539897, 0.02361871, 0.02235417], dtype=float32),
  'p_val': array([4.5646867e-01, 1.4993016e-07, 7.4128473e-01, 0.0000000e+00,
         1.8338753e-02, 7.8984088e-01, 2.7316055e-01, 3.3505112e-01],
        dtype=float32),
  'threshold': 0.00625},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.10.4',
  'detector_type': 'drift'}}

In [19]:
# Keep the per-feature p-values of the Germany check for the summary table below.
p_val = preds['data']['p_val']
p_val

array([4.5646867e-01, 1.4993016e-07, 7.4128473e-01, 0.0000000e+00,
       1.8338753e-02, 7.8984088e-01, 2.7316055e-01, 3.3505112e-01],
      dtype=float32)

In [20]:
# Build a single-row table holding the time of the check and one p-value per feature.
# `datetime` is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.
now = datetime.datetime.now()  # naive local timestamp of this drift check

# pandas raises if the number of p-values does not match the number of column names.
# NOTE(review): labelling the p-values with feat_cols assumes the preprocessor keeps
# the original column order and count — confirm.
df_p_val = pd.DataFrame([[now] + p_val.tolist()], columns=['time'] + feat_cols)
df_p_val

Unnamed: 0,time,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,2022-11-29 17:51:50.427636,0.456469,1.499302e-07,0.741285,0.0,0.018339,0.789841,0.273161,0.335051


### Save drift detector

In [21]:
# Persist the fitted detector next to the classifier, under <project root>/models.
detector_path = proj_path.joinpath('models', 'drift_detector')
save_detector(cd, detector_path)

Directory /workspace/open-source-mlops-e2e/models/drift_detector does not exist and is now created.
Directory /workspace/open-source-mlops-e2e/models/drift_detector/preprocess_fn does not exist and is now created.
