In [1]:
import datetime
import os
from pathlib import Path

import pandas as pd
from alibi_detect.cd import TabularDrift
from alibi_detect.saving import save_detector
from joblib import load

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
# Project root is the parent of the notebook's working directory; the
# trained classifier artifact lives under models/.
# NOTE(review): joblib `load` unpickles the file, so only load trusted artifacts.
proj_path = Path.cwd().parent
model_path = proj_path.joinpath('models', 'clf-model.joblib')
model = load(model_path)

In [8]:
# Load the processed test and training feature frames (test first, then
# train), then preview the first training rows.
# NOTE(review): read_pickle executes pickled code; only load trusted files.
X_test, X_train = (
    pd.read_pickle(proj_path / 'data' / 'processed' / filename)
    for filename in ('X_test.pkl', 'X_train.pkl')
)

X_train.head()


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2517,555,40,10,43028.77,1,1,0,170514.21
6487,608,38,9,102406.76,1,0,1,57600.66
2178,657,39,4,80293.81,1,1,0,97192.76
1414,586,35,7,0.0,2,1,0,70760.69
6228,407,37,1,0.0,1,1,1,49161.12


In [9]:
# Names of the raw feature columns, in training order.
feat_cols = list(X_train.columns)

In [10]:
# Read the Churn_Modelling_Germany.csv extract and report its dimensions.
df_germany = pd.read_csv(
    proj_path.joinpath('data', 'more_data', 'Churn_Modelling_Germany.csv')
)
df_germany.shape

(2509, 13)

In [11]:
# Keep only the columns present in the training features, in the same order.
X_germany = df_germany.loc[:, feat_cols]

In [12]:
# Everything in the loaded model except its last step; its `transform` is
# what the drift detector uses as the preprocessing function.
# NOTE(review): presumably a scikit-learn Pipeline whose final step is the
# classifier — confirm against the training code.
preprocessor = model[:-1]

In [15]:
# Features to treat as categorical, keyed by their column index in the
# *preprocessed* data (the detector runs `preprocess_fn` before testing).
# A value of None lets the detector infer the categories from the
# reference data.
# The names must come from the preprocessor's output: the raw names in
# `feat_cols` (CreditScore, Age, ...) carry no "cat__" prefix, so filtering
# them always produced an empty mapping and no feature was ever treated as
# categorical.
transformed_cols = preprocessor.get_feature_names_out()
categories_per_feature = {
    idx: None
    for idx, name in enumerate(transformed_cols)
    if name.startswith("cat__")
}

# Reference data is the training set; 5% significance level.
cd = TabularDrift(X_train,
                  p_val=.05,
                  preprocess_fn=preprocessor.transform,
                  categories_per_feature=categories_per_feature)

In [18]:
# Check the test split against the training reference and print the
# detector's overall verdict.
labels = ['No!', 'Yes!']
preds = cd.predict(X_test)
drift_flag = preds['data']['is_drift']  # 0 or 1, used as an index into labels
print('Drift? {}'.format(labels[drift_flag]))

Drift? No!


In [20]:
# Show the full result for the X_test check: drift decision, per-feature
# distances and p-values, the threshold, and detector metadata.
preds

{'data': {'is_drift': 0,
  'distance': array([0.02295547, 0.01625597, 0.01873707, 0.02723647, 0.00164913,
         0.01795582, 0.00275539, 0.02602135], dtype=float32),
  'p_val': array([0.6866387 , 0.9588626 , 0.88434815, 0.4698443 , 1.        ,
         0.9122476 , 1.        , 0.52881515], dtype=float32),
  'threshold': 0.00625},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.12.0',
  'detector_type': 'drift'}}

In [21]:
# Just the p-values from the X_test check (one entry per feature).
preds['data']['p_val']

array([0.6866387 , 0.9588626 , 0.88434815, 0.4698443 , 1.        ,
       0.9122476 , 1.        , 0.52881515], dtype=float32)

In [22]:
# Run the same check on the Germany data; note this overwrites `preds`
# from the X_test check.
labels = ['No!', 'Yes!']
preds = cd.predict(X_germany)
drift_flag = preds['data']['is_drift']  # 0 or 1, used as an index into labels
print('Drift? {}'.format(labels[drift_flag]))

Drift? Yes!


In [23]:
# Show the full result for the X_germany check: drift decision, per-feature
# distances and p-values, the threshold, and detector metadata.
preds

{'data': {'is_drift': 1,
  'distance': array([0.01917945, 0.07051123, 0.01686763, 0.4850008 , 0.03931354,
         0.01381451, 0.02402934, 0.02331847], dtype=float32),
  'p_val': array([5.1577330e-01, 3.1598873e-08, 6.7854744e-01, 0.0000000e+00,
         7.4593052e-03, 8.7728035e-01, 2.4575511e-01, 2.7731997e-01],
        dtype=float32),
  'threshold': 0.00625},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.12.0',
  'detector_type': 'drift'}}

In [25]:
# Record the p-values of the most recent `cd.predict` result held in
# `preds` as a single timestamped row.
# (`datetime` is imported in the notebook's top import cell.)
now = datetime.datetime.now()  # naive local time

p_val = preds['data']['p_val']

# One column per feature after the timestamp. This assumes the detector
# returns exactly one p-value per entry of feat_cols — TODO confirm if the
# preprocessing ever changes the number of columns.
df_p_val = pd.DataFrame([[now] + p_val.tolist()], columns=['time'] + feat_cols)
df_p_val

Unnamed: 0,time,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,2024-09-30 14:51:20.819774,0.515773,3.159887e-08,0.678547,0.0,0.007459,0.87728,0.245755,0.27732


In [26]:
# Persist the detector under models/, alongside the classifier artifact.
detector_path = proj_path.joinpath('models', 'drift_detector')
save_detector(cd, detector_path)

Directory c:\Users\claus\Desktop\PSAIL\courses\oreilly\open-source-mlops-e2e\models\drift_detector does not exist and is now created.
Directory c:\Users\claus\Desktop\PSAIL\courses\oreilly\open-source-mlops-e2e\models\drift_detector\preprocess_fn does not exist and is now created.
