In [85]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from sklearn.metrics import confusion_matrix, classification_report
import os

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [13]:
CLF_METRICS = ["kappa"] #, "precision", "recall", "f1-score"]
CLF_MODELS = ["RandomForestClassifier"] #, "DecisionTreeClassifier", "LogisticRegression", "SVC"]

REG_METRICS = ["r2"] # , "mse", "std"]
REG_MODELS = ["RandomForestRegressor"] #, "DecisionTreeRegressor"]

DATASETS = [file for file in os.listdir("../../datasets/synthetic/") if "csv" in file]
REG_DATASETS = ["gradual_friedman.csv"]
CLF_DATASETS = list(set(DATASETS).difference(set(REG_DATASETS)))

In [4]:
def get_concept(original_idx: pd.Series, dataset: str, concept_type="last"):
    """Read the original dataframe to recover the context

    Args:
        original_idx (pd.Series): maps the metabase indexes to the original base indexes (before windowing)
        dataset (str): dataset name to be loaded
        concept_type (str, optional): how to set the window concept, it can be:
            - "last": The last concept of the batch
            - "most_frequent": The most frequent concept of the batch
    """
    filename_concept = f'../../datasets/synthetic/{dataset}'
    df_concept = pd.read_csv(filename_concept)['concept']

    concepts = []
    for idx in original_idx:
        bounds = [int(bound) for bound in idx.split(':')]
        concept_batch = df_concept[bounds[0]:bounds[1]]
        
        if concept_type == "last":
            concepts.append(concept_batch.values[-1])
        else:
            most_frequent_concept = concept_batch.value_counts().idxmax()
            concepts.append(most_frequent_concept)
    return concepts

def load_data(dataset, model, concept_type="last"):
    filename_mtl = f'../metabases/basemodel: {model} - dataset: {dataset} - with_drift_metrics.csv'
    results = pd.read_csv(filename_mtl)
    concept = get_concept(results['original_idx'], dataset, concept_type)
    results['concept'] = concept
    return results



In [42]:
def remove_unused_cols(df: pd.DataFrame) -> pd.DataFrame:
    metrics = CLF_METRICS if "kappa" in df.columns else REG_METRICS
    cols_to_keep = [
        "concept",  # concept
        *metrics,  # target cols
        *[f"last_{col}" for col in metrics],  # baseline cols
        *[col for col in df.columns if "drift" in col],  # drift metrics cols
        *[col for col in df.columns if "predicted" in col],  # mtl cols
    ]
    return df[cols_to_keep]

def gen_drift_alerts(dataset: str, model: str, metric: str) -> pd.DataFrame:
    df = load_data(dataset, model)
    offline_df = df[df["data_type"] == "offline"]
    online_df = df[df["data_type"] == "online"]
    online_df = remove_unused_cols(online_df)

    expected_mean, expected_std = offline_df[metric].mean(), offline_df[metric].std()

    for sensitivity in range(1, 20):
        threshold = expected_mean - expected_std * sensitivity
        online_df[f"mtl_alert_{metric}_s={sensitivity}"] = (online_df[f"predicted_{metric}"] <= threshold).astype(int)
        online_df[f"baseline_alert_{metric}_s={sensitivity}"] = (online_df[f"last_{metric}"] <= threshold).astype(int)
    online_df["target_alert"] = (online_df["concept"] > 0).astype(int)
    return online_df


In [44]:
results = {}

for dataset in DATASETS:
    try:
        metric = "kappa" if dataset in CLF_DATASETS else "r2"
        model = "RandomForestClassifier" if dataset in CLF_DATASETS else "RandomForestRegressor"
        results[dataset] = gen_drift_alerts(dataset, model, metric)
    except:
        print(dataset)

sine_unbalanced.csv
sine_balanced_noise.csv
sine_balanced.csv
SEA.csv
SEA_noise.csv
Mixed_balanced.csv
Mixed_unbalanced.csv
sine_unbalanced_noise.csv
STAGGER_unbalanced.csv
STAGGER_balanced.csv


In [87]:
def get_perf_metrics(df: pd.DataFrame) -> pd.DataFrame:
    alert_cols = [c for c in df.columns if ("_drift_" in c or "_alert" in c)]
    results = []

    for col in alert_cols:
        sensitivity = col.split("s=")[1] if "s=" in col else None
        alert_type = "drift_metrics" if "_drift_" in col else col.split("_")[0]

        # Generate confusion matrix
        cm = confusion_matrix(df['target_alert'], df[col], labels = [0, 1]).ravel()
        results.append({
            'alert_col': col,
            'type': alert_type,
            'sensitivity': sensitivity,
            'True Negatives': cm[0], 
            'False Positives': cm[1], 
            'False Negatives': cm[2], 
            'True Positives': cm[3],
            **classification_report(df['target_alert'], df[col], labels = [0, 1], output_dict=True)['1']
        })
    return pd.DataFrame(results)

final_performances = {}
for dataset, df in results.items():
    print(dataset)
    df = results[dataset]
    final_performances[dataset] = get_perf_metrics(df)

gradual_agrawal_unbalanced_with_noise.csv
gradual_agrawal_unbalanced_increasing_noise.csv
gradual_agrawal_balanced.csv
gradual_agrawal_unbalanced.csv
gradual_agrawal_balanced_with_noise.csv
gradual_agrawal_balanced_increasing_noise.csv
gradual_friedman.csv


In [94]:
final_performances["gradual_friedman.csv"].sort_values(by="f1-score", ascending=False)

Unnamed: 0,alert_col,type,sensitivity,True Negatives,False Positives,False Negatives,True Positives,precision,recall,f1-score,support
41,target_alert,target,,118,0,0,272,1.0,1.0,1.0,272
19,mtl_alert_r2_s=9,mtl,9.0,113,5,4,268,0.981685,0.985294,0.983486,272
21,mtl_alert_r2_s=10,mtl,10.0,117,1,8,264,0.996226,0.970588,0.98324,272
17,mtl_alert_r2_s=8,mtl,8.0,109,9,4,268,0.967509,0.985294,0.976321,272
23,mtl_alert_r2_s=11,mtl,11.0,117,1,12,260,0.996169,0.955882,0.97561,272
25,mtl_alert_r2_s=12,mtl,12.0,118,0,13,259,1.0,0.952206,0.975518,272
29,mtl_alert_r2_s=14,mtl,14.0,118,0,14,258,1.0,0.948529,0.973585,272
27,mtl_alert_r2_s=13,mtl,13.0,118,0,14,258,1.0,0.948529,0.973585,272
9,mtl_alert_r2_s=4,mtl,4.0,102,16,0,272,0.944444,1.0,0.971429,272
11,mtl_alert_r2_s=5,mtl,5.0,102,16,0,272,0.944444,1.0,0.971429,272
