# Configuration

In [1]:
# Number of samples in a synthetic dataset.
SYN_DATASET_SIZE = 1_000_000

# Fractions of the dataset used for the initial training batch and for evaluation.
BATCH_SIZE = 0.3
EVAL_SIZE = 0.7

# Absolute sample counts derived from the fractions above.
N_BATCH = int(BATCH_SIZE * SYN_DATASET_SIZE)
N_EVAL = int(EVAL_SIZE * SYN_DATASET_SIZE)

# Concept-drift settings: the width is 20% of the evaluation window and the
# position is the middle of the evaluation window (offset by the training batch).
CD_WIDTH = int(0.2 * N_EVAL)
CD_POSITION = N_BATCH + N_EVAL // 2

# Model Training

In [8]:
from supervised.automl import AutoML

def train_model(X, y, model_name):
    """Fit an AutoML model on ``(X, y)``.

    The AutoML instance is created with ``results_path`` set to
    ``../output/models/<model_name>``. The shapes of ``X`` and ``y`` are
    printed before fitting.

    Args:
        X: feature table; must expose ``.shape``.
        y: target values; must expose ``.shape``.
        model_name: name of the sub-directory used as the AutoML results path.

    Returns:
        None.
    """
    print(f'Dataset size: {X.shape}, {y.shape}')

    results_dir = f'../output/models/{model_name}'
    AutoML(results_path=results_dir).fit(X, y)

In [9]:
from pathlib import Path

import numpy as np
import pandas as pd
from supervised.automl import AutoML

def evaluate_model(X, y, model_name, model_type):
    """Score ``X`` with a stored AutoML model and save the result as Feather.

    An ``AutoML`` instance is created on ``../output/models/<model_name>`` and
    used for ``predict_proba`` without fitting, so a model is expected to have
    been trained into that path beforehand (e.g. by ``train_model``).

    The saved table has the columns ``timestamp`` (the former index of the
    frame, i.e. the index of ``y`` when it is a pandas Series),
    ``prob_negative``, ``label_negative``, ``prob_positive`` and
    ``label_positive``. It is written to
    ``../data/<model_type>/<model_name lower-cased>.feather``.

    Args:
        X: feature table passed to ``AutoML.predict_proba``.
        y: binary labels (0/1), one per row of ``X``.
        model_name: name of the model directory; lower-cased for the file name.
        model_type: sub-directory of ``../data`` that receives the output file.

    Raises:
        ValueError: if ``predict_proba`` does not return an ``(n, 2)`` array.
    """
    model_path = f'../output/models/{model_name}'
    output_file = f'../data/{model_type}/{model_name.lower()}.feather'

    automl = AutoML(results_path=model_path)

    # Slice the two class columns directly instead of zip(*rows): no per-row
    # argument unpacking, and a clear error when the shape is unexpected.
    proba = np.asarray(automl.predict_proba(X))
    if proba.ndim != 2 or proba.shape[1] != 2:
        raise ValueError(
            f'Expected class probabilities of shape (n_samples, 2), got {proba.shape}'
        )

    output_df = pd.DataFrame({
        'prob_negative': proba[:, 0],
        'label_negative': (y == 0).astype(int),
        'prob_positive': proba[:, 1],
        'label_positive': y,
    })
    output_df = output_df.reset_index().rename(columns={'index': 'timestamp'})

    # Create the target directory up front so the write cannot fail on a
    # missing folder after the (potentially slow) prediction step.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    output_df.to_feather(output_file)

# Custom Datasets

In [21]:
import gdown

# Google Drive links of the custom application datasets, keyed by the name
# used for the local Feather file.
apps = {
    'application_01': 'https://drive.google.com/uc?id=1ml3f-n9uWyTkLzRhtuEvDTBUl9SntFzY',
    'application_02': 'https://drive.google.com/uc?id=1FMk7VyjG4SRfS2wTIWle5WEEFRXqAf7J',
    'application_03': 'https://drive.google.com/uc?id=1gK7HtaB4AMKe1_Iv-vOmDu9Rq5aaY16J',
    'application_04': 'https://drive.google.com/uc?id=1TmgRmw4C5gr982RwbY-6v5SzVFNXmhH9',
}

# Fetch every dataset into ../data/custom/<name>.feather, showing progress.
for app_name, source_url in apps.items():
    target_path = f'../data/custom/{app_name}.feather'
    gdown.download(source_url, target_path, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1ml3f-n9uWyTkLzRhtuEvDTBUl9SntFzY
To: /home/eferrj/work/high-confidence/data/custom/application_01.feather
100%|██████████| 2.87M/2.87M [00:00<00:00, 7.79MB/s]
Downloading...
From: https://drive.google.com/uc?id=1FMk7VyjG4SRfS2wTIWle5WEEFRXqAf7J
To: /home/eferrj/work/high-confidence/data/custom/application_02.feather
100%|██████████| 50.3M/50.3M [00:08<00:00, 5.63MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gK7HtaB4AMKe1_Iv-vOmDu9Rq5aaY16J
To: /home/eferrj/work/high-confidence/data/custom/application_03.feather
100%|██████████| 6.03M/6.03M [00:00<00:00, 9.67MB/s]
Downloading...
From: https://drive.google.com/uc?id=1TmgRmw4C5gr982RwbY-6v5SzVFNXmhH9
To: /home/eferrj/work/high-confidence/data/custom/application_04.feather
100%|██████████| 17.9M/17.9M [00:02<00:00, 6.36MB/s]


# Synthetic Datasets

## SEA Generator

In [4]:
from river import synth
import pandas as pd


# Concept-drift stream built from SEA variant 0 (base stream) and SEA
# variant 1 (drift stream), using the drift position/width from the
# configuration cell.
gen = synth.ConceptDriftStream(
    stream=synth.SEA(seed=23, variant=0),
    drift_stream=synth.SEA(seed=23, variant=1),
    seed=23,
    position=CD_POSITION,
    width=CD_WIDTH,
)

dataset = gen.take(SYN_DATASET_SIZE)

# Accumulate the stream column-wise. 'y' is inserted first, so it becomes
# column 0 of the resulting frame and the features follow.
data_dict = {'y': []}
for X, y in dataset:
    for feature, value in X.items():
        data_dict.setdefault(feature, []).append(value)
    data_dict['y'].append(int(y))

df = pd.DataFrame(data_dict)

# Ordered split: the first N_BATCH rows train the model, the rest evaluate it.
df_train, df_eval = df.iloc[:N_BATCH, :], df.iloc[N_BATCH:, :]

target_col = df.columns[0]
feature_cols = df.columns[1:]
X_train, y_train = df_train[feature_cols], df_train[target_col]
X_eval, y_eval = df_eval[feature_cols], df_eval[target_col]

train_model(X_train, y_train, 'SEA_V0_V1')
evaluate_model(X_eval, y_eval, 'SEA_V0_V1', 'synthetic')

Dataset size: (300000, 3), (300000,)
Linear algorithm was disabled.
AutoML directory: ../output/models/SEA_V0_V1
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.62696 trained in 0.61 seconds
2_DecisionTree logloss 0.181799 trained in 24.37 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost logloss 0.003343 trained in 34.37 seconds
4_Default_NeuralNetwork logloss 0.001651 trained in 68.2 seconds
5_Default_RandomForest logloss 0.085254 trained in 13.03 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.001651 trained in 1.3 seconds
AutoML fit time: 149.81 seconds
AutoML best model: 4_Default_NeuralNetwork


X has feature names, but StandardScaler was fitted without feature names


## Mixed Generator

In [5]:
from river import synth
import pandas as pd


gen_01 = synth.Mixed(seed=23, classification_function=0)
gen_02 = synth.Mixed(seed=23, classification_function=1)

CONCEPT_INTERVAL = 100000

# Ordered schedule of (generator, number of samples). The first segment is
# the training batch; the remaining ones alternate between the two
# classification functions to form the evaluation part.
# With the current configuration this yields N_BATCH + 6 * CONCEPT_INTERVAL
# samples in total, which is not tied to SYN_DATASET_SIZE.
segments = [
    (gen_01, N_BATCH),
    (gen_01, CONCEPT_INTERVAL * 2),
    (gen_02, CONCEPT_INTERVAL),
    (gen_01, CONCEPT_INTERVAL),
    (gen_02, CONCEPT_INTERVAL),
    (gen_01, CONCEPT_INTERVAL),
]

# NOTE(review): every segment calls `take` on the generator again, exactly as
# the original copy-pasted loops did. If `take` restarts the seeded generator
# on each call (verify against the installed river version), the gen_01
# evaluation segments repeat samples already used for training.

# Accumulate all segments column-wise. 'y' is inserted first, so it becomes
# column 0 of the resulting frame and the features follow.
data_dict = {'y': []}
for generator, n_samples in segments:
    dataset = generator.take(n_samples)
    for X, y in dataset:
        for feature, value in X.items():
            data_dict.setdefault(feature, []).append(value)
        data_dict['y'].append(int(y))

df = pd.DataFrame(data_dict)

# Ordered split: the first N_BATCH rows train the model, the rest evaluate it.
df_train, df_eval = df.iloc[:N_BATCH, :], df.iloc[N_BATCH:, :]
X_train, y_train = df_train[df_train.columns[1:]], df_train[df_train.columns[0]]
X_eval, y_eval = df_eval[df_eval.columns[1:]], df_eval[df_eval.columns[0]]

train_model(X_train, y_train, 'mixed_recurrent')
evaluate_model(X_eval, y_eval, 'mixed_recurrent', 'synthetic')

Dataset size: (300000, 4), (300000,)
Linear algorithm was disabled.
AutoML directory: ../output/models/mixed_recurrent
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.691259 trained in 0.83 seconds
Exception while producing SHAP explanations. Passing parameters norm and vmin/vmax simultaneously is not supported. Please pass vmin/vmax directly to the norm when creating it.
Continuing ...
2_DecisionTree logloss 0.228663 trained in 19.65 seconds
* Step default_algorithms will try to check up to 3 models
Exception while producing SHAP explanations. Passing parameters norm and vmin/vmax simultaneously is not supported. Please pass vmin/vmax directly to the norm when creating it.
Continuin

X has feature names, but StandardScaler was fitted without feature names


# Real Datasets

## Elec2

In [6]:
from river.datasets import Elec2
import pandas as pd

elec2 = Elec2()
if not elec2.is_downloaded:
    elec2.download()

dataset = elec2.take(45_312)

# Accumulate the stream column-wise. 'y' is inserted first, so it becomes
# column 0 of the resulting frame and the features follow.
data_dict = {'y': []}
for X, y in dataset:
    for feature, value in X.items():
        data_dict.setdefault(feature, []).append(value)
    data_dict['y'].append(int(y))

df = pd.DataFrame(data_dict)

# Keep the split size in a cell-local name: rebinding the global N_BATCH here
# would silently change the split used by the synthetic-dataset cells if they
# are re-run afterwards.
n_train = int(df.shape[0] * BATCH_SIZE)

df_train, df_eval = df.iloc[:n_train, :], df.iloc[n_train:, :]
X_train, y_train = df_train[df_train.columns[1:]], df_train[df_train.columns[0]]
X_eval, y_eval = df_eval[df_eval.columns[1:]], df_eval[df_eval.columns[0]]

train_model(X_train, y_train, 'elec2')
evaluate_model(X_eval, y_eval, 'elec2', 'real')

Downloading https://maxhalford.github.io/files/datasets/electricity.zip (697.72 KB)
Uncompressing into /home/eferrj/river_data/Elec2
Dataset size: (13593, 8), (13593,)
Linear algorithm was disabled.
AutoML directory: ../output/models/elec2
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.686983 trained in 0.26 seconds
2_DecisionTree logloss 0.375937 trained in 3.57 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost logloss 0.116597 trained in 12.7 seconds
4_Default_NeuralNetwork logloss 0.352825 trained in 4.81 seconds
5_Default_RandomForest logloss 0.347824 trained in 2.59 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.116

## Airlines (Flight Delays)

In [6]:
import requests

url = 'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/airlines.csv'

# The timeout keeps a dead connection from hanging the kernel indefinitely.
r = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on HTTP errors instead of saving an error page as airlines.csv.
r.raise_for_status()

with open('../datasets/airlines.csv', 'wb') as f:
    f.write(r.content)

In [11]:
from river.datasets import AirlinePassengers
import pandas as pd


# Load the downloaded CSV and expose the 'Delay' column as the target 'y'.
df = pd.read_csv('../datasets/airlines.csv').rename(columns={'Delay': 'y'})

# Keep the split size in a cell-local name: rebinding the global N_BATCH here
# would silently change the split used by the synthetic-dataset cells if they
# are re-run afterwards.
n_train = int(df.shape[0] * BATCH_SIZE)

df_train, df_eval = df.iloc[:n_train, :], df.iloc[n_train:, :]

# Every column except the target is used as a feature.
feature_cols = [c for c in df.columns if c != 'y']
X_train, y_train = df_train[feature_cols], df_train['y']
X_eval, y_eval = df_eval[feature_cols], df_eval['y']

train_model(X_train, y_train, 'airlines')
evaluate_model(X_eval, y_eval, 'airlines', 'real')

Dataset size: (161814, 7), (161814,)
Linear algorithm was disabled.
AutoML directory: ../output/models/airlines
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.640783 trained in 0.45 seconds
2_DecisionTree logloss 0.603446 trained in 17.45 seconds
* Step default_algorithms will try to check up to 3 models
Exception while producing SHAP explanations. Passing parameters norm and vmin/vmax simultaneously is not supported. Please pass vmin/vmax directly to the norm when creating it.
Continuing ...
3_Default_Xgboost logloss 0.55983 trained in 22.01 seconds
4_Default_NeuralNetwork logloss 0.602219 trained in 71.06 seconds
5_Default_RandomForest logloss 0.596631 trained in 14.48 seconds
* S