In [1]:
from river.datasets import synth
import pandas as pd
from datasets_metadata import DATASETS_METADATA, SEED, BASE_CONCEPT_SIZE, NEW_CONCEPT_SIZE

# Abrupt and recurring drift

In [2]:
# Write one CSV per entry of DATASETS_METADATA. Each entry lists its concepts
# in order; every concept is sampled from a freshly built generator and its
# rows are tagged with the label (`target`) and the concept's position in
# the list (`concept`).
for meta in DATASETS_METADATA:
    print(f'Generating {meta["file_name"]}...')
    rows = []
    for concept_id, concept_params in enumerate(meta['params']):
        generator_kwargs = concept_params['config']
        # The generator class is looked up on river's `synth` module by name.
        generator_cls = getattr(synth, meta['dataset_name'])
        stream = generator_cls(**generator_kwargs)
        rows.extend(
            {**features, 'target': label, 'concept': concept_id}
            for features, label in stream.take(concept_params['n_inst'])
        )
    pd.DataFrame(rows).to_csv(f'{meta["file_name"]}.csv', index=False)

print('DONE')

Generating abrupt_sine_balanced...
Generating abrupt_sine_unbalanced...
Generating abrupt_sine_balanced_noise...
Generating abrupt_sine_unbalanced_noise...
Generating abrupt_recurring_sine_balanced...
Generating abrupt_sea...
Generating abrupt_sea_noise...
Generating abrupt_recurring_sea...
Generating abrupt_stagger_balanced...
Generating abrupt_stagger_unbalanced...
Generating abrupt_recurring_stagger_balanced...
Generating abrupt_mixed_balanced...
Generating abrupt_mixed_unbalanced...
Generating abrupt_recurring_mixed_balanced...
DONE


# Gradual drift

### Friedman drift
https://riverml.xyz/0.15.0/api/datasets/synth/FriedmanDrift/

In [3]:
# Length of the transition window handed to FriedmanDrift. Cast to int so it
# is derived the same way as the reduced concept sizes in the Agrawal cells
# (int(.../10)); river documents `transition_window` as an integer.
TRANSITION_WINDOW = int(NEW_CONCEPT_SIZE/10)

print('Generating gradual_friedman...')
# Two drift positions, after NEW_CONCEPT_SIZE and 2*NEW_CONCEPT_SIZE instances,
# split the 3*NEW_CONCEPT_SIZE-instance stream into three equal segments.
dataset_gen = synth.FriedmanDrift(
    drift_type='gsg',
    position=(NEW_CONCEPT_SIZE, 2*NEW_CONCEPT_SIZE),
    seed=SEED,
    transition_window=TRANSITION_WINDOW,
)

dataset = []
concept_counter = 0  # instances emitted so far inside the current segment
concept = 0          # segment label written to the `concept` column

for x, y in dataset_gen.take(3*NEW_CONCEPT_SIZE):
    x['target'] = y
    x['concept'] = concept
    dataset.append(x)

    concept_counter += 1
    # Advance the label every NEW_CONCEPT_SIZE instances, mirroring `position`.
    # NOTE(review): the label flips at the drift position itself; instances
    # inside the transition window are still tagged with the new label.
    if concept_counter == NEW_CONCEPT_SIZE:
        concept_counter = 0
        concept += 1

pd.DataFrame(dataset).to_csv('gradual_friedman.csv', index=False)
# Cell output: `concept` is incremented once more after the last segment
# completes, so this shows the number of segments (3), not the last label (2).
concept

Generating gradual_friedman...


3

### Agrawal drift
https://riverml.xyz/0.15.0/api/datasets/synth/Agrawal/

In [6]:
def generate_agrawal(balance_classes: bool=True, perturbation: float=0):
    """Write an Agrawal stream that steps through classification functions 0-9.

    Each concept id is used directly as the generator's
    ``classification_function``. Concept 0 contributes ``BASE_CONCEPT_SIZE``
    instances; every later concept contributes ``int(BASE_CONCEPT_SIZE/10)``.
    The segments are concatenated in order and saved to
    ``gradual_agrawal_[un]balanced[_with_noise].csv`` with two extra columns,
    ``target`` (the label) and ``concept`` (the concept id).

    Args:
        balance_classes: Forwarded to ``synth.Agrawal``; also selects the
            ``balanced`` / ``unbalanced`` part of the file name.
        perturbation: Forwarded to ``synth.Agrawal``; any value above 0 adds
            the ``_with_noise`` suffix to the file name.

    NOTE(review): every segment's generator is built with the same ``SEED``.
    """
    noise = '_with_noise' if perturbation > 0 else ''
    filename = f"gradual_agrawal_{'un' if not balance_classes else ''}balanced{noise}"
    print(f'Generating {filename}.csv...')

    rows = []
    for concept in range(10):
        stream = synth.Agrawal(
            classification_function=concept,
            seed=SEED,
            balance_classes=balance_classes,
            perturbation=perturbation,
        )
        # The first concept is larger because it also feeds the offline phase.
        if concept == 0:
            n_instances = BASE_CONCEPT_SIZE
        else:
            n_instances = int(BASE_CONCEPT_SIZE/10)
        rows.extend(
            {**features, 'target': label, 'concept': concept}
            for features, label in stream.take(n_instances)
        )
    pd.DataFrame(rows).to_csv(f'{filename}.csv', index=False)

In [7]:
# Produce all four variants: noise-free first, then perturbation=0.5, each in
# a balanced and an unbalanced version.
for perturbation in (0, 0.5):
    for balance_classes in (True, False):
        generate_agrawal(balance_classes=balance_classes, perturbation=perturbation)

Generating gradual_agrawal_balanced.csv...
Generating gradual_agrawal_unbalanced.csv...
Generating gradual_agrawal_balanced_with_noise.csv...
Generating gradual_agrawal_unbalanced_with_noise.csv...


In [8]:
def generate_agrawal_with_noise(balance_classes: bool=True, classification_function: int=0, perturbation=(0, 0.5, 0.1, 1, 0.3, 0)):
    """Write an Agrawal stream whose perturbation level changes per concept.

    The classification function stays fixed; each entry of ``perturbation``
    defines one concept, labelled by its position in the sequence. The first
    concept contributes ``BASE_CONCEPT_SIZE`` instances and every later one
    ``int(BASE_CONCEPT_SIZE/10)``. The result is saved to
    ``gradual_agrawal_[un]balanced_increasing_noise.csv`` with two extra
    columns, ``target`` (the label) and ``concept`` (the concept index).

    Args:
        balance_classes: Forwarded to ``synth.Agrawal``; also selects the
            ``balanced`` / ``unbalanced`` part of the file name.
        classification_function: Forwarded to ``synth.Agrawal`` for every
            concept.
        perturbation: Sequence of perturbation values, one per concept.

    NOTE(review): the file name says "increasing_noise", but the default
    sequence (0, 0.5, 0.1, 1, 0.3, 0) is not monotonically increasing.
    """
    filename = f"gradual_agrawal_{'un' if not balance_classes else ''}balanced_increasing_noise"
    print(f'Generating {filename}.csv...')

    rows = []
    for concept, noise_level in enumerate(perturbation):
        stream = synth.Agrawal(
            classification_function=classification_function,
            seed=SEED,
            balance_classes=balance_classes,
            perturbation=noise_level,
        )
        # The first concept is larger because it also feeds the offline phase.
        if concept == 0:
            n_instances = BASE_CONCEPT_SIZE
        else:
            n_instances = int(BASE_CONCEPT_SIZE/10)
        rows.extend(
            {**features, 'target': label, 'concept': concept}
            for features, label in stream.take(n_instances)
        )
    pd.DataFrame(rows).to_csv(f'{filename}.csv', index=False)

In [9]:
# Balanced variant first, then unbalanced; all other arguments keep their defaults.
for balance_classes in (True, False):
    generate_agrawal_with_noise(balance_classes=balance_classes)

Generating gradual_agrawal_balanced_increasing_noise.csv...
Generating gradual_agrawal_unbalanced_increasing_noise.csv...
