In [62]:
import os

import numpy as np
import openml
import pandas as pd
from river import datasets
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [103]:
# Shared preprocessing step: standardize every non-object/non-category column and
# one-hot encode the object/category columns. The column selectors are callables,
# so the same object can be re-fitted on frames with different schemas.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), make_column_selector(dtype_exclude=['object', 'category'])),
        ('cat', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=['object', 'category'])),
    ],
    # OneHotEncoder produces sparse output by default, and ColumnTransformer keeps
    # the stacked result sparse when its density is below the threshold (0.3 by
    # default). The cells that use this object wrap the result in
    # pd.DataFrame(...), which needs a dense array, so always return dense output.
    sparse_threshold=0,
)

In [36]:
# Export the OpenML-CC18 datasets as preprocessed CSV files.
# For each dataset: download, drop rows with missing values, shuffle, label-encode
# the target, run the shared `preprocessor` on the features, and write the result
# to ../data/CC18/<name>.csv. `shapes` collects the shape of every saved frame.

# Datasets excluded from the export (image datasets; presumably skipped for size).
SKIPPED_DATASETS = ('CIFAR_10', 'Devnagari-Script', 'Fashion-MNIST', 'mnist_784')
# Fixed seed so the row shuffle, and therefore the saved CSVs, are reproducible.
SHUFFLE_SEED = 42

catalog = openml.datasets.list_datasets(tag='OpenML-CC18')
# NOTE(review): this call has returned a dict keyed by dataset id; the warning it
# emits suggests the return type is changing (likely to a DataFrame) — confirm
# against the installed openml version. Both shapes are normalised to a list of
# per-dataset records with 'name' and 'did' fields.
if isinstance(catalog, pd.DataFrame):
    records = catalog.to_dict(orient='records')
else:
    records = list(catalog.values())

shapes = []
for i, item in enumerate(records):
    print(f"{i}:  {item['name']}")
    if item['name'] in SKIPPED_DATASETS:
        continue

    data = fetch_openml(data_id = int(item['did']))
    X = data['data']
    y = data['target']

    # Put features and target in one frame so dropna/shuffle keep rows aligned.
    df = pd.concat([X, y], axis = 1)
    df = df.rename(columns = {data['target_names'][0]: 'y_target_class'})
    df = df.dropna()
    df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

    # Nothing left after removing rows with missing values: skip this dataset.
    if df.shape[0] == 0:
        continue

    label_encoder = LabelEncoder()
    df['y_target_class'] = label_encoder.fit_transform(df['y_target_class'])

    X_transformed = preprocessor.fit_transform(df.drop(columns=['y_target_class']))
    # The transformer can return a scipy sparse matrix when the one-hot block is
    # large; pd.DataFrame does not expand that into columns, so densify first.
    if hasattr(X_transformed, 'toarray'):
        X_transformed = X_transformed.toarray()
    y_transformed = df['y_target_class']

    # Both pieces carry a fresh 0..n-1 index, so the column-wise concat lines up.
    df = pd.concat([pd.DataFrame(X_transformed), y_transformed], axis = 1)

    df.to_csv(f"../data/CC18/{item['name']}.csv", index = False)

    shapes.append(df.shape)

  for i, (key, item) in enumerate(openml.datasets.list_datasets(tag='OpenML-CC18').items()):


0:  kr-vs-kp
1:  letter
2:  balance-scale
3:  mfeat-factors
4:  mfeat-fourier
5:  breast-w
6:  mfeat-karhunen
7:  mfeat-morphological
8:  mfeat-zernike
9:  cmc
10:  optdigits
11:  credit-approval
12:  credit-g
13:  pendigits
14:  diabetes
15:  sick
16:  spambase
17:  splice
18:  tic-tac-toe
19:  vehicle
20:  electricity
21:  satimage
22:  eucalyptus
23:  isolet
24:  vowel
25:  analcatdata_authorship
26:  analcatdata_dmft
27:  mnist_784
28:  pc4
29:  pc3
30:  jm1
31:  kc2
32:  kc1
33:  pc1
34:  bank-marketing
35:  banknote-authentication
36:  blood-transfusion-service-center
37:  cnae-9
38:  first-order-theorem-proving
39:  har
40:  ilpd
41:  madelon
42:  nomao
43:  ozone-level-8hr
44:  phoneme
45:  qsar-biodeg
46:  wall-robot-navigation
47:  semeion
48:  wdbc
49:  adult
50:  Bioresponse
51:  PhishingWebsites
52:  GesturePhaseSegmentationProcessed
53:  cylinder-bands
54:  dresses-sales
55:  numerai28.6
56:  texture
57:  connect-4
58:  dna
59:  churn
60:  Devnagari-Script
61:  CIFAR_10
6

In [71]:
# Root directory holding the local River CSV files. It can be overridden with the
# RIVER_DATA_DIR environment variable so the notebook is not tied to one machine;
# the default keeps the original location.
river_path = os.environ.get('RIVER_DATA_DIR', '/home/chacon/river_data')

# Loading configuration for each CSV-backed dataset.
# NOTE(review): field meanings below are inferred from the key names; the code
# that consumes this dict is not in this part of the notebook — confirm.
#   path               - CSV location under `river_path`
#   time_feature       - name of the time column
#   time_feature_dtype - dtype for the time column
#   y                  - name of the target column
#   drop_columns       - columns to drop, if any
river_datasets = {
    'credit_card': {
        'path': f'{river_path}/CreditCard/creditcard.csv',
        'time_feature': 'Time',
        'time_feature_dtype': np.float64,
        'y': 'Class',
        'drop_columns': None
    },
    'electricity': {
        'path': f'{river_path}/Elec2/electricity.csv',
        'time_feature': 'date',
        'time_feature_dtype': np.float64,
        'y': 'class',
        'drop_columns': None
    }
}


In [72]:
# Display the dataset configuration with the paths already interpolated
river_datasets

{'credit_card': {'path': '/home/chacon/river_data/CreditCard/creditcard.csv',
  'time_feature': 'Time',
  'time_feature_dtype': numpy.float64,
  'y': 'Class',
  'drop_columns': None},
 'electricity': {'path': '/home/chacon/river_data/Elec2/electricity.csv',
  'time_feature': 'date',
  'time_feature_dtype': numpy.float64,
  'y': 'class',
  'drop_columns': None}}

In [73]:
# Define the base and drift streams: Agrawal classification function 0 is the
# initial concept, function 1 is the concept the stream drifts to
base_stream = datasets.synth.Agrawal(classification_function=0, seed=42)
drift_stream = datasets.synth.Agrawal(classification_function=1, seed=42)

# Create the ConceptDriftStream. The choice between the two streams during the
# transition is random, so it is seeded too; without a seed every run yields a
# different sample sequence even though both base streams are seeded.
drift_dataset = datasets.synth.ConceptDriftStream(
    stream=base_stream,
    drift_stream=drift_stream,
    position=5000,  # Point where drift happens
    width=1000,  # Transition width
    seed=42,
)

# Collect data: one flat record per sample (feature values plus a "target" entry)
samples = [{**x, "target": y} for x, y in drift_dataset.take(10000)]

# Convert to Pandas DataFrame
df_concept_drift_stream = pd.DataFrame(samples)

In [74]:
# Display the collected Agrawal drift samples (features plus the "target" column)
df_concept_drift_stream

Unnamed: 0,salary,commission,age,elevel,car,zipcode,hvalue,hyears,loan,target
0,103125.483800,0.000000,21,2,8,3,3.197690e+05,4,338349.743711,1
1,135983.343802,0.000000,25,4,14,0,4.238378e+05,7,116330.446695,1
2,98262.434776,0.000000,55,1,18,6,1.440881e+05,19,139095.354115,0
3,133009.041703,0.000000,68,1,14,5,2.333614e+05,7,478606.536103,1
4,63757.290865,16955.938254,26,2,12,4,5.228513e+05,24,229712.439836,1
...,...,...,...,...,...,...,...,...,...,...
9995,73958.888872,17955.682033,69,2,8,6,1.068675e+05,17,246361.294891,1
9996,25406.459491,80204.073622,28,0,5,0,1.121905e+06,29,111747.331643,0
9997,102405.265216,0.000000,33,4,18,1,1.010452e+06,6,5703.280711,0
9998,138684.334719,0.000000,59,3,13,7,9.286104e+04,8,221437.693253,0


In [82]:
# Define the base and drift streams: two Hyperplane generators that differ only
# in their seed
base_stream = datasets.synth.Hyperplane(seed=42)
drift_stream = datasets.synth.Hyperplane(seed=43)

# Create the ConceptDriftStream. The choice between the two streams during the
# transition is random, so it is seeded too; without a seed every run yields a
# different sample sequence even though both base streams are seeded.
drift_dataset = datasets.synth.ConceptDriftStream(
    stream=base_stream,
    drift_stream=drift_stream,
    position=5000,  # Point where drift happens
    width=1000,  # Transition width
    seed=42,
)

# Collect data: one flat record per sample (feature values plus a "target" entry)
samples = [{**x, "target": y} for x, y in drift_dataset.take(10000)]

# Convert to Pandas DataFrame
df_hyperplanes = pd.DataFrame(samples)

In [83]:
# Display the collected Hyperplane drift samples (features plus the "target" column)
df_hyperplanes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,0.218638,0.505355,0.026536,0.198838,0.649884,0.544941,0.220441,0.589266,0.809430,0.006499,0
1,0.155479,0.957213,0.336595,0.092746,0.096716,0.847494,0.603726,0.807128,0.729732,0.536228,0
2,0.829405,0.618520,0.861707,0.577352,0.704572,0.045824,0.227898,0.289388,0.079792,0.232791,0
3,0.364832,0.370181,0.209507,0.266978,0.936655,0.648035,0.609131,0.171139,0.729127,0.163402,1
4,0.556950,0.684614,0.842852,0.776000,0.229048,0.032100,0.315453,0.267741,0.210983,0.942910,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,0.554937,0.056220,0.900519,0.401132,0.461555,0.226459,0.635283,0.531701,0.290161,0.050500,0
9996,0.931941,0.286453,0.384279,0.437470,0.345432,0.144068,0.385008,0.211677,0.876050,0.492214,0
9997,0.639172,0.354614,0.301393,0.990236,0.575704,0.718551,0.346204,0.986871,0.247416,0.954042,1
9998,0.413878,0.751436,0.899097,0.393631,0.869690,0.318719,0.373261,0.489394,0.728662,0.346520,1


In [None]:
from river import datasets
import pandas as pd

def generate_dataset(dataset_name, params, n_samples=10000):
    """Draw samples from a synthetic generator and return them as a DataFrame.

    Parameters
    ----------
    dataset_name : str
        Attribute name looked up on ``datasets.synth`` (e.g. ``"SEA"``).
    params : dict
        Keyword arguments forwarded to the generator's constructor.
    n_samples : int, default 10000
        Number of samples requested from the generator via ``take``.

    Returns
    -------
    pandas.DataFrame
        One row per sample: the keys of the feature dict become columns, and
        the label is stored under ``'target'``.
    """
    generator = getattr(datasets.synth, dataset_name)(**params)
    records = [
        {**features, 'target': label}
        for features, label in generator.take(n_samples)
    ]
    return pd.DataFrame(records)

# Dataset configurations: each entry names a generator class (looked up on
# `datasets.synth` by generate_dataset) and the keyword arguments for it.
configs = [
    # Hyperplane runs, one per tuple of
    # (n_features, noise_percentage, seed, mag_change, n_drift_features)
    *(
        {
            "name": "Hyperplane",
            "params": {
                "n_features": n_features,
                "noise_percentage": noise,
                "seed": seed,
                "mag_change": mag_change,
                "n_drift_features": n_drift,
            },
        }
        for n_features, noise, seed, mag_change, n_drift in [
            (10, 0.1, 42, 0.0, 2),
            (10, 0.3, 43, 0.0, 2),
            (15, 0.1, 44, 0.1, 3),
            (15, 0.3, 45, 0.2, 3),
            (20, 0.1, 46, 0.5, 4),
            (20, 0.3, 47, 1.0, 4),
        ]
    ),
    # SEA runs: variants 1-3, each with its own seed
    *(
        {"name": "SEA", "params": {"variant": variant, "seed": seed}}
        for variant, seed in [(1, 42), (2, 43), (3, 44)]
    ),
    # Waveform runs: identical settings apart from the seed
    *({"name": "Waveform", "params": {"seed": seed}} for seed in (42, 99)),
]

# Generate each configured dataset, preprocess its features with the shared
# `preprocessor`, and write features + target to a CSV named after the
# generator and its seed
for config in configs:
    df = generate_dataset(config["name"], config["params"], n_samples=10000)

    # Split the target off from the features
    y = df['target']
    X = df.drop(columns = ['target'])

    X_preproc = preprocessor.fit_transform(X)

    # Only the SEA target is passed through a LabelEncoder (yielding integer codes)
    if config['name'] == 'SEA':
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)

    # Reassemble: transformed features first, target as the last column
    features_frame = pd.DataFrame(X_preproc)
    target_series = pd.Series(y, name = 'target')
    df = pd.concat([features_frame, target_series], axis = 1)

    filename = f"../data/synth_river/{config['name'].lower()}_seed{config['params'].get('seed', 'default')}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {filename}")


Saved ../data/synth_river/hyperplane_seed42.csv
Saved ../data/synth_river/hyperplane_seed43.csv
Saved ../data/synth_river/hyperplane_seed44.csv
Saved ../data/synth_river/hyperplane_seed45.csv
Saved ../data/synth_river/hyperplane_seed46.csv
Saved ../data/synth_river/hyperplane_seed47.csv
Saved ../data/synth_river/sea_seed42.csv
Saved ../data/synth_river/sea_seed43.csv
Saved ../data/synth_river/sea_seed44.csv
Saved ../data/synth_river/waveform_seed42.csv
Saved ../data/synth_river/waveform_seed99.csv


In [114]:
# Read back one of the generated CSV files to inspect what was written
pd.read_csv('../data/synth_river/hyperplane_seed42.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,-0.980123,0.017896,-1.650795,-1.045993,0.530771,0.147336,-0.965058,0.319726,1.080284,-1.708718,0
1,-1.199168,1.580653,-0.577255,-1.413347,-1.388897,1.199988,0.357753,1.072604,0.805302,0.121849,0
2,1.138135,0.409277,1.240881,0.264653,0.720553,-1.589210,-0.939320,-0.716575,-1.437166,-0.926728,0
3,-0.473093,-0.449606,-1.017280,-0.810050,1.525953,0.506024,0.376407,-1.125214,0.803215,-1.166511,1
4,0.193208,0.637866,1.175598,0.952492,-0.929664,-1.636960,-0.637147,-0.791382,-0.984522,1.527204,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,-1.489241,1.672315,0.397293,1.449290,-0.821015,-1.500984,-0.674113,-0.588615,-0.905076,1.161368,0
9996,-1.570047,0.875032,1.623647,-1.340829,0.834514,-0.741625,-1.511492,0.442452,-0.623104,-0.642215,1
9997,-1.080932,-1.007494,1.061026,-0.646709,1.606839,0.564462,1.008903,-0.654266,-1.126740,1.315694,0
9998,1.445211,-0.135788,-1.564446,0.742512,1.113042,-1.724706,0.869708,0.139887,0.821174,-1.386227,1
