In [1]:
import openml
from sklearn.datasets import fetch_openml
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

import numpy as np

from scipy.io.arff import loadarff

import os

In [2]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), make_column_selector(dtype_exclude=['object', 'category'])),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output = False), make_column_selector(dtype_include=['object', 'category']))
])

In [3]:
shapes = []
for i, (key, item) in enumerate(openml.datasets.list_datasets(tag='OpenML-CC18').items()):
    print(f"{i}:  {item['name']}")
    if item['name'] not in ('CIFAR_10', 'Devnagari-Script', 'Fashion-MNIST', 'mnist_784'):

        data = fetch_openml(data_id = item['did'])
        X = data['data']
        y = data['target']

        feature_names = data['feature_names']
        target_names = data['target_names']

        df = pd.concat([X, y], axis = 1)
        df = df.rename(columns = {data['target_names'][0]: 'y_target_class'})
        df = df.dropna()
        df = df.sample(frac=1).reset_index(drop=True)

        if df.shape[0] > 0:
            label_encoder = LabelEncoder()
            df['y_target_class'] = label_encoder.fit_transform(df['y_target_class'])

            X_transformed = preprocessor.fit_transform(df.drop(columns=['y_target_class']))
            y_transformed = df['y_target_class']

            df = pd.concat([pd.DataFrame(X_transformed), y_transformed], axis = 1)

            df.to_csv(f"../data/CC18/{item['name']}.csv", index = False)

            shapes.append(df.shape)

  for i, (key, item) in enumerate(openml.datasets.list_datasets(tag='OpenML-CC18').items()):


0:  kr-vs-kp
1:  letter
2:  balance-scale
3:  mfeat-factors
4:  mfeat-fourier
5:  breast-w
6:  mfeat-karhunen
7:  mfeat-morphological
8:  mfeat-zernike
9:  cmc
10:  optdigits
11:  credit-approval
12:  credit-g
13:  pendigits
14:  diabetes
15:  sick
16:  spambase
17:  splice
18:  tic-tac-toe
19:  vehicle
20:  electricity
21:  satimage
22:  eucalyptus
23:  isolet
24:  vowel
25:  analcatdata_authorship
26:  analcatdata_dmft
27:  mnist_784
28:  pc4
29:  pc3
30:  jm1
31:  kc2
32:  kc1
33:  pc1
34:  bank-marketing
35:  banknote-authentication
36:  blood-transfusion-service-center
37:  cnae-9
38:  first-order-theorem-proving
39:  har
40:  ilpd
41:  madelon
42:  nomao
43:  ozone-level-8hr
44:  phoneme
45:  qsar-biodeg
46:  wall-robot-navigation
47:  semeion
48:  wdbc
49:  adult
50:  Bioresponse
51:  PhishingWebsites
52:  GesturePhaseSegmentationProcessed
53:  cylinder-bands
54:  dresses-sales
55:  numerai28.6
56:  texture
57:  connect-4
58:  dna
59:  churn
60:  Devnagari-Script
61:  CIFAR_10
6

In [3]:
data_path = '../data/synth_moa_arff'
datasets_arff = sorted(os.listdir(data_path))

In [4]:
for i,dataset in enumerate(datasets_arff):
    print(f'{i+1}/10: {dataset}')
    data = loadarff(f'{data_path}/{dataset}')
    df = pd.DataFrame(data[0])

    X = df.drop(columns=['class'])
    y = df['class']

    label_encoder = LabelEncoder()
    y_transformed = label_encoder.fit_transform(y)

    X_transformed = preprocessor.fit_transform(X)

    df = pd.concat([pd.DataFrame(X_transformed), pd.Series(y_transformed, name = 'class')], axis = 1)

    df.to_csv(f"../data/synth_moa_csv/{dataset[:-5]}.csv", index = False)

1/10: HyperplaneFaster.arff
2/10: HyperplaneSlow.arff
3/10: LED.arff
4/10: LEDNoDrift.arff
5/10: RBFBlips.arff
6/10: RBFGradualRecurring.arff
7/10: RBFNoDrift.arff
8/10: RandomTreeRecurring.arff
9/10: SEASudden.arff
10/10: SEASuddenFaster.arff


In [2]:
data_path = '../data/real_moa_arff'
datasets_arff = sorted(os.listdir(data_path))

In [18]:
target_class = {
    'airlines.arff': 'Delay',
    'covtypeNorm.arff': 'class',
    'elecNormNew.arff': 'class',
    'poker-lsn.arff': 'class'
}

for i,dataset in enumerate(datasets_arff):
    print(f'{i+1}/4: {dataset}')
    data = loadarff(f'{data_path}/{dataset}')
    df = pd.DataFrame(data[0])

    X = df.drop(columns=[target_class[dataset]])
    y = df[target_class[dataset]]

    label_encoder = LabelEncoder()
    y_transformed = label_encoder.fit_transform(y)

    X_transformed = preprocessor.fit_transform(X)

    df = pd.concat([pd.DataFrame(X_transformed), pd.Series(y_transformed, name = 'class')], axis = 1)

    df.to_csv(f"../data/real_moa_csv/{dataset[:-5]}.csv", index = False)

1/4: airlines.arff
2/4: covtypeNorm.arff
3/4: elecNormNew.arff
4/4: poker-lsn.arff


In [None]:
n_features_before_enconding = []
n_classes = []
n_instances = []
n_features_after_encoding = []
names = []

for i, (key, item) in enumerate(openml.datasets.list_datasets(tag='OpenML-CC18').items()):
    print(f"{i}:  {item['name']}")
    if item['name'] not in ('CIFAR_10', 'Devnagari-Script', 'Fashion-MNIST', 'mnist_784'):

        data = fetch_openml(data_id = item['did'])
        X = data['data']
        y = data['target']

        df = pd.concat([X, y], axis = 1)
        df = df.rename(columns = {data['target_names'][0]: 'y_target_class'})
        df = df.dropna()
        df = df.sample(frac=1).reset_index(drop=True)

        if df.shape[0] > 0:
            X_transformed = preprocessor.fit_transform(df.drop(columns=['y_target_class']))

            names.append(item['name'])
            n_features_before_enconding.append(X.shape[1])
            n_classes.append(y.nunique())
            n_instances.append(df.shape[0])
            n_features_after_encoding.append(X_transformed.shape[1])


df_cc18 = pd.DataFrame({
    'Dataset': names,
    'Size': n_instances,
    'Features before encoding': n_features_before_enconding,
    'Features after encoding': n_features_after_encoding,
    'Number of classes': n_classes
})

  for i, (key, item) in enumerate(openml.datasets.list_datasets(tag='OpenML-CC18').items()):


0:  kr-vs-kp
1:  letter
2:  balance-scale
3:  mfeat-factors
4:  mfeat-fourier
5:  breast-w
6:  mfeat-karhunen
7:  mfeat-morphological
8:  mfeat-zernike
9:  cmc
10:  optdigits
11:  credit-approval
12:  credit-g
13:  pendigits
14:  diabetes
15:  sick
16:  spambase
17:  splice
18:  tic-tac-toe
19:  vehicle
20:  electricity
21:  satimage
22:  eucalyptus
23:  isolet
24:  vowel
25:  analcatdata_authorship
26:  analcatdata_dmft
27:  mnist_784
28:  pc4
29:  pc3
30:  jm1
31:  kc2
32:  kc1
33:  pc1
34:  bank-marketing
35:  banknote-authentication
36:  blood-transfusion-service-center
37:  cnae-9
38:  first-order-theorem-proving
39:  har
40:  ilpd
41:  madelon
42:  nomao
43:  ozone-level-8hr
44:  phoneme
45:  qsar-biodeg
46:  wall-robot-navigation
47:  semeion
48:  wdbc
49:  adult
50:  Bioresponse
51:  PhishingWebsites
52:  GesturePhaseSegmentationProcessed
53:  cylinder-bands
54:  dresses-sales
55:  numerai28.6
56:  texture
57:  connect-4
58:  dna
59:  churn
60:  Devnagari-Script
61:  CIFAR_10
6

In [19]:
df_cc18['Dataset'] = df_cc18['Dataset'].apply(lambda x: x.lower())
df_cc18 = df_cc18.sort_values('Dataset')

In [20]:
df_cc18.to_latex(index=False)

'\\begin{tabular}{lrrrr}\n\\toprule\nDataset & Size & Features before encoding & Features after encoding & Number of classes \\\\\n\\midrule\nadult & 45222 & 14 & 104 & 2 \\\\\nanalcatdata_authorship & 841 & 70 & 70 & 4 \\\\\nanalcatdata_dmft & 797 & 4 & 21 & 6 \\\\\nbalance-scale & 625 & 4 & 4 & 3 \\\\\nbank-marketing & 45211 & 16 & 51 & 2 \\\\\nbanknote-authentication & 1372 & 4 & 4 & 2 \\\\\nbioresponse & 3751 & 1776 & 1776 & 2 \\\\\nblood-transfusion-service-center & 748 & 4 & 4 & 2 \\\\\nbreast-w & 683 & 9 & 9 & 2 \\\\\ncar & 1728 & 6 & 21 & 4 \\\\\nchurn & 5000 & 20 & 33 & 2 \\\\\nclimate-model-simulation-crashes & 540 & 18 & 18 & 2 \\\\\ncmc & 1473 & 9 & 24 & 3 \\\\\ncnae-9 & 1080 & 856 & 856 & 9 \\\\\nconnect-4 & 67557 & 42 & 126 & 3 \\\\\ncredit-approval & 653 & 15 & 46 & 2 \\\\\ncredit-g & 1000 & 20 & 61 & 2 \\\\\ncylinder-bands & 277 & 37 & 133 & 2 \\\\\ndiabetes & 768 & 8 & 8 & 2 \\\\\ndna & 3186 & 180 & 360 & 3 \\\\\ndresses-sales & 99 & 12 & 98 & 2 \\\\\nelectricity & 453

In [None]:
data_path = '../data/real_moa_arff'
datasets_arff = sorted(os.listdir(data_path))

n_features_before_enconding = []
n_classes = []
n_instances = []
n_features_after_encoding = []
names = []

target_class = {
    'airlines.arff': 'Delay',
    'covtypeNorm.arff': 'class',
    'elecNormNew.arff': 'class',
    'poker-lsn.arff': 'class'
}

for i,dataset in enumerate(datasets_arff):
    print(f'{i+1}/4: {dataset}')
    data = loadarff(f'{data_path}/{dataset}')
    df = pd.DataFrame(data[0])

    X = df.drop(columns=[target_class[dataset]])
    y = df[target_class[dataset]]

    label_encoder = LabelEncoder()
    y_transformed = label_encoder.fit_transform(y)

    X_transformed = preprocessor.fit_transform(X)

    names.append(dataset.replace('.arff', ''))
    n_features_before_enconding.append(X.shape[1])
    n_classes.append(y.nunique())
    n_instances.append(df.shape[0])
    n_features_after_encoding.append(X_transformed.shape[1])

df_real_moa = pd.DataFrame({
    'Dataset': names,
    'Size': n_instances,
    'Features before encoding': n_features_before_enconding,
    'Features after encoding': n_features_after_encoding,
    'Number of classes': n_classes
})

1/4: airlines.arff
2/4: covtypeNorm.arff
3/4: elecNormNew.arff
4/4: poker-lsn.arff


In [33]:
df_real_moa

Unnamed: 0,Dataset,Size,Features before encoding,Features after encoding,Number of classes
0,airlines,539383,7,614,2
1,covtypeNorm,581012,54,98,7
2,elecNormNew,45312,8,14,2
3,poker-lsn,829201,10,25,10


In [None]:
def replace_names(name):
    dict_names = {
        'covtypeNorm': 'Covertype',
        'elecNormNew': 'Electricity',
        'poker-lsn': 'Poker',
        'airlines': 'Airlines'
    }
    return dict_names[name]

In [35]:
df_real_moa['Dataset'] = df_real_moa['Dataset'].apply(replace_names)

In [36]:
df_real_moa

Unnamed: 0,Dataset,Size,Features before encoding,Features after encoding,Number of classes
0,Airlines,539383,7,614,2
1,Covertype,581012,54,98,7
2,Electricity,45312,8,14,2
3,Poker,829201,10,25,10


In [38]:
df_real_moa.to_latex()

'\\begin{tabular}{llrrrr}\n\\toprule\n & Dataset & Size & Features before encoding & Features after encoding & Number of classes \\\\\n\\midrule\n0 & Airlines & 539383 & 7 & 614 & 2 \\\\\n1 & Covertype & 581012 & 54 & 98 & 7 \\\\\n2 & Electricity & 45312 & 8 & 14 & 2 \\\\\n3 & Poker & 829201 & 10 & 25 & 10 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [3]:
data_path = '../data/synth_moa_arff'
datasets_arff = sorted(os.listdir(data_path))

n_features_before_enconding = []
n_classes = []
n_instances = []
n_features_after_encoding = []
names = []

for i,dataset in enumerate(datasets_arff):
    print(f'{i+1}/10: {dataset}')
    data = loadarff(f'{data_path}/{dataset}')
    df = pd.DataFrame(data[0])

    X = df.drop(columns=['class'])
    y = df['class']

    label_encoder = LabelEncoder()
    y_transformed = label_encoder.fit_transform(y)

    X_transformed = preprocessor.fit_transform(X)

    names.append(dataset.replace('.arff', ''))
    n_features_before_enconding.append(X.shape[1])
    n_classes.append(y.nunique())
    n_instances.append(df.shape[0])
    n_features_after_encoding.append(X_transformed.shape[1])


df_synth_moa = pd.DataFrame({
    'Dataset': names,
    'Size': n_instances,
    'Features before encoding': n_features_before_enconding,
    'Features after encoding': n_features_after_encoding,
    'Number of classes': n_classes
})

1/10: HyperplaneFaster.arff
2/10: HyperplaneSlow.arff
3/10: LED.arff
4/10: LEDNoDrift.arff
5/10: RBFBlips.arff
6/10: RBFGradualRecurring.arff
7/10: RBFNoDrift.arff
8/10: RandomTreeRecurring.arff
9/10: SEASudden.arff
10/10: SEASuddenFaster.arff


In [4]:
df_synth_moa

Unnamed: 0,Dataset,Size,Features before encoding,Features after encoding,Number of classes
0,HyperplaneFaster,100000,10,10,2
1,HyperplaneSlow,100000,10,10,2
2,LED,100000,24,48,10
3,LEDNoDrift,100000,24,48,10
4,RBFBlips,100000,20,20,4
5,RBFGradualRecurring,100000,20,20,4
6,RBFNoDrift,100000,10,10,2
7,RandomTreeRecurring,100000,10,30,4
8,SEASudden,100000,3,3,2
9,SEASuddenFaster,100000,3,3,2


In [5]:
def define_drifts(name):
    drifts = {
        'HyperplaneFaster': 'incremental',
        'HyperplaneSlow': 'incremental',
        'LED': 'mixed',
        'LEDNoDrift': 'none',
        'RBFBlips': 'blips',
        'RBFGradualRecurring': 'gradual',
        'RBFNoDrift': 'none',
        'RandomTreeRecurring': 'recurring',
        'SEASudden': 'sudden',
        'SEASuddenFaster': 'sudden'
    }
    return drifts[name]

In [6]:
df_synth_moa['Drift type'] = df_synth_moa['Dataset'].apply(define_drifts)
df_synth_moa

Unnamed: 0,Dataset,Size,Features before encoding,Features after encoding,Number of classes,Drift type
0,HyperplaneFaster,100000,10,10,2,incremental
1,HyperplaneSlow,100000,10,10,2,incremental
2,LED,100000,24,48,10,mixed
3,LEDNoDrift,100000,24,48,10,none
4,RBFBlips,100000,20,20,4,blips
5,RBFGradualRecurring,100000,20,20,4,gradual
6,RBFNoDrift,100000,10,10,2,none
7,RandomTreeRecurring,100000,10,30,4,recurring
8,SEASudden,100000,3,3,2,sudden
9,SEASuddenFaster,100000,3,3,2,sudden


In [16]:
def define_acronym(name):
    acronyms = {
        'HyperplaneFaster': 'Hyp_F',
        'HyperplaneSlow': 'Hyp_S',
        'LED': 'LED_M',
        'LEDNoDrift': 'LED_ND',
        'RBFBlips': 'RBF_B',
        'RBFGradualRecurring': 'RBG_GR',
        'RBFNoDrift': 'RBF_ND',
        'RandomTreeRecurring': 'Tree_S',
        'SEASudden': 'SEA_S',
        'SEASuddenFaster': 'SEA_F'
    }
    return acronyms[name]

In [20]:
df_synth_moa['Acronym'] = df_synth_moa['Dataset'].apply(define_acronym)
df_synth_moa = df_synth_moa[['Dataset', 'Acronym', 'Size', 'Features before encoding', 'Features after encoding', 'Number of classes', 'Drift type']]


In [21]:
df_synth_moa.to_latex()

'\\begin{tabular}{lllrrrrl}\n\\toprule\n & Dataset & Acronym & Size & Features before encoding & Features after encoding & Number of classes & Drift type \\\\\n\\midrule\n0 & HyperplaneFaster & Hyp_F & 100000 & 10 & 10 & 2 & incremental \\\\\n1 & HyperplaneSlow & Hyp_S & 100000 & 10 & 10 & 2 & incremental \\\\\n2 & LED & LED_M & 100000 & 24 & 48 & 10 & mixed \\\\\n3 & LEDNoDrift & LED_ND & 100000 & 24 & 48 & 10 & none \\\\\n4 & RBFBlips & RBF_B & 100000 & 20 & 20 & 4 & blips \\\\\n5 & RBFGradualRecurring & RBG_GR & 100000 & 20 & 20 & 4 & gradual \\\\\n6 & RBFNoDrift & RBF_ND & 100000 & 10 & 10 & 2 & none \\\\\n7 & RandomTreeRecurring & Tree_S & 100000 & 10 & 30 & 4 & recurring \\\\\n8 & SEASudden & SEA_S & 100000 & 3 & 3 & 2 & sudden \\\\\n9 & SEASuddenFaster & SEA_F & 100000 & 3 & 3 & 2 & sudden \\\\\n\\bottomrule\n\\end{tabular}\n'