In [2]:
import openml
from sklearn.datasets import fetch_openml
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

import numpy as np

from scipy.io.arff import loadarff

import os

In [16]:
# Shared feature preprocessor: columns whose dtype is object/category are
# one-hot encoded, every other column is standardised. The conversion loops
# in the following cells call `preprocessor.fit_transform(...)`, which refits
# it from scratch on each dataset.
non_numeric_dtypes = ['object', 'category']

select_numeric = make_column_selector(dtype_exclude=non_numeric_dtypes)
select_categorical = make_column_selector(dtype_include=non_numeric_dtypes)

# handle_unknown='ignore' and sparse_output=False are kept exactly as before;
# the latter makes the encoder emit a dense array rather than a sparse matrix.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), select_numeric),
        ('cat', one_hot, select_categorical),
    ]
)

In [36]:
# Convert each dataset tagged 'OpenML-CC18' into a preprocessed CSV under
# ../data/CC18 and collect the shape of every file written in `shapes`.
#
# Per dataset: download -> attach target as 'y_target_class' -> drop rows with
# any missing value -> shuffle rows -> label-encode target -> scale / one-hot
# encode features with `preprocessor` -> write CSV.

# Datasets that are listed but never downloaded or converted.
# NOTE(review): the reason for excluding these four is not recorded here.
SKIPPED_DATASETS = ('CIFAR_10', 'Devnagari-Script', 'Fashion-MNIST', 'mnist_784')

# Fixed seed so the shuffled row order (and therefore the CSVs on disk) is
# identical on every run; previously the shuffle was unseeded.
SHUFFLE_SEED = 0

CC18_OUTPUT_DIR = '../data/CC18'
os.makedirs(CC18_OUTPUT_DIR, exist_ok=True)  # to_csv does not create parent folders

# NOTE(review): the recorded output shows a warning raised by this
# list_datasets(...) call; its text is truncated, so the call is left as is.
cc18_listing = openml.datasets.list_datasets(tag='OpenML-CC18')

shapes = []
for i, item in enumerate(cc18_listing.values()):
    print(f"{i}:  {item['name']}")
    if item['name'] in SKIPPED_DATASETS:
        continue

    data = fetch_openml(data_id=item['did'])
    X = data['data']
    y = data['target']

    df = pd.concat([X, y], axis=1)
    df = df.rename(columns={data['target_names'][0]: 'y_target_class'})
    # Any row with a missing value in any column is discarded, so a dataset
    # may end up empty; that case is skipped below.
    df = df.dropna()
    # reset_index gives a fresh 0..n-1 index, which the column-wise concat
    # further down relies on to line features up with the target.
    df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

    if df.shape[0] == 0:
        continue

    label_encoder = LabelEncoder()
    df['y_target_class'] = label_encoder.fit_transform(df['y_target_class'])

    X_transformed = preprocessor.fit_transform(df.drop(columns=['y_target_class']))
    y_transformed = df['y_target_class']

    # Feature columns are named 0..k-1 (default DataFrame labels); the target
    # keeps the name 'y_target_class' and is the last column.
    df = pd.concat([pd.DataFrame(X_transformed), y_transformed], axis=1)

    df.to_csv(f"{CC18_OUTPUT_DIR}/{item['name']}.csv", index=False)

    shapes.append(df.shape)

  for i, (key, item) in enumerate(openml.datasets.list_datasets(tag='OpenML-CC18').items()):


0:  kr-vs-kp
1:  letter
2:  balance-scale
3:  mfeat-factors
4:  mfeat-fourier
5:  breast-w
6:  mfeat-karhunen
7:  mfeat-morphological
8:  mfeat-zernike
9:  cmc
10:  optdigits
11:  credit-approval
12:  credit-g
13:  pendigits
14:  diabetes
15:  sick
16:  spambase
17:  splice
18:  tic-tac-toe
19:  vehicle
20:  electricity
21:  satimage
22:  eucalyptus
23:  isolet
24:  vowel
25:  analcatdata_authorship
26:  analcatdata_dmft
27:  mnist_784
28:  pc4
29:  pc3
30:  jm1
31:  kc2
32:  kc1
33:  pc1
34:  bank-marketing
35:  banknote-authentication
36:  blood-transfusion-service-center
37:  cnae-9
38:  first-order-theorem-proving
39:  har
40:  ilpd
41:  madelon
42:  nomao
43:  ozone-level-8hr
44:  phoneme
45:  qsar-biodeg
46:  wall-robot-navigation
47:  semeion
48:  wdbc
49:  adult
50:  Bioresponse
51:  PhishingWebsites
52:  GesturePhaseSegmentationProcessed
53:  cylinder-bands
54:  dresses-sales
55:  numerai28.6
56:  texture
57:  connect-4
58:  dna
59:  churn
60:  Devnagari-Script
61:  CIFAR_10
6

In [3]:
# Input directory for the next conversion cell (ARFF files).
data_path = '../data/synth_moa_arff'

# Keep only *.arff entries, sorted by name. os.listdir returns every directory
# entry, including hidden or auxiliary files (e.g. .DS_Store,
# .ipynb_checkpoints), which the ARFF loader in the next cell cannot parse.
datasets_arff = sorted(
    entry for entry in os.listdir(data_path) if entry.lower().endswith('.arff')
)

In [4]:
# Convert every ARFF file listed in `datasets_arff` to a preprocessed CSV in
# ../data/synth_moa_csv. The target column is expected to be named 'class' in
# each file; it is label-encoded and written as the last column, also named
# 'class'. Feature columns go through the shared `preprocessor`.
synth_output_dir = '../data/synth_moa_csv'
os.makedirs(synth_output_dir, exist_ok=True)  # to_csv does not create parent folders

n_datasets = len(datasets_arff)  # progress total derived from the listing, not hard-coded
for position, dataset in enumerate(datasets_arff, start=1):
    print(f'{position}/{n_datasets}: {dataset}')
    # loadarff returns a (records, metadata) pair; only the records are used.
    data = loadarff(f'{data_path}/{dataset}')
    df = pd.DataFrame(data[0])

    X = df.drop(columns=['class'])
    y = df['class']

    label_encoder = LabelEncoder()
    y_transformed = label_encoder.fit_transform(y)

    X_transformed = preprocessor.fit_transform(X)

    df = pd.concat([pd.DataFrame(X_transformed), pd.Series(y_transformed, name='class')], axis=1)

    # splitext removes the extension whatever its length (replaces the [:-5] slice).
    stem = os.path.splitext(dataset)[0]
    df.to_csv(f"{synth_output_dir}/{stem}.csv", index=False)

1/10: HyperplaneFaster.arff
2/10: HyperplaneSlow.arff
3/10: LED.arff
4/10: LEDNoDrift.arff
5/10: RBFBlips.arff
6/10: RBFGradualRecurring.arff
7/10: RBFNoDrift.arff
8/10: RandomTreeRecurring.arff
9/10: SEASudden.arff
10/10: SEASuddenFaster.arff


In [17]:
# Input directory for the next conversion cell (ARFF files).
data_path = '../data/real_moa_arff'

# Keep only *.arff entries, sorted by name. os.listdir returns every directory
# entry, including hidden or auxiliary files (e.g. .DS_Store,
# .ipynb_checkpoints), which the ARFF loader in the next cell cannot parse
# and which have no entry in that cell's target-column mapping.
datasets_arff = sorted(
    entry for entry in os.listdir(data_path) if entry.lower().endswith('.arff')
)

In [18]:
# Convert every ARFF file listed in `datasets_arff` to a preprocessed CSV in
# ../data/real_moa_csv. Unlike the previous conversion cell, the target column
# name differs per file, so it is looked up in `target_class`. The encoded
# target is always written as the last column under the name 'class'.

# ARFF file name -> name of its target column.
target_class = {
    'airlines.arff': 'Delay',
    'covtypeNorm.arff': 'class',
    'elecNormNew.arff': 'class',
    'poker-lsn.arff': 'class'
}

# Fail before doing any work if a listed file has no configured target,
# instead of hitting a bare KeyError part-way through the loop.
unmapped = [name for name in datasets_arff if name not in target_class]
if unmapped:
    raise KeyError(f"no target column configured in target_class for: {unmapped}")

real_output_dir = '../data/real_moa_csv'
os.makedirs(real_output_dir, exist_ok=True)  # to_csv does not create parent folders

n_datasets = len(datasets_arff)  # progress total derived from the listing, not hard-coded
for position, dataset in enumerate(datasets_arff, start=1):
    print(f'{position}/{n_datasets}: {dataset}')
    # loadarff returns a (records, metadata) pair; only the records are used.
    data = loadarff(f'{data_path}/{dataset}')
    df = pd.DataFrame(data[0])

    target_column = target_class[dataset]
    X = df.drop(columns=[target_column])
    y = df[target_column]

    label_encoder = LabelEncoder()
    y_transformed = label_encoder.fit_transform(y)

    X_transformed = preprocessor.fit_transform(X)

    df = pd.concat([pd.DataFrame(X_transformed), pd.Series(y_transformed, name='class')], axis=1)

    # splitext removes the extension whatever its length (replaces the [:-5] slice).
    stem = os.path.splitext(dataset)[0]
    df.to_csv(f"{real_output_dir}/{stem}.csv", index=False)

1/4: airlines.arff
2/4: covtypeNorm.arff
3/4: elecNormNew.arff
4/4: poker-lsn.arff
