In [1]:
import openml
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Fetch OpenML dataset 1483 in array form, using the dataset's default target
# attribute as the label vector y.
dataset = openml.datasets.get_dataset(1483)
X, y, cat_indicator, names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute,
)
cat_indicator = np.asarray(cat_indicator)

# Column positions, split according to the categorical indicator that was
# returned together with the data.
categorical_columns = np.flatnonzero(cat_indicator)
numerical_columns = np.flatnonzero(~cat_indicator)

# Preprocessor: one-hot encode the flagged columns and standardize the others.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), categorical_columns),
        ('normalizer', StandardScaler(), numerical_columns),
    ],
    remainder='passthrough',
)

X.shape

(164860, 7)

In [2]:
# Show the distinct labels found in y together with how many samples carry each one.
np.unique(y, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([ 2973,  2848, 32710, 54480,  6168,  5210, 27244,  1706, 11779,
        18361,  1381]))

In [4]:
# Keep only classes 2, 3, 6, 8, and 9 (the five most frequent labels in the
# counts shown above).
# np.isin is used instead of np.in1d, which NumPy has deprecated; for a 1-D y
# both produce the same boolean mask.
to_keep = np.isin(y, [2, 3, 6, 8, 9])

# Apply the same mask to features and labels so the rows stay aligned.
X = X[to_keep]
y = y[to_keep]
print(X.shape, y.shape)

(144574, 7) (144574,)


In [6]:
# Shuffle the rows of X and y with one shared permutation so that features and
# labels stay aligned. The permutation comes from a seeded generator, which
# makes the row order -- and therefore the train/test files written later --
# identical on every Restart & Run All.
SHUFFLE_SEED = 0
shuffle = np.random.default_rng(SHUFFLE_SEED).permutation(X.shape[0])
X = X[shuffle]
y = y[shuffle]

In [10]:
# Rows 0-499 of the shuffled data become the training set and rows 1000-5999
# the test set.
# NOTE(review): rows 500-999 are assigned to neither split here -- confirm
# whether that gap is intentional.
train_rows = slice(0, 500)
test_rows = slice(1000, 6000)
X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [11]:
# Write each split to its own text file using np.savetxt's default format and
# delimiter (only the file name and the array are passed).
for filename, array in (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X_test.csv', X_test),
    ('y_test.csv', y_test),
):
    np.savetxt(filename, array)