In [1]:
!pip install arff



In [2]:
import pandas as pd
import numpy as np
import arff
from tqdm.auto import tqdm
from glob import glob
import os
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder


In [3]:
# Collect every *.zip archive found in any directory under the "../." root.
# The paths keep the "../." prefix produced by os.walk.
datasets = [
    zip_path
    for dir_path, _subdirs, _files in os.walk("../.")
    for zip_path in glob(os.path.join(dir_path, '*.zip'))
]

len(datasets)

37

In [4]:
import numpy as np


def prepare_data(X_original, max_nbr_values, max_nbr_values_cat):
    """Discretise high-cardinality features and flag categorical ones.

    For every column of ``X_original``:

    * If it has more than ``max_nbr_values`` distinct values, those distinct
      values are split into ``max_nbr_values`` equal-width bins (via
      ``np.histogram``) and the column's representative values become the
      bin centres.
    * If the resulting number of representative values is at most
      ``max_nbr_values_cat``, the column is flagged as categorical and, when
      it was binned, every entry of the returned copy is replaced by the
      centre of the bin it falls in.

    Bins are half-open ``[low, high)`` except the last one, which also
    includes its upper edge (the same convention ``np.histogram`` uses), so
    values lying exactly on an edge -- in particular the column minimum and
    maximum -- are binned as well.

    Parameters
    ----------
    X_original : np.ndarray
        2-D array, one column per feature. It is not modified.
    max_nbr_values : int
        Maximum number of distinct values kept per feature before binning.
    max_nbr_values_cat : int
        Maximum number of representative values for a feature to be flagged
        as categorical.

    Returns
    -------
    feature_values : dict
        Maps the feature index to its representative values: the array of
        unique values, or the list of bin centres when the feature was binned.
    is_categorical_feature : np.ndarray of bool
        One flag per feature.
    X : np.ndarray
        Copy of ``X_original`` with binned categorical features replaced by
        their bin centres. The copy keeps the input dtype, so NumPy casts the
        centres to that dtype on assignment.
    """
    X = np.copy(X_original)
    feature_values = dict()
    n_features = X.shape[1]
    is_categorical_feature = np.zeros(n_features, dtype=bool)
    for feature in range(n_features):
        values = np.unique(X[:, feature])
        bin_edges = None
        if len(values) > max_nbr_values:
            _, bin_edges = np.histogram(values, bins=max_nbr_values)
            values = [(low + high) / 2 for low, high in zip(bin_edges[:-1], bin_edges[1:])]
        feature_values[feature] = values

        if len(values) <= max_nbr_values_cat:
            is_categorical_feature[feature] = True

            if bin_edges is not None:
                # Vectorised bin lookup: index of the right-most edge that is
                # <= each value. The clip folds the column maximum (which sits
                # on the last edge) back into the last bin.
                column = np.asarray(X[:, feature], dtype=float)
                bin_index = np.searchsorted(bin_edges, column, side="right") - 1
                bin_index = np.clip(bin_index, 0, len(values) - 1)
                X[:, feature] = np.asarray(values)[bin_index]

    return feature_values, is_categorical_feature, X

In [31]:
# Convert every discovered archive into an .arff file (plus a .csv holding the
# class column for the "_y." files), written under the current directory.
for dataset in tqdm(datasets):
    df = pd.read_csv(dataset)

    # Standardise the int/float columns; all remaining columns are passed through.
    ct = ColumnTransformer([
                ('std_scaler', StandardScaler(), make_column_selector(dtype_include=['int', 'float']))],
                remainder='passthrough', verbose_feature_names_out=False, sparse_threshold=0, n_jobs=os.cpu_count())

    # Files with "_y." in their path keep their last column, cast to string,
    # as an extra "classe" column; every other object-typed column is dropped.
    has_class_column = "_y." in dataset
    if has_class_column:
        classe = df[[df.columns[-1]]].astype(str)
        df = df.select_dtypes(exclude=['object'])
        df["classe"] = classe
    else:
        df = df.select_dtypes(exclude=['object'])

    df = pd.DataFrame(ct.fit_transform(df))

    # Output paths: the walked path minus its first three characters, with the
    # extension swapped.
    output_base = dataset[3:]
    arff_path = output_base.replace(".zip", ".arff")

    if has_class_column:
        df[[df.columns[-1]]] = df[[df.columns[-1]]].astype(str)
        df[[df.columns[-1]]].to_csv(output_base.replace(".zip", ".csv"))

    # Nominal attribute declaration listing every distinct value of the last
    # column. Built with join so that an empty value list still yields "{}".
    class_values = ",".join(f"'{value}'" for value in df[df.columns[-1]].unique())
    classes = "\r\n@attribute class {" + class_values + "}"

    # NOTE(review): the binned matrix X is computed here but the un-binned
    # df values are what gets written below -- confirm whether X was meant to
    # replace the feature columns in the dump.
    _, _, X = prepare_data(df.values[:, :-1], 20, 20)

    arff.dump(arff_path, np.hstack([df.values[:, :-1], df.values[:, -1].reshape(-1, 1)]), relation="none", names=df.columns)

    # Append the class declaration to the end of the file that was just
    # written. The context manager closes (and flushes) the handle on every
    # iteration instead of leaking it.
    # NOTE(review): this places the @attribute line after the dumped content;
    # verify that the downstream ARFF consumer accepts that.
    with open(arff_path, 'a') as file:
        file.write(classes)

  0%|          | 0/19 [00:00<?, ?it/s]

In [27]:
# Preview the matrix handed to arff.dump: all columns but the last, followed
# by the last column kept two-dimensional through a slice.
np.hstack((df.values[:, :-1], df.values[:, -1:]))

array([[-0.9006811702978088, 1.019004351971607, -1.3402265266227624,
        '-1.3154442950077403'],
       [-1.1430169111851105, -0.13197947932162468, -1.3402265266227624,
        '-1.3154442950077403'],
       [-1.3853526520724133, 0.32841405319566835, -1.3970639535363654,
        '-1.3154442950077403'],
       [-1.5065205225160652, 0.09821728693702184, -1.2833890997091593,
        '-1.3154442950077403'],
       [-1.0218490407414595, 1.2492011182302534, -1.3402265266227624,
        '-1.3154442950077403'],
       [-0.537177558966854, 1.939791417006192, -1.1697142458819532,
        '-1.0521799264271385'],
       [-1.5065205225160652, 0.7888075857129604, -1.3402265266227624,
        '-1.1838121107174393'],
       [-1.0218490407414595, 0.7888075857129604, -1.2833890997091593,
        '-1.3154442950077403'],
       [-1.7488562634033669, -0.3621762455802712, -1.3402265266227624,
        '-1.3154442950077403'],
       [-1.1430169111851105, 0.09821728693702184, -1.2833890997091593,
        '

In [7]:
# Show the first discovered path with its first four characters (the "../."
# walk root) removed.
datasets[0][4:]

'\\real\\adult_y.zip'