In [1]:
import pandas as pd
import numpy as np
import re
import pathlib
from glob import glob
from random import seed
from sklearn import model_selection, preprocessing
import dill

In [2]:
# Fixed seed, passed as `random_state` to the train/validation/test splits below
# so that the splits are reproducible across runs.
SEED = 1024

In [3]:
# Collect every CSV one directory up, then keep only the artificial datasets,
# i.e. the files whose name is purely numeric (e.g. "../12.csv").
all_csvs = glob("../*.csv")
# Match on the file stem rather than on the raw path string: the previous
# pattern left the "." before "csv" unescaped, was not anchored at the end
# (so "1234csv.csv" was accepted), and hard-coded "/" as the separator, which
# never matches the "\\"-separated paths glob() returns on Windows.
# sorted() gives a deterministic processing order; glob() order is arbitrary.
all_artifical_dataset_fns = sorted(
    fn for fn in all_csvs if re.fullmatch(r"\d+", pathlib.Path(fn).stem)
)

In [4]:
# Wrap each selected filename in a pathlib.Path so that path attributes
# (such as .stem, used when naming the output files) are available.
all_artifical_dataset_fns = list(map(pathlib.Path, all_artifical_dataset_fns))

In [8]:
# For every artificial dataset: integer-encode all columns, split the rows
# 80/10/10 into train/validation/test, and write the splits together with the
# fitted one-hot encoder and column metadata to ../datasets_for_comparison/.
output_dir = pathlib.Path("../datasets_for_comparison")
# Create the target directory up front so the writes below do not fail when it
# does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

for dataset_path in all_artifical_dataset_fns:
    df = pd.read_csv(dataset_path)
    ds_name = dataset_path.stem.strip()

    # Every column of the dataset is treated as categorical.
    categorical = df.columns
    categorical_idx = [df.columns.get_loc(cat) for cat in categorical]

    # Replace each value by its integer category code
    # (pandas assigns the code -1 to missing values).
    df = df.astype("category").apply(lambda col: col.cat.codes)

    # 80% train; the remaining 20% is halved into validation and test.
    train_df, holdout_df = model_selection.train_test_split(
        df, train_size=0.8, random_state=SEED
    )
    val_df, test_df = model_selection.train_test_split(
        holdout_df, train_size=0.5, random_state=SEED
    )

    train_df.to_csv(output_dir / f"{ds_name}_train.csv", index_label="idx")
    val_df.to_csv(output_dir / f"{ds_name}_val.csv", index_label="idx")
    # Bug fix: the test file used to receive a second copy of val_df, so the
    # test split was never saved and validation/test data were identical.
    test_df.to_csv(output_dir / f"{ds_name}_test.csv", index_label="idx")

    # The encoder is fitted on all rows of every column except the last one
    # (presumably the target/label column -- TODO confirm).
    oh_enc = preprocessing.OneHotEncoder(
        sparse_output=False, handle_unknown="ignore", drop="if_binary"
    )
    oh_enc.fit(df.to_numpy()[:, categorical_idx[:-1]])

    with open(output_dir / f"{ds_name}_one_hot_encoder.pkl", "wb") as f:
        dill.dump(oh_enc, f)

    # header=False is the documented way to omit the header row; it writes the
    # same file as the previous header=None.
    pd.Series(categorical).to_csv(
        output_dir / f"{ds_name}_categorical_cols.csv", header=False
    )

    # Number of distinct codes per column, computed on the encoded frame.
    num_cats = df.nunique()
    num_cats.to_csv(output_dir / f"{ds_name}_num_cats.csv", header=False)