In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
from glob import glob

In [2]:
# Gather every .csv and .txt file found anywhere below the raw-data folder,
# ordered by path in descending order.
datasets = sorted(
    (
        found
        for pattern in ("*.csv", "*.txt")
        for folder, _subdirs, _files in os.walk("../original datasets/")
        for found in glob(os.path.join(folder, pattern))
    ),
    reverse=True,
)


In [10]:
# Skip every path that mentions "preprocessed", then switch the remaining
# paths to forward slashes so they compare equal on every platform.
datasets = [
    raw_path.replace("\\", "/")
    for raw_path in filter(lambda candidate: "preprocessed" not in candidate, datasets)
]

In [11]:
# Columns holding "YYYY-MM-DD" strings.
_COMPAS_DATE_COLUMNS = (
    "dob",
    "compas_screening_date",
    "c_offense_date",
    "c_arrest_date",
    "screening_date",
    "v_screening_date",
    "in_custody",
    "out_custody",
    "r_offense_date",
    "r_jail_in",
    "r_jail_out",
)

# Columns whose strings carry extra characters after the 10-character date part.
_COMPAS_TIMESTAMP_COLUMNS = ("c_jail_in", "c_jail_out")

# Identifiers and free-text columns removed from the output.
_COMPAS_DROPPED_COLUMNS = [
    "name", "first", "last", "c_case_number", "r_case_number", "vr_case_number",
    "vr_offense_date", "vr_charge_desc", "r_charge_desc", "c_charge_desc",
]


def _compas_date_to_int(value, length=None):
    """Turn a "YYYY-MM-DD..." string into the integer YYYYMMDD.

    Missing values are returned unchanged. When ``length`` is given, only the
    first ``length`` characters of ``value`` are considered.
    """
    # `str(value) == "nan"` keeps the original notion of "missing"; `pd.isna`
    # additionally covers None / pd.NA.
    if pd.isna(value) or str(value) == "nan":
        return value
    text = value if length is None else value[:length]
    return int(text.replace("-", ""))


def compas_preproc(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the COMPAS frame: encode date columns as integers and drop text columns.

    Every date-like column is converted from a dash-separated string to the
    integer obtained by removing the dashes; missing entries are left as they
    are in all of those columns (including ``dob`` and
    ``compas_screening_date``). Identifier / description columns are dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; it is deep-copied, so the caller's object is not modified.

    Returns
    -------
    pd.DataFrame
        The transformed copy.

    Raises
    ------
    KeyError
        If any of the expected columns is absent from ``df``.
    """
    df = df.copy(deep=True)

    for column in _COMPAS_DATE_COLUMNS:
        df[column] = df[column].apply(_compas_date_to_int)

    for column in _COMPAS_TIMESTAMP_COLUMNS:
        # Keep only the leading 10 characters (the date part) before encoding.
        df[column] = df[column].apply(_compas_date_to_int, args=(10,))

    df = df.drop(columns=_COMPAS_DROPPED_COLUMNS)

    return df

In [12]:
# Per-dataset configuration, keyed by the raw file path.
# Each value is a tuple (y_col, is_tsv, prepr_f) consumed by the preprocessing loop:
#   y_col   - where the target comes from: a column label (an integer for the
#             header-less synthetic files, a column name for the real ones),
#             a path ending in ".pa.txt" to a separate label file, or None
#             when no target file should be written.
#   is_tsv  - True when the file is whitespace-separated instead of comma-separated.
#   prepr_f - optional function applied to the loaded DataFrame, or None.
has_y_tsv_custom_prep = {  # synthetic files are read without a header row
    '../original datasets/synthetic/zelnik6.csv': (2, False, None),
    '../original datasets/synthetic/zelnik5.csv': (2, False, None),
    '../original datasets/synthetic/triangle2.csv': (2, False, None),
    '../original datasets/synthetic/triangle1.csv': (2, False, None),
    '../original datasets/synthetic/tetra.csv': (3, False, None),
    '../original datasets/synthetic/s-set2.csv': (2, False, None),
    '../original datasets/synthetic/s-set1.csv': (2, False, None),
    '../original datasets/synthetic/longsquare.csv': (2, False, None),
    '../original datasets/synthetic/dim512.txt': ('../original datasets/synthetic/dim512.pa.txt', True, None),
    '../original datasets/synthetic/dim256.txt': ('../original datasets/synthetic/dim256.pa.txt', True, None),
    '../original datasets/synthetic/dim128.txt': ('../original datasets/synthetic/dim128.pa.txt', True, None),
    '../original datasets/synthetic/dim1024.txt': ('../original datasets/synthetic/dim1024.pa.txt', True, None),
    '../original datasets/synthetic/dim064.txt': ('../original datasets/synthetic/dim064.pa.txt', True, None),
    '../original datasets/synthetic/dim032.txt': ('../original datasets/synthetic/dim032.pa.txt', True, None),
    '../original datasets/synthetic/cure-t2-4k.csv': (2, False, None),
    '../original datasets/synthetic/cure-t1-2000n-2D.csv': (2, False, None),
    '../original datasets/synthetic/cure-t0-2000n-2D.csv': (2, False, None),
    '../original datasets/synthetic/aggregation.csv': (2, False, None),
    '../original datasets/synthetic/2d-d31.csv': (2, False, None),
    '../original datasets/synthetic/2d-4c.csv': (2, False, None),
    '../original datasets/synthetic/2d-4c-no9.csv': (2, False, None),
    '../original datasets/synthetic/2d-4c-no4.txt': (2, False, None),
    '../original datasets/synthetic/2d-3c-no123.csv': (2, False, None),
    '../original datasets/synthetic/2d-20c-no0.csv': (2, False, None),
    '../original datasets/synthetic/2d-10c.csv': (2, False, None),

    # Real datasets: read with a header row, so y_col is a column name.
    '../original datasets/real/wine.csv': (None, False, None),
    '../original datasets/real/wdbc.csv': ("diagnosis", False, None),
    '../original datasets/real/vehicle.csv': ("CLASS", False, None),
    '../original datasets/real/titanic.csv': ("Survived", False, None),
    '../original datasets/real/iris.csv': ("class", False, None),
    '../original datasets/real/ionosphere.csv': ("class", False, None),
    '../original datasets/real/home.csv': ("in_sf", False, None),
    '../original datasets/real/german_credit.csv': ("default", False, None),
    '../original datasets/real/fico.csv': ("RiskPerformance", False, None),
    '../original datasets/real/diabetes.csv': ("Outcome", False, None),
    '../original datasets/real/compas-scores-two-years.csv': ("two_year_recid", False, compas_preproc),
    '../original datasets/real/churn.csv': ("churn", False, None),
    '../original datasets/real/bank.csv': ("give_credit", False, None),
    '../original datasets/real/avila.csv': ("class", False, None),
    # NOTE(review): the leading space in " class" presumably mirrors the CSV header - confirm.
    '../original datasets/real/adult.csv': (" class", False, None),
}

In [13]:
# Report configured paths that were not discovered on disk.
for configured_path in has_y_tsv_custom_prep:
    if configured_path in datasets:
        continue
    print(f"Errore {configured_path}")

# Report discovered files that have no configuration entry; paths containing
# ".pa" are exempt from this check.
for found_path in datasets:
    if found_path in has_y_tsv_custom_prep or ".pa" in found_path:
        continue
    print(f"Errore2 {found_path}")

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Convert each configured raw dataset into `<kind>_preprocessed/<name>.csv`
# (features) and, when a target is configured, `<name>.y.csv` (encoded labels).
for dataset_path, (y_col, is_tsv, prepr_f) in tqdm(has_y_tsv_custom_prep.items()):
    # Bare file name without its extension, e.g. ".../iris.csv" -> "iris".
    dataset_name = os.path.splitext(os.path.basename(dataset_path))[0]
    # Test the folder component so a file name containing "real" cannot flip this.
    is_real = "/real/" in dataset_path

    # Real datasets are read with a header row; synthetic ones without.
    header = "infer" if is_real else None

    if is_tsv:
        # Raw string: "\s" inside a plain literal is an invalid escape sequence.
        df = pd.read_csv(dataset_path, header=header, sep=r"\s+")
    else:
        df = pd.read_csv(dataset_path, header=header)

    # A y_col ending in ".pa.txt" is the path of a separate label file whose
    # last column is used as the target.
    if isinstance(y_col, str) and y_col.endswith(".pa.txt"):
        # NOTE(review): labels are attached by row position, so this assumes the
        # label file has exactly one line per data row and no preamble - confirm.
        df_pa = pd.read_csv(y_col, header=header)
        df["target_class_pa"] = df_pa[df_pa.columns[-1]]
        y_col = "target_class_pa"

    if prepr_f is not None:
        df = prepr_f(df)

    out_dir = f"{'real' if is_real else 'synthetic'}_preprocessed"
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)

    if y_col is not None:
        # Encode the target as integer class ids and store it separately from the features.
        target_class_df = pd.DataFrame(LabelEncoder().fit_transform(df[y_col]))
        df = df.drop(columns=y_col)
        target_class_df.to_csv(f"{out_dir}/{dataset_name}.y.csv", index=False)

    df.to_csv(f"{out_dir}/{dataset_name}.csv", index=False)

  0%|          | 0/40 [00:00<?, ?it/s]