In [3]:
import pickle
import pandas as pd
import glob, os

from pandas.api.types import is_string_dtype

In [35]:
def get_datasets_path(datasets_path, find_by):
    datasets_path_dict = {}
    for root, dirs, files in os.walk(datasets_path):
        for file in files:
            if file.endswith(find_by):
                path = os.path.join(root, file)
                uuid = path.split("/")[-1].split(".")[0]
                datasets_path_dict[uuid] = path
    return datasets_path_dict

def read_dataset(path):
    with open(path, 'rb') as picklefile:
        dataset = pickle.load(picklefile)
    return dataset

def get_meta_information(dataset_path):
    dataset_id = int(dataset_path.split("/")[-1].split("_")[-1].split(".")[0])
    print(f"Metadata for: dataset_{dataset_id} ... ")
    df, df_class, check, features = read_dataset(dataset_path)

    number_of_features = df.shape[1]
    number_of_exemples = df.shape[0]
    number_of_class = len(df_class.unique())
    
    minority = round(100*df_class.value_counts(sort=True, ascending=True).iloc[0]/number_of_exemples, 2)
    majority = round(100*df_class.value_counts(sort=True, ascending=False).iloc[0]/number_of_exemples, 2)

    # check = [is_string_dtype(df.iloc[i]) or isinstance(df.dtypes[i], pd.CategoricalDtype) for i in range(number_of_features)]
    categorical_features = sum(check)
    numerical_features = categorical_features - number_of_features
    
    print("\r [Done]")

    return {
        "OpenML ID": dataset_id,
        "Number of Examples": number_of_exemples,
        "Number of Features": number_of_features,
        "Number of Categorical Features": categorical_features,
        "Number of class": number_of_class,
        "Majority Class %": majority,
        "Minority Class %": minority
    }

In [5]:
datasets_parh = get_datasets_path("../../../datasets/", ".pkl")
get_meta_information(datasets_parh["dataset_24"])

Metadata for:  ../../../datasets/training/dataset_24.pkl  ...
 [Done]


{'Number of Examples': 8124,
 'Number of Features': 22,
 'Number of Categorical Features': 22,
 'Number of class': 2,
 'Majority Class %': 51.8,
 'Minority Class %': 48.2}

In [6]:
df = pd.read_csv("../pipeline_experiment_result.csv")
df

  df = pd.read_csv("../pipeline_experiment_result.csv")


Unnamed: 0,seed_i,config_id,fold,config_hash,duration,start_time,end_time,status,seed,budget,...,classifier:sgd:power_t,feature_preprocessor:nystroem_sampler:coef0,feature_preprocessor:nystroem_sampler:degree,classifier:multinomial_nb:alpha,classifier:multinomial_nb:fit_prior,classifier:sgd:l1_ratio,dataset,data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__,data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__,data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction
0,0,0,1,34a76cf2da6c41f8646867818eea56d6,1.913694,1.680580e+09,1.680580e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_40985,,,
1,0,0,2,34a76cf2da6c41f8646867818eea56d6,1.911491,1.680580e+09,1.680580e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_40985,,,
2,0,0,3,34a76cf2da6c41f8646867818eea56d6,2.011853,1.680580e+09,1.680580e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_40985,,,
3,0,0,4,34a76cf2da6c41f8646867818eea56d6,2.025221,1.680580e+09,1.680580e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_40985,,,
4,0,0,5,34a76cf2da6c41f8646867818eea56d6,2.001899,1.680580e+09,1.680580e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_40985,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037065,0,499,6,19a650aed428ed8e5fe1619c61bc7118,6.627506,1.680869e+09,1.680869e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_1161,,,
1037066,0,499,7,19a650aed428ed8e5fe1619c61bc7118,6.571584,1.680869e+09,1.680869e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_1161,,,
1037067,0,499,8,19a650aed428ed8e5fe1619c61bc7118,6.475392,1.680869e+09,1.680869e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_1161,,,
1037068,0,499,9,19a650aed428ed8e5fe1619c61bc7118,6.478533,1.680869e+09,1.680869e+09,StatusType.SUCCESS,0,0.0,...,,,,,,,dataset_1161,,,


In [36]:
datasets = df["dataset"].unique()
meta_information = [get_meta_information(datasets_parh[i]) for i in datasets]

Metadata for: dataset_40985 ... 
 [Done]
Metadata for: dataset_1501 ... 
 [Done]
Metadata for: dataset_1479 ... 
 [Done]
Metadata for: dataset_1530 ... 
 [Done]
Metadata for: dataset_40680 ... 
 [Done]
Metadata for: dataset_1480 ... 
 [Done]
Metadata for: dataset_151 ... 
 [Done]
Metadata for: dataset_1528 ... 
 [Done]
Metadata for: dataset_311 ... 
 [Done]
Metadata for: dataset_949 ... 
 [Done]
Metadata for: dataset_934 ... 
 [Done]
Metadata for: dataset_40691 ... 
 [Done]
Metadata for: dataset_1532 ... 
 [Done]
Metadata for: dataset_1146 ... 
 [Done]
Metadata for: dataset_742 ... 
 [Done]
Metadata for: dataset_886 ... 
 [Done]
Metadata for: dataset_728 ... 
 [Done]
Metadata for: dataset_737 ... 
 [Done]
Metadata for: dataset_1553 ... 
 [Done]
Metadata for: dataset_1053 ... 
 [Done]
Metadata for: dataset_40704 ... 
 [Done]
Metadata for: dataset_1116 ... 
 [Done]
Metadata for: dataset_37 ... 
 [Done]
Metadata for: dataset_837 ... 
 [Done]
Metadata for: dataset_40705 ... 
 [Done]
Metada

In [39]:
meta_information_df = pd.DataFrame(meta_information).sort_values(by="OpenML ID", ascending=True)
meta_information_df

Unnamed: 0,OpenML ID,Number of Examples,Number of Features,Number of Categorical Features,Number of class,Majority Class %,Minority Class %
132,2,898,38,32,5,76.17,0.00
25,6,20000,16,0,26,4.07,3.67
44,11,625,4,0,3,46.08,7.84
147,15,699,9,0,2,65.52,34.48
169,23,1473,9,7,3,42.70,22.61
...,...,...,...,...,...,...,...
206,41991,270912,784,0,49,2.58,0.17
188,42193,5278,13,6,2,52.96,47.04
159,42206,595212,37,25,2,96.36,3.64
168,42343,82318,477,136,2,88.23,11.77


In [40]:
meta_information_df.to_csv("datasets_meta_data.csv")