In [1]:
import logging

import numpy as np
import pandas as pd

import common_datasets.binary_classification as binclas
import common_datasets.regression as regr

# public names picked up by a star-import of this code
__all__ = ['binclas_datasets', 'regr_datasets']

# root logger: INFO level, timestamped records with a padded level name
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# The classification and the regression selections follow the same recipe:
# keep the filtered loaders, count the usable grid features, drop datasets
# without any, rank by size and keep the smallest ones. The shared steps live
# in the private helpers below so both tasks are guaranteed to be treated alike.

# number of datasets kept (per task) after ranking by size
N_SELECTED_DATASETS = 20

# binary classification datasets which are dropped because they are separable
SEPARABLE_BINCLAS_DATASETS = ('iris0', 'dermatology_6')


def _count_grid_features(row):
    """Count the features of one dataset that are usable grid features.

    A feature is counted when its entry in ``row['grid']`` is truthy and its
    entry in ``row['n_feature_uniques']`` is greater than 2.

    Args:
        row: a summary record with the list-like fields ``grid`` and
            ``n_feature_uniques`` of equal length (presumably one entry
            per feature - confirm against ``common_datasets``).

    Returns:
        The number of features satisfying both conditions.
    """
    is_grid = np.array(row['grid'])
    has_many_values = np.array(row['n_feature_uniques']) > 2
    return np.sum(is_grid & has_many_values)


def _summarize_grid_features(summary, data_loaders):
    """Restrict a summary to the given loaders and add the ``n_grid`` column.

    Args:
        summary (pd.DataFrame): dataset summary with the columns
            ``data_loader_function``, ``grid`` and ``n_feature_uniques``.
        data_loaders: the loader functions to keep.

    Returns:
        pd.DataFrame: an independent copy of the matching rows, extended with
        ``n_grid`` (see ``_count_grid_features``).
    """
    kept = summary[summary['data_loader_function'].isin(data_loaders)].copy()
    kept['n_grid'] = kept[['grid', 'n_feature_uniques']]\
        .apply(_count_grid_features, axis=1)
    return kept


def _smallest_grid_datasets(summary,
                            columns,
                            excluded_names=(),
                            n_datasets=N_SELECTED_DATASETS):
    """Pick the smallest datasets having at least one usable grid feature.

    Args:
        summary (pd.DataFrame): output of ``_summarize_grid_features``; must
            also contain the columns ``name``, ``n`` and ``n_col``.
        columns (list): the columns of the returned frame, in order.
        excluded_names (iterable): dataset names to leave out.
        n_datasets (int): maximum number of datasets to return.

    Returns:
        pd.DataFrame: at most ``n_datasets`` rows sorted increasingly by
        ``n * n_col``, restricted to ``columns``, with a fresh 0..k-1 index.
    """
    keep = (summary['n_grid'] > 0) & ~summary['name'].isin(list(excluded_names))
    selected = summary[keep].copy()

    # n * n_col serves as the size measure used for the ranking
    selected['total'] = selected['n'] * selected['n_col']

    selected = selected.sort_values('total').iloc[:n_datasets, :]
    return selected[columns].reset_index(drop=True)


# determining the binary classification datasets containing grid features

logging.info("querying the filtered classification datasets")

datasets = binclas.get_filtered_data_loaders(n_bounds=(1, 1000),
                                             n_minority_bounds=(5, 1000),
                                             n_from_phenotypes=1)

logging.info("ranking the datasets")

summary = _summarize_grid_features(binclas.get_summary_pdf(), datasets)

# dropping the datasets which are separable
binclas_datasets = _smallest_grid_datasets(
    summary,
    columns=['name',
             'citation_key',
             'n_col',
             'n',
             'n_minority',
             'n_grid',
             'data_loader_function'],
    excluded_names=SEPARABLE_BINCLAS_DATASETS)

logging.info("binary classification datasets prepared")

# determining the regression datasets containing grid features

logging.info("querying the filtered regression datasets")

datasets = regr.get_filtered_data_loaders(n_bounds=(1, 2000),
                                          n_from_phenotypes=1)

logging.info("ranking the datasets")

summary = _summarize_grid_features(regr.get_summary_pdf(), datasets)

regr_datasets = _smallest_grid_datasets(
    summary,
    columns=['name',
             'citation_key',
             'n_col',
             'n',
             'n_grid',
             'grid',
             'data_loader_function'])

logging.info("regression datasets prepared")


2023-11-19 09:44:17 INFO     querying the filtered classification datasets
2023-11-19 09:44:17 INFO     ranking the datasets
2023-11-19 09:44:17 INFO     binary classification datasets prepared
2023-11-19 09:44:17 INFO     querying the filtered regression datasets
2023-11-19 09:44:17 INFO     ranking the datasets
2023-11-19 09:44:17 INFO     regression datasets prepared


In [2]:
# display the selected binary classification datasets (sorted increasingly by n * n_col)
binclas_datasets

Unnamed: 0,name,citation_key,n_col,n,n_minority,n_grid,data_loader_function
0,appendicitis,keel,7,106,21,7,<function load_appendicitis at 0x7f32c12ae200>
1,haberman,keel,3,306,81,3,<function load_haberman at 0x7f32c12add80>
2,new_thyroid1,keel,5,215,35,5,<function load_new_thyroid1 at 0x7f32c12adea0>
3,glass0,keel,9,214,70,9,<function load_glass0 at 0x7f32c1287880>
4,shuttle-6_vs_2-3,keel,9,230,10,9,<function load_shuttle_6_vs_2_3 at 0x7f32c12ad...
5,bupa,keel,6,345,145,6,<function load_bupa at 0x7f32c12ae560>
6,cleveland-0_vs_4,keel,13,177,13,10,<function load_cleveland_0_vs_4 at 0x7f32c12ad...
7,ecoli1,keel,7,336,77,5,<function load_ecoli1 at 0x7f32c1287130>
8,poker-9_vs_7,keel,10,244,8,10,<function load_poker_9_vs_7 at 0x7f32c12ad120>
9,monk-2,keel,6,432,204,4,<function load_monk_2 at 0x7f32c12ae3b0>


In [3]:
# display the selected regression datasets (sorted increasingly by n * n_col)
regr_datasets

Unnamed: 0,name,citation_key,n_col,n,n_grid,grid,data_loader_function
0,diabetes,keel,2,43,2,"[True, True]",<function load_diabetes at 0x7f32c12aff40>
1,o-ring,uci,6,23,4,"[False, True, False, True, True, True]",<function load_o_ring at 0x7f32c10dca60>
2,stock_portfolio_performance,uci,6,63,6,"[True, True, True, True, True, True]",<function load_stock_portfolio_performance at ...
3,wsn-ale,uci,5,107,4,"[False, True, True, True, True]",<function load_wsn_ale at 0x7f32c10dcb80>
4,daily-demand,uci,12,60,7,"[False, False, False, False, False, True, True...",<function load_daily_demand at 0x7f32c10dcaf0>
5,slump_test,krnn,9,103,9,"[True, True, True, True, True, True, True, Tru...",<function load_slump_test at 0x7f32c10dc790>
6,servo,uci,10,167,2,"[True, True, False, False, False, False, False...",<function load_servo at 0x7f32c10dcc10>
7,yacht_hydrodynamics,krnn,6,307,6,"[True, True, True, True, True, True]",<function load_yacht_hydrodynamics at 0x7f32c1...
8,autoMPG6,keel,5,392,5,"[True, True, True, True, True]",<function load_autoMPG6 at 0x7f32c10dc0d0>
9,excitation_current,uci,4,557,4,"[True, True, True, True]",<function load_excitation_current at 0x7f32c10...
