In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the PMLB table holding summary statistics for each dataset.
summary_stats_url = 'https://raw.githubusercontent.com/EpistasisLab/pmlb/master/pmlb/all_summary_stats.tsv'
# The file is tab-separated, so the separator has to be passed explicitly.
df = pd.read_csv(summary_stats_url, sep='\t')

In [3]:
# Preview the first rows of the loaded summary table.
df.head()

Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task
0,1027_ESL,488,4,0,4,0,continuous,9.0,0.099363,regression
1,1028_SWD,1000,10,1,9,0,continuous,4.0,0.108291,regression
2,1029_LEV,1000,4,0,4,0,continuous,5.0,0.111245,regression
3,1030_ERA,1000,4,0,0,4,continuous,9.0,0.031251,regression
4,1089_USCrime,47,13,1,0,12,continuous,42.0,0.00297,regression


In [4]:
# Size measure for each dataset: number of instances times number of features.
df['dimensionality'] = df['n_instances'].mul(df['n_features'])

### Obter apenas os datasets de classificação

In [5]:
# Keep only the rows whose task is classification (the table also lists
# regression datasets).
is_classification = df['task'].eq('classification')
classification_df = df[is_classification]

In [6]:
# (rows, columns) of the classification-only subset.
classification_df.shape

(162, 11)

In [7]:
# Show the classification datasets with fewer than 50000 cells. The
# 'dimensionality' column already holds n_instances * n_features, so it is
# used directly as the filter key.
classification_df[classification_df['dimensionality'] < 50000]

Unnamed: 0,dataset,n_instances,n_features,n_binary_features,n_categorical_features,n_continuous_features,endpoint_type,n_classes,imbalance,task,dimensionality
121,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,1600,20,0,20,0,categorical,2.0,0.000000,classification,32000
122,GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1,1600,20,2,18,0,categorical,2.0,0.000000,classification,32000
123,GAMETES_Epistasis_3_Way_20atts_0.2H_EDM_1_1,1600,20,0,20,0,categorical,2.0,0.000000,classification,32000
124,GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...,1600,20,1,19,0,categorical,2.0,0.000000,classification,32000
125,GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...,1600,20,1,19,0,categorical,2.0,0.000000,classification,32000
...,...,...,...,...,...,...,...,...,...,...,...
411,wdbc,569,30,0,0,30,categorical,2.0,0.064940,classification,17070
412,wine_quality_red,1599,11,0,0,11,categorical,6.0,0.228804,classification,17589
414,wine_recognition,178,13,0,0,13,categorical,3.0,0.012530,classification,2314
415,xd6,973,9,9,0,0,categorical,2.0,0.114332,classification,8757


### Remover colunas (a princípio) inúteis

In [8]:
# Columns that the grouping and sampling steps below do not use.
unused_columns = ['n_binary_features', 'endpoint_type']
classification_df = classification_df.drop(unused_columns, axis='columns')

In [9]:
# Preview the classification subset after the two columns were dropped.
classification_df.head()

Unnamed: 0,dataset,n_instances,n_features,n_categorical_features,n_continuous_features,n_classes,imbalance,task,dimensionality
120,GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_ED...,1600,1000,969,0,2.0,0.0,classification,1600000
121,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,1600,20,20,0,2.0,0.0,classification,32000
122,GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1,1600,20,18,0,2.0,0.0,classification,32000
123,GAMETES_Epistasis_3_Way_20atts_0.2H_EDM_1_1,1600,20,20,0,2.0,0.0,classification,32000
124,GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...,1600,20,19,0,2.0,0.0,classification,32000


### Criar grupos baseados na dimensionalidade dos dados

In [10]:
# Bin edges, expressed in n_instances * n_features, that are handed to pd.cut
# further down. Consecutive edges bound one group, so 8 edges give the 7
# groups named in `labels`.
bins = [
    500,     # lower edge of A
    1000,    # A | B
    5000,    # B | C
    10000,   # C | D
    25000,   # D | E
    50000,   # E | F
    100000,  # F | G
    200000,  # upper edge of G
]
# One single-letter label per bin, smallest datasets first.
labels = list('ABCDEFG')

In [11]:
# Bucket every dataset by its 'dimensionality' using the edges in `bins`.
# Values that fall outside the edges receive NaN instead of a label.
size_group = pd.cut(classification_df['dimensionality'], bins, labels=labels)
classification_df['group'] = size_group

In [12]:
# Discard the datasets that did not fall into any bin (their 'group' is NaN).
# Reassign instead of using inplace=True so the cell does not silently mutate
# a frame that earlier cells have already displayed.
classification_df = classification_df.dropna(subset=['group'])

In [13]:
# Number of datasets sampled from each group. Hard-coded; presumably taken from
# classification_df['group'].value_counts().min(), i.e. the size of the
# smallest group — TODO confirm it still matches the data.
sample_threshold = 6

In [14]:
def get_rows_from_group(dataframe: pd.DataFrame, label: str):
    """Return the rows of ``dataframe`` whose 'group' column equals ``label``.

    Args:
        dataframe: Frame that contains a 'group' column.
        label: Group value to select.

    Returns:
        The matching rows, keeping their original index. The result is
        empty when no row carries ``label``.
    """
    in_group = dataframe['group'].eq(label)
    return dataframe.loc[in_group]

In [15]:
# Map each group label to the sub-frame holding that group's datasets.
# Nothing is sampled yet; the balancing happens in the next step.
balanced_dfs_from_group = dict(
    (group_label, get_rows_from_group(classification_df, group_label))
    for group_label in labels
)

In [16]:
# Draw the same number of rows from every group so the groups end up equally
# represented. Despite the name, the values are sampled DataFrames, not
# index objects.
# A fixed seed makes the draw repeatable across kernel restarts; without it
# the selected datasets (and the CSV written from them) change on every run.
# Note: sampling is without replacement, so pandas raises ValueError if a
# group holds fewer than `sample_threshold` rows.
sample_seed = 42
balanced_indexes = {
    key: balanced_dfs_from_group[key].sample(n=sample_threshold, random_state=sample_seed)
    for key in labels
}

In [17]:
# Stack the per-group samples vertically, in the dictionary's order.
sampled_frames = list(balanced_indexes.values())
final_df = pd.concat(sampled_frames)

In [18]:
# Display the combined frame of sampled datasets.
final_df

Unnamed: 0,dataset,n_instances,n_features,n_categorical_features,n_continuous_features,n_classes,imbalance,task,dimensionality,group
316,haberman,306,3,0,3,2.0,0.221453,classification,918,A
365,postoperative_patient_data,88,8,6,0,2.0,0.206612,classification,704,A
399,tae,151,5,0,3,3.0,0.000307,classification,755,A
327,iris,150,4,0,4,3.0,0.0,classification,600,A
353,mux6,128,6,0,0,2.0,0.0,classification,768,A
174,cloud,108,7,0,6,4.0,0.00503,classification,756,A
361,penguins,344,7,2,4,3.0,0.046376,classification,2408,B
148,analcatdata_lawsuit,264,4,0,3,2.0,0.73284,classification,1056,B
366,prnn_crabs,200,7,0,6,2.0,0.0,classification,1400,B
367,prnn_fglass,205,9,0,9,5.0,0.106187,classification,1845,B


In [19]:
# Persist the sampled selection in the working directory. The index (the row
# labels carried over from the summary table, never reset above) is written
# as the first, unnamed column.
final_df.to_csv('datasets_stats.csv', index=True)