In [1]:
import pandas as pd
import numpy as np
from glob import glob
import seaborn as sns
import matplotlib
%matplotlib inline

matplotlib.rc('pdf', fonttype=42)

datadir = '../datasets/pmlb/datasets/'
frames = []
for f in glob(datadir+'/*/*.tsv.gz'):
    df = pd.read_csv(f,sep='\t') 
    group = 'feynman' if 'feynman' in f else 'strogatz' if 'strogatz' in f else 'black-box'
    frames.append(dict(
        name=f.split('/')[-1][:-7],
        nsamples = df.shape[0],
        nfeatures = df.shape[1],
        npoints = df.shape[0]*df.shape[1],
        Group=group
    ))
    
df = pd.DataFrame.from_records(frames)

df.loc[:,'friedman_dataset'] = np.where(df['name'].str.contains('_fri_'), "Friedman", "Non-Friedman")

df = df.sort_values(by=['friedman_dataset'], ascending=False)

df

Unnamed: 0,name,nsamples,nfeatures,npoints,Group,friedman_dataset
0,feynman_II_3_24,100000,3,300000,feynman,Non-Friedman
194,spambase,4601,58,266858,black-box,Non-Friedman
276,agaricus_lepiota,8145,23,187335,black-box,Non-Friedman
275,feynman_III_9_52,100000,7,700000,feynman,Non-Friedman
274,230_machine_cpu,209,7,1463,black-box,Non-Friedman
...,...,...,...,...,...,...
61,634_fri_c2_100_10,100,11,1100,black-box,Friedman
62,607_fri_c4_1000_50,1000,51,51000,black-box,Friedman
146,649_fri_c0_500_5,500,6,3000,black-box,Friedman
249,623_fri_c4_1000_10,1000,11,11000,black-box,Friedman


In [None]:
print((df[df['Group']=='black-box'])['name'])
(df[df['Group']=='black-box'])['friedman_dataset'].value_counts()

: 

In [2]:
df[df['Group']=='black-box'].describe()

Unnamed: 0,nsamples,nfeatures,npoints
count,286.0,286.0,286.0
mean,20866.01,28.86014,549312.2
std,123027.2,79.333903,3808498.0
min,32.0,2.0,156.0
25%,250.0,8.0,2453.0
50%,500.0,11.0,10040.0
75%,2000.0,26.0,59477.25
max,1025010.0,1001.0,54950000.0


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", palette='hls')
sns.set_context("paper", font_scale=1.75, rc={"lines.linewidth": 1.5})
sns.despine(left=True, bottom=True)

## PMLB dataset sizes
g = sns.scatterplot(
    data=df[df['Group']=='black-box'],
    x='nsamples',
    y='nfeatures',
    hue='friedman_dataset',
    alpha=0.9,
    linewidth=1.,
    edgecolor='k',
    s=100, 
)
ax = plt.gca()
plt.legend(loc='upper left')
ax.set_xscale('log')
ax.set_yscale('log')
plt.xlabel('No. of Samples')
plt.ylabel('No. of Features')
plt.tight_layout()
plt.savefig('figs/pmlb_size.pdf',dpi=400, bbox_inches='tight')

: 

: 