In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import deepcopy
# Set matplotlib's default resolution to 250 dpi for every figure created afterwards.
mpl.rcParams.update({'figure.dpi': 250})

# Change working directory to the project root (two levels above the
# `notebooks` directory) so the `experiments.*` imports below resolve.
# The directory name is taken with os.path.basename instead of splitting on
# '/', because a hard-coded '/' never matches on platforms whose path
# separator is a backslash, which would silently skip the chdir.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.path.join(os.pardir, os.pardir))
from experiments.notebooks import viz
from experiments.data_util import get_clean_dataset
from experiments.config.datasets import DATASETS

/Volumes/GoogleDrive/My Drive/research/rules/imodels/experiments/notebooks


# Dataset statistics

In [4]:
# Build one summary row per dataset listed in DATASETS: its display name,
# the first two dimensions of X, the counts of the two lowest-sorted labels
# in y, and the share (in %) of the most frequent label.
columns = ['Name', 'Samples', 'Features', 'Class 0', 'Class 1', 'Majority class %']
rows = []
for dset_name, dset_file in DATASETS:
    X, y, feat_names = get_clean_dataset(dset_file)
    n_samples, n_features = X.shape[0], X.shape[1]

    # np.unique returns the distinct labels in sorted order, so label_counts[0]
    # and label_counts[1] are the counts of the smallest and second-smallest label.
    _, label_counts = np.unique(y, return_counts=True)
    majority_pct = np.round(100 * np.max(label_counts) / np.sum(label_counts), decimals=1)

    rows.append([
        dset_name.capitalize(),
        n_samples,
        n_features,
        label_counts[0],
        label_counts[1],
        majority_pct,
    ])

# Collect the rows into a DataFrame; the bare name on the last line displays it.
metadata = pd.DataFrame(rows, columns=columns)
metadata

Unnamed: 0,Name,Samples,Features,Class 0,Class 1,Majority class %
0,Recidivism,6172,20,3182,2990,51.6
1,Credit,30000,33,23364,6636,77.9
2,Juvenile,3640,286,3153,487,86.6
3,Readmission,101763,150,54861,46902,53.9
4,Breast-cancer,277,17,196,81,70.8
5,Credit-g,1000,60,300,700,70.0
6,Haberman,306,3,81,225,73.5
7,Heart,270,15,150,120,55.6


In [5]:
# Render the dataset summary as a LaTeX tabular (row index omitted) and print
# the source text.
latex_table = metadata.to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
         Name &  Samples &  Features &  Class 0 &  Class 1 &  Majority class \% \\
\midrule
   Recidivism &     6172 &        20 &     3182 &     2990 &              51.6 \\
       Credit &    30000 &        33 &    23364 &     6636 &              77.9 \\
     Juvenile &     3640 &       286 &     3153 &      487 &              86.6 \\
  Readmission &   101763 &       150 &    54861 &    46902 &              53.9 \\
Breast-cancer &      277 &        17 &      196 &       81 &              70.8 \\
     Credit-g &     1000 &        60 &      300 &      700 &              70.0 \\
     Haberman &      306 &         3 &       81 &      225 &              73.5 \\
        Heart &      270 &        15 &      150 &      120 &              55.6 \\
\bottomrule
\end{tabular}

