In [1]:
%load_ext autoreload
%autoreload 2
import os
from os.path import join as oj

import numpy as np
import pandas as pd

# Let pandas render up to 100 rows before truncating displayed frames.
pd.set_option('display.max_rows', 100)

In [2]:
def add_citations(m_print):
    """Prettify the 'Name' column and append a LaTeX cite command where one is known.

    Each name is lower-cased, has the '_clean' and '_two_year' substrings removed
    and its underscores turned into spaces. The result is looked up in the
    citation table below; the displayed name is then capitalized with hyphens
    replaced by spaces, followed by the cite command when the lookup succeeded.

    Parameters
    ----------
    m_print : pandas.DataFrame
        Frame with a 'Name' column of strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Name' rewritten.
    """
    # Keys are the normalized dataset names; values are BibTeX keys.
    # Datasets still without a citation: german-credit, recidivism, readmission.
    citations = {'ionosphere': 'sigillito1989classification',
                 'diabetes': 'smith1988using',
                 'juvenile': 'osofsky1997effects',
                 'credit': 'yeh2009comparisons',
                 }

    for i in m_print.index.values:
        name = m_print.at[i, 'Name'].lower().replace('_clean', '').replace('_two_year', '').replace('_', ' ')
        # The lookup uses the name *before* hyphens are replaced.
        pretty_name = name.capitalize().replace('-', ' ')
        if name in citations:
            # Raw string: '\c' is not a valid escape sequence and triggers a
            # SyntaxWarning on recent Python versions when written unescaped.
            pretty_name += r' \cite{' + citations[name] + '}'
        m_print.at[i, 'Name'] = pretty_name
    return m_print

In [3]:
# Build one summary row per dataset file found in data_cleaned/.
metadata = []
columns = ['Name', 'Samples', 'Features', 'Class 0', 'Class 1', 'Majority class %']
# os.listdir returns entries in arbitrary order; sort so runs are reproducible.
for dset_name in sorted(os.listdir('data_cleaned')):
    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints, .DS_Store),
    # which pd.read_csv cannot load.
    if dset_name.startswith('.') or not os.path.isfile(oj('data_cleaned', dset_name)):
        continue
    df = pd.read_csv(oj('data_cleaned', dset_name))
    # The last column is taken as the label, every preceding column as a feature.
    X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
    feature_names = df.columns.values[:-1]

    shape = X.shape
    # np.unique returns the labels sorted, so counts follow ascending label order.
    # NOTE(review): indexing counts [0] and [1] assumes at least two classes — confirm.
    class_counts = np.unique(y, return_counts=True)[1]
    # splitext drops the extension whatever its length (previously a fixed 4 chars).
    metadata.append([os.path.splitext(dset_name)[0].capitalize(), shape[0], shape[1],
                     class_counts[0], class_counts[1],
                     np.round(100 * np.max(class_counts) / np.sum(class_counts), decimals=1)])

# A stable sort keeps datasets with equal sample counts in file-name order.
metadata = pd.DataFrame(metadata, columns=columns).sort_values(by=['Samples'], kind='mergesort')
metadata = add_citations(metadata)

In [4]:
# Render the summary as a markdown table (row index omitted) and emit it as text.
markdown_table = metadata.to_markdown(index=False)
print(markdown_table)

| Name                               |   Samples |   Features |   Class 0 |   Class 1 |   Majority class % |
|:-----------------------------------|----------:|-----------:|----------:|----------:|-------------------:|
| Heart                              |       270 |         15 |       150 |       120 |               55.6 |
| Breast cancer                      |       277 |         17 |       196 |        81 |               70.8 |
| Haberman                           |       306 |          3 |        81 |       225 |               73.5 |
| Credit g                           |      1000 |         60 |       300 |       700 |               70   |
| Csi all                            |      3313 |         36 |      2773 |       540 |               83.7 |
| Csi with meta keys                 |      3313 |         25 |      2773 |       540 |               83.7 |
| Juvenile \cite{osofsky1997effects} |      3640 |        286 |      3153 |       487 |               86.6 |
| Compas           