In [23]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import deepcopy
mpl.rcParams['figure.dpi'] = 250

# change working directory to project root
if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('../..')
from experiments.notebooks import viz
from experiments.data_util import get_clean_dataset
from experiments.config.datasets import DATASETS_CLASSIFICATION, DATASETS_REGRESSION
from pmlb import fetch_data, classification_dataset_names
pd.options.display.max_rows = 100

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# classification dataset stats

In [25]:
# Build one summary row per classification dataset: sample/feature counts,
# the counts of the first two label values returned by np.unique, and the
# share (in %) of the most frequent class.
columns = ['Name', 'Samples', 'Features', 'Class 0', 'Class 1', 'Majority class %']
rows = []
for dset_name, dset_file, data_source in DATASETS_CLASSIFICATION:
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
    # np.unique returns (sorted label values, counts); only the counts are needed.
    _, class_counts = np.unique(y, return_counts=True)
    majority_pct = np.round(100 * class_counts.max() / class_counts.sum(), decimals=1)
    rows.append([
        dset_name.capitalize(),
        X.shape[0],
        X.shape[1],
        class_counts[0],
        class_counts[1],
        majority_pct,
    ])

metadata = pd.DataFrame(rows, columns=columns)
# To export the table for a paper, uncomment:
# print(metadata.to_latex(index=False))
metadata

Unnamed: 0,Name,Samples,Features,Class 0,Class 1,Majority class %
0,Recidivism,6172,20,3182,2990,51.6
1,Credit,30000,33,23364,6636,77.9
2,Juvenile,3640,286,3153,487,86.6
3,Readmission,101763,150,54861,46902,53.9
4,Ionosphere,351,34,126,225,64.1
5,Breast-cancer,277,17,196,81,70.8
6,Credit-g,1000,60,300,700,70.0
7,Haberman,306,3,81,225,73.5
8,Heart,270,15,150,120,55.6


# regression dataset stats


In [22]:
# Build one summary row per regression dataset: sample/feature counts and
# the mean, standard deviation, minimum and maximum of the target y.
# NOTE(review): the saved output of this cell shows identical values for
# Diabetes and California-housing — verify the California-housing entry in
# DATASETS_REGRESSION (or its handling in get_clean_dataset) before relying on it.
columns = ['Name', 'Samples', 'Features', 'Mean', 'Std', 'Min', "Max"]
rows = []
for dset_name, dset_file, data_source in DATASETS_REGRESSION:
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
    target_stats = [np.mean(y), np.std(y), np.min(y), np.max(y)]
    rows.append([dset_name.capitalize(), X.shape[0], X.shape[1], *target_stats])

# Round all numeric columns to two decimals for display.
metadata = pd.DataFrame(rows, columns=columns).round(2)
# To export the table for a paper, uncomment:
# print(metadata.to_latex(index=False))
metadata

Unnamed: 0,Name,Samples,Features,Mean,Std,Min,Max
0,Diabetes,442,10,152.13,77.01,25.0,346.0
1,California-housing,442,10,152.13,77.01,25.0,346.0
2,Breast-tumor,116640,9,24.69,10.35,-8.53,62.01
3,Echo-months,17496,9,21.99,15.79,-4.4,74.56
4,Satellite-image,6435,36,3.67,2.21,1.0,7.0


In [2]:
# Load the PMLB metadata table, largest datasets first (by observations, then features).
# NOTE(review): this path is relative to the current working directory, which the
# setup cell may change — confirm it resolves correctly after a fresh Restart & Run All.
pmlb_meta = (
    pd.read_csv('../data/pmlb_data/pmlb_metadata.csv')
    .sort_values(by=['n_observations', 'n_features'], ascending=False)
)

# Show the first 100 regression datasets whose name does not contain 'feynman'.
is_regression = pmlb_meta.Task == 'regression'
is_feynman = pmlb_meta.Dataset.str.contains('feynman')
pmlb_meta[is_regression & ~is_feynman].head(100)

In [19]:
# Sanity check: fit a C4.5 tree on the PMLB ionosphere dataset and print the fitted model.
from imodels import C45TreeClassifier

X, y, feature_names = get_clean_dataset('ionosphere', data_source='pmlb')

m = C45TreeClassifier(max_rules=100)
m.fit(X, y)

# Mean squared difference between predictions and labels, measured on the same
# data the model was fitted on (i.e. training error, not a held-out estimate).
residuals = m.predict(X) - y
print('mse', np.mean(np.square(residuals)))
print(m)

mse 0.0
<?xml version="1.0" ?>
<GreedyTree>
	<X_4 feature="0.04198" flag="l" p="0.191">0</X_4>
	<X_4 feature="0.04198" flag="r" p="0.809">
		<X_0 feature="1.0" flag="l" p="0.067">0</X_0>
		<X_0 feature="1.0" flag="r" p="0.933">
			<X_2 feature="0.19466" flag="l" p="0.042">0</X_2>
			<X_2 feature="0.19466" flag="r" p="0.958">
				<X_7 feature="-0.9745" flag="l" p="0.031">0</X_7>
				<X_7 feature="-0.9745" flag="r" p="0.969">
					<X_17 feature="-0.79603" flag="l" p="0.012">0</X_17>
					<X_17 feature="-0.79603" flag="r" p="0.988">
						<X_3 feature="-0.92453" flag="l" p="0.012">0</X_3>
						<X_3 feature="-0.92453" flag="r" p="0.988">
							<X_15 feature="-0.83519" flag="l" p="0.008">0</X_15>
							<X_15 feature="-0.83519" flag="r" p="0.992">
								<X_5 feature="-0.7337100000000001" flag="l" p="0.013">
									<X_6 feature="1.0" flag="l" p="0.667">0</X_6>
									<X_6 feature="1.0" flag="r" p="0.333">1</X_6>
								</X_5>
								<X_5 feature="-0.733710000000