In [30]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import deepcopy
mpl.rcParams['figure.dpi'] = 250

# change working directory to project root
if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('../..')
from experiments.notebooks import viz
from experiments.data_util import get_clean_dataset
from experiments.config.datasets import DATASETS_CLASSIFICATION, DATASETS_REGRESSION
from pmlb import fetch_data, classification_dataset_names
pd.options.display.max_rows = 100

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# classification dataset stats

In [31]:
metadata = []
columns = ['Name', 'Samples', 'Features', 'Class 0', 'Class 1', 'Majority class %']
for dset_name, dset_file, data_source in DATASETS_CLASSIFICATION:
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
#     X = np.nan_to_num(X, 0)
    shape = X.shape
    class_counts = np.unique(y, return_counts=True)[1]
    metadata.append([dset_name.capitalize(), shape[0], shape[1], class_counts[0], class_counts[1],
                     np.round(100 * np.max(class_counts) / np.sum(class_counts), decimals=1)])

metadata = pd.DataFrame(metadata, columns=columns).sort_values(by=['Samples']) #.set_index('Name')
# print(metadata.to_latex(index=False))
metadata

Unnamed: 0,Name,Samples,Features,Class 0,Class 1,Majority class %
0,Sonar,208,60,111,97,53.4
1,Heart,270,15,150,120,55.6
2,Breast-cancer,277,17,196,81,70.8
3,Haberman,306,3,81,225,73.5
4,Ionosphere,351,34,126,225,64.1
5,Diabetes,768,8,500,268,65.1
6,German-credit,1000,20,300,700,70.0
7,Juvenile,3640,286,3153,487,86.6
8,Recidivism,6172,20,3182,2990,51.6
9,Credit,30000,33,23364,6636,77.9


# regression dataset names


In [33]:
metadata = []
columns = ['Name', 'Samples', 'Features', 'Mean', 'Std', 'Min', "Max"]
for dset_name, dset_file, data_source in DATASETS_REGRESSION:
#     print(dset_name, dset_file, data_source)
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
#     X = np.nan_to_num(X, 0)
#     print(y.dtype)
#     print(np.unique(y))
    shape = X.shape
    metadata.append([dset_name.capitalize(), shape[0], shape[1], np.mean(y), np.std(y), np.min(y), np.max(y)])

metadata = pd.DataFrame(metadata, columns=columns).round(2).sort_values(by=['Samples'])
#.set_index('Name')
# print(metadata.to_latex(index=False))
metadata

Unnamed: 0,Name,Samples,Features,Mean,Std,Min,Max
0,Friedman1,200,10,14.42,4.87,3.77,26.88
1,Friedman2,200,4,480.07,386.97,7.54,1599.54
2,Friedman3,200,4,1.33,0.3,0.19,1.57
3,Diabetes,442,10,152.13,77.01,25.0,346.0
4,California-housing,442,10,152.13,77.01,25.0,346.0
6,Satellite-image,6435,36,3.67,2.21,1.0,7.0
5,Echo-months,17496,9,21.99,15.79,-4.4,74.56


In [34]:
X, y, feature_names = get_clean_dataset('friedman1', data_source='synthetic')
print('shapes', X.shape, y.shape, np.unique(y))

shapes (200, 10) (200,) [ 1.57216141  2.9571414   3.13305874  3.16442274  4.38710008  4.41581661
  5.10602464  5.34534711  6.37552911  7.11049636  7.20672822  7.58647387
  7.70499612  8.0931659   8.40316521  8.44649263  8.64745214  8.78135223
  8.83769163  8.86258293  8.96717446  9.00150904  9.16657582  9.45209601
  9.49905829  9.50790929  9.66638437  9.69218071  9.72454317  9.75046179
  9.75165863  9.99292388 10.00123931 10.00421994 10.06316978 10.09000446
 10.24433039 10.31151763 10.3369705  10.43233195 10.82004882 10.86890388
 11.05687987 11.15732597 11.24076098 11.28338495 11.31912686 11.36147304
 11.4442184  11.46397671 11.6603857  11.78524877 11.89910575 12.1152307
 12.14680601 12.18899412 12.21837223 12.24183275 12.39343453 12.43704545
 12.50773201 12.61293102 12.67864176 12.78434403 12.80491419 12.81697473
 12.85930441 13.07559807 13.25236236 13.38406314 13.42837454 13.43718767
 13.56712993 13.5772164  13.63523983 13.64526955 13.79227732 13.83060518
 13.84487029 13.85316192 13.

In [8]:
from imodels import C45TreeClassifier
m = C45TreeClassifier(max_rules=100)
m.fit(X, y)
print('mse', np.mean(np.square(m.predict(X) - y)))
print(m)

(768, 8)