In [5]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import deepcopy
mpl.rcParams['figure.dpi'] = 250

# change working directory to project root
if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('../..')
from experiments.notebooks import viz
from experiments.data_util import get_clean_dataset
from experiments.config.datasets import DATASETS_CLASSIFICATION, DATASETS_REGRESSION
from pmlb import fetch_data, classification_dataset_names
pd.options.display.max_rows = 100

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# classification dataset stats

In [None]:
metadata = []
columns = ['Name', 'Samples', 'Features', 'Class 0', 'Class 1', 'Majority class %']
for dset_name, dset_file, data_source in DATASETS_CLASSIFICATION:
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
#     X = np.nan_to_num(X, 0)
    shape = X.shape
    class_counts = np.unique(y, return_counts=True)[1]
    metadata.append([dset_name.capitalize(), shape[0], shape[1], class_counts[0], class_counts[1],
                     np.round(100 * np.max(class_counts) / np.sum(class_counts), decimals=1)])

metadata = pd.DataFrame(metadata, columns=columns).sort_values(by=['Samples']) #.set_index('Name')
# print(metadata.to_latex(index=False))
metadata

# regression dataset names


In [7]:
metadata = []
columns = ['Name', 'Samples', 'Features', 'Mean', 'Std', 'Min', "Max"]
for dset_name, dset_file, data_source in DATASETS_REGRESSION:
#     print(dset_name, dset_file, data_source)
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
#     X = np.nan_to_num(X, 0)
#     print(y.dtype)
#     print(np.unique(y))
    shape = X.shape
    metadata.append([dset_name.capitalize(), shape[0], shape[1], np.mean(y), np.std(y), np.min(y), np.max(y)])

metadata = pd.DataFrame(metadata, columns=columns).round(2).sort_values(by=['Samples'])
#.set_index('Name')
# print(metadata.to_latex(index=False))
metadata

Unnamed: 0,Name,Samples,Features,Mean,Std,Min,Max
0,Friedman1,200,10,14.25,4.56,1.71,26.95
1,Friedman2,200,4,464.02,390.78,7.15,1699.68
2,Friedman3,200,4,1.32,0.32,0.02,1.57
3,Diabetes-regr,442,10,152.13,77.01,25.0,346.0
4,California-housing,442,10,152.13,77.01,25.0,346.0
5,Satellite-image,6435,36,3.67,2.21,1.0,7.0
6,Echo-months,17496,9,21.99,15.79,-4.4,74.56


In [8]:
X, y, feature_names = get_clean_dataset('friedman1', data_source='synthetic')
print('shapes', X.shape, y.shape, np.unique(y))

shapes (200, 10) (200,) [ 1.54807941  2.82584089  3.26260264  3.58710555  5.11763798  5.42531158
  5.63301555  5.74722517  6.10242782  6.20489243  6.56014744  6.85847467
  7.18417699  7.36325666  7.39342959  7.44751827  8.09587663  8.09797839
  8.22070863  8.25235835  8.29692857  8.57150104  8.61788598  8.82239744
  8.89802983  9.08491611  9.20334603  9.21966825  9.24019214  9.25595083
  9.49387347  9.64898803  9.71182684  9.80450925  9.95905093  9.98434944
  9.9975615  10.23995605 10.35352875 10.4029495  10.44838625 10.46052791
 10.52496695 10.60750259 10.64587348 10.76111163 10.83940749 10.85406208
 10.86375354 10.9408716  10.95164538 10.95342806 10.96393778 11.00683843
 11.01975958 11.26092686 11.48717995 11.51290823 11.59509797 11.77250693
 11.92633296 11.9662568  12.02930547 12.08237057 12.10473253 12.42237003
 12.80092649 12.81473684 12.84739159 12.8819466  12.90865831 12.91331892
 12.94010616 12.96911324 13.00576323 13.11220829 13.18642017 13.28761968
 13.49001963 13.6188573  13

In [10]:
X, y, feature_names = get_clean_dataset('diabetes', data_source='pmlb')
print('shapes', X.shape, y.shape, np.unique(y))

shapes (768, 8) (768,) [0 1]


In [8]:
from imodels import C45TreeClassifier
m = C45TreeClassifier(max_rules=100)
m.fit(X, y)
print('mse', np.mean(np.square(m.predict(X) - y)))
print(m)

(768, 8)