In [15]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import deepcopy
mpl.rcParams['figure.dpi'] = 250

# change working directory to project root
if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('../..')
from experiments.notebooks import viz
from experiments.data_util import get_clean_dataset
from experiments.config.datasets import DATASETS_CLASSIFICATION, DATASETS_REGRESSION
from pmlb import fetch_data, classification_dataset_names
pd.options.display.max_rows = 100

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# classification dataset stats

In [16]:
metadata = []
columns = ['Name', 'Samples', 'Features', 'Class 0', 'Class 1', 'Majority class %']
for dset_name, dset_file, data_source in DATASETS_CLASSIFICATION:
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
#     X = np.nan_to_num(X, 0)
    shape = X.shape
    class_counts = np.unique(y, return_counts=True)[1]
    metadata.append([dset_name.capitalize(), shape[0], shape[1], class_counts[0], class_counts[1],
                     np.round(100 * np.max(class_counts) / np.sum(class_counts), decimals=1)])

metadata = pd.DataFrame(metadata, columns=columns).sort_values(by=['Samples']) #.set_index('Name')
# print(metadata.to_latex(index=False))
metadata

Unnamed: 0,Name,Samples,Features,Class 0,Class 1,Majority class %
5,Sonar,208,60,111,97,53.4
0,Heart,270,15,150,120,55.6
1,Breast-cancer,277,17,196,81,70.8
2,Haberman,306,3,81,225,73.5
3,Ionosphere,351,34,126,225,64.1
4,Diabetes,768,8,500,268,65.1
6,Credit-g,1000,60,300,700,70.0
7,Juvenile,3640,286,3153,487,86.6
8,Recidivism,6172,20,3182,2990,51.6
9,Credit,30000,33,23364,6636,77.9


# regression dataset names


In [14]:
metadata = []
columns = ['Name', 'Samples', 'Features', 'Mean', 'Std', 'Min', "Max"]
for dset_name, dset_file, data_source in DATASETS_REGRESSION:
#     print(dset_name, dset_file, data_source)
    X, y, feat_names = get_clean_dataset(dset_file, data_source)
#     X = np.nan_to_num(X, 0)
#     print(y.dtype)
#     print(np.unique(y))
    shape = X.shape
    metadata.append([dset_name.capitalize(), shape[0], shape[1], np.mean(y), np.std(y), np.min(y), np.max(y)])

metadata = pd.DataFrame(metadata, columns=columns).round(2).sort_values(by=['Samples'])
#.set_index('Name')
# print(metadata.to_latex(index=False))
metadata

Unnamed: 0,Name,Samples,Features,Mean,Std,Min,Max
0,Diabetes,442,10,152.13,77.01,25.0,346.0
1,California-housing,442,10,152.13,77.01,25.0,346.0
3,Satellite-image,6435,36,3.67,2.21,1.0,7.0
2,Echo-months,17496,9,21.99,15.79,-4.4,74.56


In [27]:
from imodels import C45TreeClassifier
X, y, feature_names = get_clean_dataset('friedman1', data_source='synthetic')
print('shapes', X.shape, y.shape, np.unique(y))

shapes (200, 10) (200,) [ 4.29274646  5.36615368  5.55082675  5.97563545  6.20020115  6.51147375
  6.89021325  6.98524401  7.00902427  7.01048771  7.3444899   7.48450688
  7.53110525  7.67707685  7.77389502  8.27914882  8.29154257  8.3102553
  8.3582545   8.42176319  8.46476353  8.66902705  8.70593519  8.77762688
  8.92253206  8.97169255  9.23093166  9.23659141  9.26638846  9.29191947
  9.30481259  9.37683198  9.40237025  9.42495214  9.50635669  9.61521725
  9.86924201  9.91052942 10.11500763 10.11841681 10.33614855 10.33802665
 10.37158605 10.4581507  10.50852981 10.67958439 10.71460291 10.74925081
 10.82406405 10.86913601 10.95127937 11.04786898 11.17520167 11.20125918
 11.26368775 11.28052703 11.37464809 11.42237506 11.63321113 11.68437801
 11.70989906 11.88588893 11.89777415 11.95008549 12.01151876 12.0711667
 12.09296492 12.09855988 12.13444683 12.344002   12.39760798 12.43158256
 12.45793674 12.59333681 12.62680235 12.8919753  12.92168783 12.92467318
 12.97498566 13.05341425 13.1

In [8]:
m = C45TreeClassifier(max_rules=100)
m.fit(X, y)
print('mse', np.mean(np.square(m.predict(X) - y)))
print(m)

(768, 8)

In [9]:
y.shape

(768,)

In [5]:
X, y, feature_names = get_clean_dataset('59', data_source='openml')

In [6]:
X.shape

(351, 34)