This notebook uses gdown to download files.
You can install it with `pip install gdown`.
Alternatively, you can download zip files manually and then run the other commands (or download everything from [this folder](https://drive.google.com/drive/folders/1e9cEAuQsLABU5SOkQF982Pw09wPIJbn7?usp=sharing)).

In [9]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os

from imodels.util.dta_util import get_openml_dataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# breast cancer
https://www.openml.org/d/13

In [47]:
# Fetch OpenML dataset 13 and discard every row that contains a missing value.
df = get_openml_dataset(13)
df = df.dropna()

# Only the columns listed here are one-hot encoded; all others pass through unchanged.
categorical_features = ['menopause', 'breast-quad', 'deg-malig']
df_enc = pd.get_dummies(df, columns=categorical_features)

# Split off 'recurrence-events' and re-attach it so it ends up as the last CSV column.
X, y = df_enc.drop('recurrence-events', axis=1), df_enc['recurrence-events']
df_clean = pd.concat([X, y], axis=1)

# to_csv does not create missing parent directories, so make sure data/ exists
# before writing (otherwise a fresh checkout fails on Restart & Run All).
os.makedirs('data', exist_ok=True)
df_clean.to_csv('data/breast_cancer.csv', index=False)

# Column count of the un-encoded frame, minus one for the 'recurrence-events' column.
orig_feature_count = len(df.columns) - 1
orig_feature_count

9

# german credit
https://www.openml.org/d/31

In [48]:
# Fetch OpenML dataset 31.
df = get_openml_dataset(31)

# Only the columns listed here are one-hot encoded; all others pass through unchanged.
categorical_features = ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 
                        'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans',
                        'housing', 'job', 'num_dependents']
df_enc = pd.get_dummies(df, columns=categorical_features)

# Split off the 'good' column and re-attach it so it ends up as the last CSV column.
X, y = df_enc.drop('good', axis=1), df_enc['good']
df_clean = pd.concat([X, y], axis=1)

# to_csv does not create missing parent directories, so make sure data/ exists
# before writing (otherwise a fresh checkout fails on Restart & Run All).
os.makedirs('data', exist_ok=True)
df_clean.to_csv('data/credit_g.csv', index=False)

# Column count of the un-encoded frame, minus one for the 'good' column.
orig_feature_count = len(df.columns) - 1
orig_feature_count

20

# haberman
https://www.openml.org/d/43

In [49]:
# Fetch OpenML dataset 43; it is written out as-is, with no encoding step.
df = get_openml_dataset(43)

# to_csv does not create missing parent directories, so make sure data/ exists
# before writing (otherwise a fresh checkout fails on Restart & Run All).
os.makedirs('data', exist_ok=True)
df.to_csv('data/haberman.csv', index=False)

# Column count of the frame minus one (presumably the label column -- TODO confirm).
orig_feature_count = len(df.columns) - 1
orig_feature_count

3

# heart
https://www.openml.org/d/1574

In [50]:
# Fetch OpenML dataset 1574 and rename the column labelled with the float 1.0 to 'target'.
df = get_openml_dataset(1574)
df = df.rename(columns={1.0:'target'})

# Only 'att_13' is one-hot encoded; all other columns pass through unchanged.
categorical_features = ['att_13']

df_enc = pd.get_dummies(df, columns=categorical_features)

# Collapse these columns to 0/1 indicators: 1 where the value equals 1.0, else 0.
# NOTE(review): any value other than 1.0 becomes 0 -- confirm these columns are
# really two-valued in the source data, otherwise information is lost here.
for col in [f'att_{i}' for i in [2, 6, 7, 9, 11]]:
    df_enc[col] = (df_enc[col] == 1.0).astype(int)

# Split off 'target' and re-attach it so it ends up as the last CSV column.
X, y = df_enc.drop('target', axis=1), df_enc['target']
df_clean = pd.concat([X, y], axis=1)

# to_csv does not create missing parent directories, so make sure data/ exists
# before writing (otherwise a fresh checkout fails on Restart & Run All).
os.makedirs('data', exist_ok=True)
df_clean.to_csv('data/heart.csv', index=False)

# Column count of the un-encoded frame, minus one for the 'target' column.
orig_feature_count = len(df.columns) - 1
orig_feature_count

13