In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import numpy as np
import pandas as pd

import sklearn
import sklearn.datasets

import imodels

# Data Check

Here we check the data sets we found by following the references that the authors of the HS paper provide. Each data set is accompanied by the code for parsing it and by the version of the data set that the authors provide in their GitHub repo.

# 1. Classification data sets

## Heart
Link: https://archive.ics.uci.edu/ml/datasets/Statlog+%28Heart%29

Same: __TRUE__

The authors of HS have normalized the variables and split the variable "thal" into three different columns. This was verified by comparing the distribution of the "thal" variable with the variables "att_13_-1.0", "att_13_0.5", "att_13_1.0".

In [None]:
# Positional index -> column name for the space-separated fields of heart.dat.
HEART_COLS_TRANSLATE = {
    0: "age", 1: "sex", 2: "chest_pain_type", 3: "resting_blood_pressure",  4: "serum_cholesterol",
    5: "fasting_blood_sugar", 6: "resting_electrocardiographic_results", 7: "maximum_heart_rate_achieved",
    8: "exercise_induced_angina", 9: "oldpeak", 10: "slope_of_the_peak", 11: "number_of_major_vessels",
    12: "thal", 13: "heart_disease"
}
# Columns grouped by measurement type; the groups are reused when rescaling the frame.
HEART_COLS_REAL = ["age", "resting_blood_pressure", "serum_cholesterol", "maximum_heart_rate_achieved",
            "oldpeak", "number_of_major_vessels"]
HEART_COLS_BINARY = ["sex", "fasting_blood_sugar", "exercise_induced_angina", "heart_disease"]
HEART_COLS_ORDERED = ["slope_of_the_peak"]
HEART_COLS_NOMINAL = ["chest_pain_type", "resting_electrocardiographic_results", "thal"]


# Collect all rows first and build the frame once: growing a DataFrame with
# pd.concat inside the loop copies the whole frame on every line (quadratic).
with open(os.path.abspath("../data/classification/heart.dat"), "r") as f:
    heart = pd.DataFrame([line.strip().split(" ") for line in f])

heart = heart.rename(HEART_COLS_TRANSLATE, axis=1)
# Fields are parsed as strings; every named column group is cast to float.
heart = heart.astype(
    {
        col: float
        for col in HEART_COLS_REAL + HEART_COLS_BINARY + HEART_COLS_NOMINAL + HEART_COLS_ORDERED
    }
)
heart.head()

In [None]:
# Min-max scale every column to [-1, 1] in a single vectorised step.
col_min = heart.min()
col_max = heart.max()
heart = 2 * (heart - col_min) / (col_max - col_min) - 1

# Binary, ordered and nominal columns are then moved from [-1, 1] to [0, 1].
unit_cols = HEART_COLS_BINARY + HEART_COLS_ORDERED + HEART_COLS_NOMINAL
heart[unit_cols] = (heart[unit_cols] + 1) / 2

heart.describe(include="all")

In [None]:
# Copy of the Heart data served by imodels, assembled into one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("heart", "imodels")
heart_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
heart_hs["heart_disease"] = y
heart_hs.head()

In [None]:
# Summary statistics of the imodels copy, for comparison with the locally parsed frame.
heart_hs.describe(include="all")

## Breast cancer
Link: https://www.openml.org/search?type=data&status=active&sort=runs&id=13

Same: __TRUE__

At first glance the data sets seem different, but the authors of HS transform each feature into numeric and leave out values that are present in attribute definition but not in the data set. Authors also dropped all rows with unknown values.

In [None]:
# Ordinal codes for the binned attributes: each range label maps to its position
# in the ordered list of bins.
BREAST_AGE_TRANSLATE = {
    "10-19": 0, "20-29": 1, "30-39": 2, "40-49": 3, "50-59": 4, "60-69": 5,
    "70-79": 6, "80-89": 7, "90-99": 8}
BREAST_TUMOR_TRANSLATE = {
    "0-4": 0, "5-9": 1, "10-14":2 , "15-19": 3, "20-24": 4, "25-29": 5, "30-34": 6,
    "35-39": 7, "40-44": 8, "45-49": 9, "50-54": 10, "55-59": 11}
BREAST_INV_TRANSLATE = {
    "0-2": 0, "3-5": 1, "6-8": 2, "9-11": 3, "12-14": 4, "15-17": 5, "18-20": 6,
    "21-23": 7, "24-26": 8, "27-29": 9, "30-32": 10, "33-35": 11, "36-39": 12}


# Minimal ARFF reader: attribute names come from the quoted token on each
# "@attribute" line; comment ("%") and directive ("@") lines carry no data.
# Rows are gathered in a list and the frame is built once, instead of
# re-concatenating the whole frame for every line (quadratic).
header = []
breast_cancer_rows = []
with open(os.path.abspath("../data/classification/dataset_13_breast-cancer.arff"), "r") as f:
    for line in f:
        if line.startswith("@attribute"):
            header.append(line.split("'")[1])
        if line.startswith(("%", "@")):
            continue
        # A blank line becomes a one-field row; the dropna() below removes it.
        breast_cancer_rows.append(line.strip().replace("'", "").split(","))
breast_cancer = pd.DataFrame(breast_cancer_rows)

breast_cancer = breast_cancer.rename({i: v for i, v in enumerate(header)}, axis=1)
# "?" marks an unknown value; rows containing any are dropped.
breast_cancer = breast_cancer.replace("?", np.nan).dropna()
breast_cancer["age"] = breast_cancer["age"].apply(lambda x: BREAST_AGE_TRANSLATE[x])
breast_cancer["tumor-size"] = breast_cancer["tumor-size"].apply(lambda x: BREAST_TUMOR_TRANSLATE[x])
breast_cancer["inv-nodes"] = breast_cancer["inv-nodes"].apply(lambda x: BREAST_INV_TRANSLATE[x])
# Two-valued attributes become 0/1 indicators.
breast_cancer["node-caps"] = (breast_cancer["node-caps"] == "yes")*1
breast_cancer["breast"] = (breast_cancer["breast"] == "right")*1
breast_cancer["irradiat"] = (breast_cancer["irradiat"] == "yes")*1
breast_cancer["Class"] = (breast_cancer["Class"] == "recurrence-events")*1
# The remaining categorical attributes are one-hot encoded.
breast_cancer = pd.get_dummies(breast_cancer, columns=["deg-malig", "menopause", "breast-quad"])
breast_cancer.head()

In [None]:
# Summary statistics of the locally parsed frame, for comparison with the imodels copy.
breast_cancer.describe(include="all")

In [None]:
# Copy of the breast cancer data served by imodels, assembled into one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("breast_cancer", "imodels")
breast_cancer_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
breast_cancer_hs["cancer"] = y
breast_cancer_hs.head()

In [None]:
# Summary statistics of the imodels copy, for comparison with the locally parsed frame.
breast_cancer_hs.describe(include="all")

## Haberman
Link: https://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival

Same: __TRUE__

Exact match for the paper. The data set that the authors provide is the same, but they do subtract 58 from the column "Patients_year_of_operation".

In [None]:
# Positional index -> column name for the comma-separated fields of haberman.data.
HABERMAN_COLS_TRANSLATE = {
    0: "age", 1: "year_of_operation", 2: "positive_axillary_nodes_detected", 3: "survival"
}

# Parse all rows, then build the frame once; concatenating inside the loop
# copies the whole frame for every line (quadratic).
with open(os.path.abspath("../data/classification/haberman.data"), "r") as f:
    haberman = pd.DataFrame([line.strip().split(",") for line in f])

haberman = haberman.rename(HABERMAN_COLS_TRANSLATE, axis=1)
haberman = haberman.astype(int)
# Shift the operation year by 58, as the HS authors do.
haberman["year_of_operation"] = haberman["year_of_operation"]-58
# Recode the label: a raw value of 1 stays 1, a raw value of 2 becomes 0.
haberman["survival"] = -1*haberman["survival"]+2
haberman.head()

In [None]:
# Summary statistics of the locally parsed frame, for comparison with the imodels copy.
haberman.describe()

In [None]:
# Copy of the Haberman data served by imodels, assembled into one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("haberman", "imodels")
haberman_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
haberman_hs["survival"] = y
haberman_hs.head()

In [None]:
# Summary statistics of the imodels copy, for comparison with the locally parsed frame.
haberman_hs.describe()

## Ionosphere
Link: https://archive.ics.uci.edu/ml/datasets/Ionosphere

Same: __TRUE__

Exact match for the paper.

In [None]:
# 34 positional attributes named attr_0 .. attr_33, followed by the class label.
IONOSPHERE_COLS_TRANSLATE = {
    i: f"attr_{i}" for i in range(34)
}
IONOSPHERE_COLS_TRANSLATE[34] = "ionosphere"

# Parse all rows, then build the frame once; concatenating inside the loop
# copies the whole frame for every line (quadratic).
with open(os.path.abspath("../data/classification/ionosphere.data"), "r") as f:
    ionosphere = pd.DataFrame([line.strip().split(",") for line in f])

ionosphere = ionosphere.rename(IONOSPHERE_COLS_TRANSLATE, axis=1)
# The label becomes a boolean: True where the raw value is "g".
ionosphere["ionosphere"] = ionosphere["ionosphere"] == "g"
# NOTE(review): the attr_* columns are left as strings here, so describe()
# reports count/unique/top/freq for them rather than numeric statistics --
# confirm whether a cast to float was intended.
ionosphere.head()

In [None]:
# Summary of the locally parsed frame, for comparison with the PMLB copy.
ionosphere.describe(include="all")

In [None]:
# Copy of the Ionosphere data fetched through imodels from PMLB, assembled into one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("ionosphere", "pmlb")
ionosphere_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
ionosphere_hs["ionosphere"] = y
ionosphere_hs.head()

In [None]:
# Summary statistics of the PMLB copy, for comparison with the locally parsed frame.
ionosphere_hs.describe(include="all")

## Diabetes
Link: https://www.kaggle.com/datasets/mathchi/diabetes-data-set

Same: __TRUE__

Exact match for the paper. The rows are scrambled, but we can see that the data distribution is the same for all columns in both data sets.

Note: the data set was found on Kaggle and not on the UCI as the authors of Random Forest paper state.

In [None]:
# Load the locally stored CSV; the relative path is resolved against the current working directory.
diabetes = pd.read_csv(os.path.abspath("../data/classification/diabetes.csv"))
diabetes.head()

In [None]:
# Summary statistics of the local CSV, for comparison with the PMLB copy.
diabetes.describe(include="all")

In [None]:
# Copy of the diabetes classification data fetched through imodels from PMLB, as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("diabetes", "pmlb")
diabetes_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
diabetes_hs["diabetes"] = y
diabetes_hs.head()

In [None]:
# Summary statistics of the PMLB copy, for comparison with the local CSV.
diabetes_hs.describe(include="all")

## German credit
Link: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29

Same: __FALSE__

The data set has the same number of features and instances, and the features also coincide with the German names. However, the distribution of the majority of variables is different. The following variables have the same distributions:
- duration
- amount/credit
- installment rate
- age
- number credits/existing credits

The target variable is the same. Most of the other variables seem to have a different range (in the data set we found a column ranges from 1 to 5, whereas in the data set from the authors of HS it ranges from 0 to 4). Even if we scale the data set back, there is still a difference between the distributions.

In [None]:
# Positional index -> English column name (the file's own header is in German).
# Built directly as a dict so the name is never bound to two different types.
GERMAN_COLS_TRANSLATE = dict(enumerate([
    "status", "duration", "credit_history", "purpose", "amount", "savings",
    "employment_duration", "installment_rate", "personal_status_sex",
    "other_debtors", "present_residence", "property", "age",
    "other_installment_plans", "housing", "number_credits", "job", "people_liable",
    "telephone", "foreign_worker", "credit_risk"]))
# Columns shifted down by one after loading.
GERMAN_COLS_MINUS = [
    "status", "savings", "employment_duration", "personal_status_sex", "other_debtors",
    "property", "other_installment_plans", "housing", "job", "telephone", "foreign_worker"
]

# Parse all rows, then build the frame once; concatenating inside the loop
# copies the whole frame for every line (quadratic).
with open(os.path.abspath("../data/classification/SouthGermanCredit.asc"), "r") as f:
    # Skip the header, which is in German. The result is deliberately not bound
    # to `_`, which IPython uses for the last cell output.
    f.readline()
    german_credit = pd.DataFrame([line.strip().split(" ") for line in f])
german_credit = german_credit.rename(GERMAN_COLS_TRANSLATE, axis=1)
german_credit = german_credit.astype(int)
german_credit[GERMAN_COLS_MINUS] = german_credit[GERMAN_COLS_MINUS] - 1
german_credit.head()

In [None]:
# Summary statistics of the locally parsed frame, for comparison with the PMLB copy.
german_credit.describe(include="all")

In [None]:
# Copy of the German credit data fetched through imodels from PMLB, as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("german", "pmlb")
german_credit_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
german_credit_hs["credit"] = y
german_credit_hs.head()

In [None]:
# Summary statistics of the PMLB copy, for comparison with the locally parsed frame.
german_credit_hs.describe(include="all")

## Juvenile
Link: https://www.icpsr.umich.edu/web/NACJD/studies/3986

Same: __TRUE?__

The data set from the link above has too many columns for us to manually bring it into the format of the data set the authors used. However, looking at the authors' code at `https://github.com/csinva/imodels-data/blob/master/notebooks_fetch_data/00_get_datasets_custom.ipynb`, we can see that they use the same data set but clean it extensively.

In [None]:
# Cleaned Juvenile data as served by imodels, assembled into one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("juvenile_clean", "imodels")
juvenile_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
juvenile_hs["target"] = y
juvenile_hs.head()

In [None]:
# Summary statistics of the imodels copy of the Juvenile data.
juvenile_hs.describe(include="all")

## Recidivism
Link: https://www.propublica.org/datastore/dataset/compas-recidivism-risk-score-data-and-analysis

Same: __TRUE__

The authors performed some one-hot-encodings (columns race, age, sex, c_charge_degree).

In [None]:
# Columns retained from the raw COMPAS file (c_jail_time is derived below).
RECIDIVISM_COLS_KEEP = [
    "age", "priors_count", "days_b_screening_arrest", "c_jail_time",
    "juv_fel_count", "juv_other_count", "juv_misd_count", "c_charge_degree", "race", "sex"
]

# Read the CSV, derive jail time in whole days (jail-out minus jail-in), and
# keep only the columns listed above.
recidivism = (
    pd.read_csv(os.path.abspath("../data/classification/compas-scores-two-years.csv"))
    .assign(
        c_jail_time=lambda raw: (
            pd.to_datetime(raw["c_jail_out"]) - pd.to_datetime(raw["c_jail_in"])
        ) // np.timedelta64(1, "D")
    )
    [RECIDIVISM_COLS_KEEP]
)
recidivism.head()

In [None]:
# Cleaned COMPAS two-year data as served by imodels, assembled into one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("compas_two_year_clean", "imodels")
recidivism_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
recidivism_hs["target"] = y
recidivism_hs.head()

In [None]:
# Summary statistics of the imodels copy of the COMPAS data.
recidivism_hs.describe(include="all")

# 2. Regression data sets

## Friedman 1 & Friedman 3
Link: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman1.html

Link: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman3.html

Same: __TRUE__

Both data sets are synthetic, and the authors also use the same Scikit-learn functions to generate them.

In [None]:
# Generate the synthetic Friedman #1 sample (200 rows, 10 features). The seed is
# fixed so the frame and the statistics derived from it are reproducible on
# Restart & Run All.
X, y = sklearn.datasets.make_friedman1(n_samples=200, n_features=10, random_state=0)
friedman1 = pd.DataFrame(X)
friedman1["target"] = y
friedman1.head()

In [None]:
# Summary statistics of the locally generated sample, for comparison with the imodels one.
friedman1.describe(include="all")

In [None]:
# Friedman #1 sample as produced by imodels, assembled into one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("friedman1", "synthetic")
friedman1_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
friedman1_hs["target"] = y
friedman1_hs.head()

In [None]:
# Summary statistics of the imodels sample, for comparison with the locally generated one.
friedman1_hs.describe(include="all")

## Diabetes

Link: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html

Same: __TRUE__

Also part of the Scikit-learn package, and the authors provide the same data in their imodels package.

In [None]:
# scikit-learn's bundled diabetes regression data; the target is appended as a
# "diabetes" column to the feature frame returned in the bunch.
diabetes_data = sklearn.datasets.load_diabetes(as_frame=True)
diabetes = diabetes_data.data
diabetes["diabetes"] = diabetes_data.target
diabetes.head()

In [None]:
# Summary statistics of the scikit-learn frame, for comparison with the imodels copy.
diabetes.describe(include="all")

In [None]:
# Diabetes regression data as served by imodels (sklearn source), as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("diabetes", "sklearn")
diabetes_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
diabetes_hs["diabetes"] = y
diabetes_hs.head()

In [None]:
# Summary statistics of the imodels copy, for comparison with the scikit-learn frame.
diabetes_hs.describe(include="all")

## Geographical music

Link: https://epistasislab.github.io/pmlb/profile/4544_GeographicalOriginalofMusic.html

Link: https://github.com/EpistasisLab/pmlb/tree/master/datasets/4544_GeographicalOriginalofMusic

Same: __TRUE__

The authors do not provide this data set in their code, but the number of features and samples match.

Note: We found the data set online on PMLB and were able to read it with `imodels.util.data_util.get_clean_dataset`.

In [None]:
# Load the locally stored tab-separated file; the relative path is resolved against the current working directory.
geographical_music = pd.read_csv(os.path.abspath("../data/regression/geographical_music.tsv"), sep="\t")
geographical_music.head()

In [None]:
# Summary statistics of the local file, for comparison with the PMLB copy.
geographical_music.describe(include="all")

In [None]:
# Geographical music data fetched through imodels from PMLB, as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("4544_GeographicalOriginalofMusic", "pmlb")
geographical_music_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
geographical_music_hs["target"] = y
geographical_music_hs.head()

In [None]:
# Summary statistics of the PMLB copy, for comparison with the local file.
geographical_music_hs.describe(include="all")

## Red wine

Link: https://archive.ics.uci.edu/ml/datasets/Wine+Quality

Same: __TRUE__

The authors do not provide this data set in their code, but the number of features and samples match.

Note: We found the data set online on PMLB and were able to read it with `imodels.util.data_util.get_clean_dataset`.

In [None]:
# Load the locally stored semicolon-separated file; the relative path is resolved against the current working directory.
red_wine = pd.read_csv(os.path.abspath("../data/regression/winequality-red.csv"), sep=";")
red_wine.head()

In [None]:
# Summary statistics of the local file, for comparison with the PMLB copy.
red_wine.describe(include="all")

In [None]:
# Red wine quality data fetched through imodels from PMLB, as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("wine_quality_red", "pmlb")
red_wine_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
red_wine_hs["quality"] = y
red_wine_hs.head()

In [None]:
# Summary statistics of the PMLB copy, for comparison with the local file.
red_wine_hs.describe(include="all")

## Abalone

Link: https://archive.ics.uci.edu/ml/datasets/Abalone

Same: __TRUE__



In [None]:
# Positional index -> column name for the comma-separated fields of abalone.data.
ABALONE_COLS_TRANSLATE = {
    0: "Sex", 1: "Length", 2: "Diameter", 3: "Height", 4: "Whole_weight",
    5: "Shucked_weight", 6: "Viscera_weight", 7: "Shell_weight", 8: "Rings"
}
# Numeric codes assigned to the three "Sex" labels.
ABALONE_SEX_TRANSLATE = {
    "M": 2, "F": 0, "I": 1
}

# Parse all rows, then build the frame once; concatenating inside the loop
# copies the whole frame for every line (quadratic).
with open(os.path.abspath("../data/regression/abalone.data"), "r") as f:
    abalone = pd.DataFrame([line.strip().split(",") for line in f])
abalone = abalone.rename(ABALONE_COLS_TRANSLATE, axis=1)
# Encode "Sex" first so that the whole frame can then be cast to float.
abalone["Sex"] = abalone["Sex"].apply(lambda x: ABALONE_SEX_TRANSLATE[x])
abalone = abalone.astype(float)
abalone.head()

In [None]:
# Summary statistics of the locally parsed frame, for comparison with the OpenML copy.
abalone.describe(include="all")

In [None]:
# Abalone data fetched through imodels from OpenML (data set "183"), as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("183", "openml")
abalone_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
abalone_hs["Rings"] = y
abalone_hs.head()

In [None]:
# Summary statistics of the OpenML copy, for comparison with the locally parsed frame.
abalone_hs.describe(include="all")

## Satellite image

Link: https://epistasislab.github.io/pmlb/profile/294_satellite_image.html
    
Link: https://github.com/EpistasisLab/pmlb/blob/master/datasets/294_satellite_image

Same: __TRUE__

In [None]:
# Load the locally stored tab-separated file; the relative path is resolved against the current working directory.
satellite = pd.read_csv(os.path.abspath("../data/regression/satellite_image.tsv"), sep="\t")
satellite.head()

In [None]:
# Summary statistics of the local file, for comparison with the PMLB copy.
satellite.describe(include="all")

In [None]:
# Satellite image data fetched through imodels from PMLB, as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("294_satellite_image", "pmlb")
satellite_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
satellite_hs["target"] = y
satellite_hs.head()

In [None]:
# Summary statistics of the PMLB copy, for comparison with the local file.
satellite_hs.describe(include="all")

## CA housing

Link: https://www.kaggle.com/datasets/camnugent/california-housing-prices

Same: __TRUE__

The columns are swapped, but the data sets are the same.

In [None]:
# Positional index -> column name for the comma-separated fields of ca_housing.data.
HOUSING_COLS_TRANSLATE = {
    0: "longitude",
    1: "latitude",
    2: "housingMedianAge",
    3: "totalRooms",
    4: "totalBedrooms",
    5: "population",
    6: "households",
    7: "medianIncome",
    8: "medianHouseValue"
}

# Parse all rows, then build the frame once; concatenating inside the loop
# copies the whole frame for every line (quadratic).
with open(os.path.abspath("../data/regression/ca_housing.data"), "r") as f:
    ca_housing = pd.DataFrame([line.strip().split(",") for line in f])
ca_housing = ca_housing.astype(float)
ca_housing = ca_housing.rename(HOUSING_COLS_TRANSLATE, axis=1)
ca_housing.head()

In [None]:
# Summary statistics of the locally parsed frame, for comparison with the imodels copy.
ca_housing.describe(include="all")

In [None]:
# California housing data as served by imodels (sklearn source), as one labelled frame.
X, y, cols = imodels.util.data_util.get_clean_dataset("california_housing", "sklearn")
ca_housing_hs = pd.DataFrame(X).rename(columns=dict(enumerate(cols)))
ca_housing_hs["medianHouseValue"] = y
ca_housing_hs.head()

In [None]:
# Summary statistics of the imodels copy, for comparison with the locally parsed frame.
ca_housing_hs.describe(include="all")