In [6]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up live.
%load_ext autoreload
%autoreload 2

In [164]:
import os

import pandas as pd
import numpy as np

import imodels

# Data Check

Here we check the data sets that we found through the references provided by the authors of the HS paper. Each data set is accompanied by the code for parsing it and by the version of the data set that the authors provide in their GitHub repo.

# 1. Classification data sets

## Heart
Link: https://archive.ics.uci.edu/ml/datasets/Statlog+%28Heart%29

Same: __TRUE__

The authors of HS have normalized the variables and split the variable "thal" into three separate columns. This was verified by comparing the distribution of the "thal" variable with those of the variables "att_13_-1.0", "att_13_0.5", "att_13_1.0".

In [76]:
# Names for the 14 positional fields of each row in heart.dat.
HEART_COLS_TRANSLATE = {
    0: "age", 1: "sex", 2: "chest_pain_type", 3: "resting_blood_pressure",  4: "serum_cholesterol",
    5: "fasting_blood_sugar", 6: "resting_electrocardiographic_results", 7: "maximum_heart_rate_achieved",
    8: "exercise_induced_angina", 9: "oldpeak", 10: "slope_of_the_peak", 11: "number_of_major_vessels",
    12: "thal", 13: "heart_disease"
}
# Column groups by measurement type; the normalization cell treats them differently.
HEART_COLS_REAL = ["age", "resting_blood_pressure", "serum_cholesterol", "maximum_heart_rate_achieved",
            "oldpeak", "number_of_major_vessels"]
HEART_COLS_BINARY = ["sex", "fasting_blood_sugar", "exercise_induced_angina", "heart_disease"]
HEART_COLS_ORDERED = ["slope_of_the_peak"]
HEART_COLS_NOMINAL = ["chest_pain_type", "resting_electrocardiographic_results", "thal"]

# Collect the space-separated fields of every non-blank line and build the
# frame once: concatenating a one-row frame per line is quadratic in the
# number of rows and starts from an empty frame.
with open(os.path.abspath("../data/classification/heart.dat"), "r") as f:
    heart_rows = [line.strip().split(" ") for line in f if line.strip()]

heart = pd.DataFrame(heart_rows).rename(HEART_COLS_TRANSLATE, axis=1)

# All four column groups are stored as float; the fields are parsed as strings.
heart_float_cols = HEART_COLS_REAL + HEART_COLS_BINARY + HEART_COLS_NOMINAL + HEART_COLS_ORDERED
heart = heart.astype({col: float for col in heart_float_cols})
heart.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholesterol,fasting_blood_sugar,resting_electrocardiographic_results,maximum_heart_rate_achieved,exercise_induced_angina,oldpeak,slope_of_the_peak,number_of_major_vessels,thal,heart_disease
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2.0
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1.0
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2.0
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1.0
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1.0


In [78]:
# Min-max scale every column to the range [-1, 1].
heart_min = heart.min()
heart_max = heart.max()
heart = 2 * (heart - heart_min) / (heart_max - heart_min) - 1

# Binary, ordered and nominal columns are mapped on to [0, 1] instead.
# The groups are the HEART_COLS_* lists defined in the loading cell; the bare
# COLS_* names do not exist and raise NameError on a fresh kernel.
heart_categorical_cols = HEART_COLS_BINARY + HEART_COLS_ORDERED + HEART_COLS_NOMINAL
heart[heart_categorical_cols] = (heart[heart_categorical_cols] + 1) / 2

heart.describe(include="all")

In [113]:
# Fetch the "heart" data set through imodels and assemble the features and
# the target into a single frame with named columns.
X, y, cols = imodels.util.data_util.get_clean_dataset("heart", "imodels")
heart_hs = (
    pd.DataFrame(X)
    .rename(columns=dict(enumerate(cols)))
    .assign(heart_disease=y)
)
heart_hs.head()

Unnamed: 0,att_1,att_2,att_3,att_4,att_5,att_6,att_7,att_8,att_9,att_10,att_11,att_12,att_13_-1.0,att_13_0.5,att_13_1.0,heart_disease
0,0.708333,1.0,1.0,-0.320755,-0.105023,0.0,1.0,-0.419847,0.0,-0.225806,0.0,1.0,1.0,0.0,0.0,1
1,0.583333,0.0,0.333333,-0.603774,1.0,0.0,1.0,0.358779,0.0,-0.483871,0.0,-1.0,0.0,0.0,1.0,0
2,0.166667,1.0,-0.333333,-0.433962,-0.383562,0.0,0.0,0.068702,0.0,-0.903226,0.0,-1.0,0.0,0.0,1.0,1
3,0.458333,1.0,1.0,-0.358491,-0.374429,0.0,0.0,-0.480916,1.0,-0.935484,0.0,-0.333333,0.0,0.0,1.0,0
4,0.875,0.0,-0.333333,-0.509434,-0.347032,0.0,1.0,-0.236641,1.0,-0.935484,0.0,-0.333333,1.0,0.0,0.0,0


In [42]:
# Summary statistics of the imodels copy of the heart data, all columns included.
heart_hs.describe(include="all")

Unnamed: 0,att_1,att_2,att_3,att_4,att_5,att_6,att_7,att_8,att_9,att_10,att_11,att_12,att_13_-1.0,att_13_0.5,att_13_1.0,heart_disease
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,0.059722,0.677778,0.449383,-0.295388,-0.435346,0.148148,0.507407,0.201187,0.32963,-0.66129,0.066667,-0.553086,0.562963,0.051852,0.385185,0.444444
std,0.379544,0.468195,0.633393,0.337012,0.23601,0.355906,0.500874,0.353675,0.470952,0.369423,0.249907,0.629264,0.496941,0.22214,0.487543,0.497827
min,-1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0
25%,-0.208333,0.0,0.333333,-0.509434,-0.60274,0.0,0.0,-0.053435,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0
50%,0.083333,1.0,0.333333,-0.320755,-0.456621,0.0,1.0,0.259542,0.0,-0.741935,0.0,-1.0,1.0,0.0,0.0,0.0
75%,0.333333,1.0,1.0,-0.132075,-0.296803,0.0,1.0,0.450382,1.0,-0.483871,0.0,-0.333333,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Breast cancer
Link: https://www.openml.org/search?type=data&status=active&sort=runs&id=13

Same: __TRUE__

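At first glance the data sets seem different, but the authors of HS convert each feature to a numeric encoding and leave out values that are present in the attribute definition but not in the data set. The authors also dropped all rows with unknown values. Note that in the HS version the binary columns "node-caps" and "irradiat" are encoded with the opposite polarity to our parsing (their means are 0.798 and 0.776, i.e. one minus ours), so these two columns match only up to a 0/1 flip.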

In [223]:
# Ordinal encodings for the interval-valued attributes (lower interval -> smaller code).
BREAST_AGE_TRANSLATE = {
    "10-19": 0, "20-29": 1, "30-39": 2, "40-49": 3, "50-59": 4, "60-69": 5,
    "70-79": 6, "80-89": 7, "90-99": 8}
BREAST_TUMOR_TRANSLATE = {
    "0-4": 0, "5-9": 1, "10-14":2 , "15-19": 3, "20-24": 4, "25-29": 5, "30-34": 6,
    "35-39": 7, "40-44": 8, "45-49": 9, "50-54": 10, "55-59": 11}
BREAST_INV_TRANSLATE = {
    "0-2": 0, "3-5": 1, "6-8": 2, "9-11": 3, "12-14": 4, "15-17": 5, "18-20": 6,
    "21-23": 7, "24-26": 8, "27-29": 9, "30-32": 10, "33-35": 11, "36-39": 12}

# Parse the ARFF file by hand: "@attribute" lines carry the quoted column
# names, "%" and other "@" lines are comments/directives, everything else is
# a comma-separated data row. Rows are collected in a list and the frame is
# built once; the column count comes from the data itself, so no
# separately defined column table is needed.
with open(os.path.abspath("../data/classification/dataset_13_breast-cancer.arff"), "r") as f:
    header = []
    breast_rows = []
    for line in f:
        if line.startswith("@attribute"):
            header.append(line.split("'")[1])
        if line.startswith(("%", "@")) or not line.strip():
            continue
        breast_rows.append(line.strip().replace("'", "").split(","))

breast_cancer = pd.DataFrame(breast_rows).rename(dict(enumerate(header)), axis=1)
# "?" marks an unknown value; drop every row that has one.
breast_cancer = breast_cancer.replace("?", np.nan).dropna()

# Interval attributes -> ordinal codes. A value missing from a table raises
# KeyError rather than silently becoming NaN.
breast_ordinal_tables = {
    "age": BREAST_AGE_TRANSLATE,
    "tumor-size": BREAST_TUMOR_TRANSLATE,
    "inv-nodes": BREAST_INV_TRANSLATE,
}
for col, table in breast_ordinal_tables.items():
    breast_cancer[col] = breast_cancer[col].apply(table.__getitem__)

# Two-valued attributes -> 0/1 indicator of the named value.
breast_positive_values = {
    "node-caps": "yes",
    "breast": "right",
    "irradiat": "yes",
    "Class": "recurrence-events",
}
for col, positive in breast_positive_values.items():
    breast_cancer[col] = (breast_cancer[col] == positive).astype(int)

# dtype=int keeps the dummies as 0/1 integers on every pandas version
# (pandas >= 2 would otherwise return bool columns).
breast_cancer = pd.get_dummies(breast_cancer, columns=["deg-malig", "menopause", "breast-quad"], dtype=int)
breast_cancer.head()

Unnamed: 0,age,tumor-size,inv-nodes,node-caps,breast,irradiat,Class,deg-malig_1,deg-malig_2,deg-malig_3,menopause_ge40,menopause_lt40,menopause_premeno,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up
0,3,3,0,1,1,0,1,0,0,1,0,0,1,0,0,1,0,0
1,4,3,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0
2,4,7,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0
3,3,7,0,1,1,1,0,0,0,1,0,0,1,0,1,0,0,0
4,3,6,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1


In [225]:
# Summary statistics of the locally parsed breast-cancer data, all columns included.
breast_cancer.describe(include="all")

Unnamed: 0,age,tumor-size,inv-nodes,node-caps,breast,irradiat,Class,deg-malig_1,deg-malig_2,deg-malig_3,menopause_ge40,menopause_lt40,menopause_premeno,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up
count,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0
mean,3.642599,4.880866,0.501805,0.202166,0.476534,0.223827,0.292419,0.238267,0.465704,0.296029,0.444043,0.018051,0.537906,0.075812,0.382671,0.33935,0.083032,0.119134
std,1.010125,2.132452,1.137709,0.402342,0.500353,0.417562,0.455697,0.426794,0.499725,0.45733,0.497758,0.133375,0.499463,0.265177,0.486919,0.474346,0.276431,0.324532
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
max,6.0,10.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [215]:
# Fetch the "breast_cancer" data set through imodels and assemble the
# features and the target into a single frame with named columns.
X, y, cols = imodels.util.data_util.get_clean_dataset("breast_cancer", "imodels")
breast_cancer_hs = (
    pd.DataFrame(X)
    .rename(columns=dict(enumerate(cols)))
    .assign(cancer=y)
)
breast_cancer_hs.head()

Unnamed: 0,age,tumor-size,inv-nodes,node-caps,breast,irradiat,menopause_0.0,menopause_1.0,menopause_2.0,breast-quad_0.0,breast-quad_1.0,breast-quad_2.0,breast-quad_3.0,breast-quad_4.0,deg-malig_0.0,deg-malig_1.0,deg-malig_2.0,cancer
0,3.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,4.0,3.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0
2,4.0,7.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,3.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,3.0,6.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [216]:
# Summary statistics of the imodels copy of the breast-cancer data.
breast_cancer_hs.describe()

Unnamed: 0,age,tumor-size,inv-nodes,node-caps,breast,irradiat,menopause_0.0,menopause_1.0,menopause_2.0,breast-quad_0.0,breast-quad_1.0,breast-quad_2.0,breast-quad_3.0,breast-quad_4.0,deg-malig_0.0,deg-malig_1.0,deg-malig_2.0,cancer
count,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0,277.0
mean,3.642599,4.880867,0.501805,0.797834,0.476534,0.776173,0.018051,0.444043,0.537906,0.33935,0.382671,0.119134,0.083032,0.075812,0.238267,0.465704,0.296029,0.292419
std,1.010125,2.132452,1.137709,0.402342,0.500353,0.417562,0.133375,0.497758,0.499463,0.474346,0.486919,0.324532,0.276431,0.265177,0.426794,0.499725,0.45733,0.455697
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,6.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
max,6.0,10.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Haberman
Link: https://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival

Same: __TRUE__

Exact match for the paper. The data set that the authors provide is the same, except that they subtract 58 from the column "Patients_year_of_operation" and recode the target from {1, 2} to {1, 0} (its mean changes from 1.265 to 0.735).

In [116]:
# Names for the four positional fields of each row in haberman.data.
HABERMAN_COLS_TRANSLATE = {
    0: "age", 1: "year_of_operation", 2: "positive_axillary_nodes_detected", 3: "survival"
}

# Collect the comma-separated fields of every non-blank line and build the
# frame once, instead of concatenating a one-row frame per line (quadratic).
with open(os.path.abspath("../data/classification/haberman.data"), "r") as f:
    haberman_rows = [line.strip().split(",") for line in f if line.strip()]

haberman = (
    pd.DataFrame(haberman_rows)
    .rename(HABERMAN_COLS_TRANSLATE, axis=1)
    .astype(int)  # fields are parsed as strings; every column is cast to int
)
haberman.head()

Unnamed: 0,age,year_of_operation,positive_axillary_nodes_detected,survival
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [119]:
# Summary statistics of the locally parsed Haberman data.
haberman.describe()

Unnamed: 0,age,year_of_operation,positive_axillary_nodes_detected,survival
count,306.0,306.0,306.0,306.0
mean,52.457516,62.852941,4.026144,1.264706
std,10.803452,3.249405,7.189654,0.441899
min,30.0,58.0,0.0,1.0
25%,44.0,60.0,0.0,1.0
50%,52.0,63.0,1.0,1.0
75%,60.75,65.75,4.0,2.0
max,83.0,69.0,52.0,2.0


In [117]:
# Fetch the "haberman" data set through imodels and assemble the features
# and the target into a single frame with named columns.
X, y, cols = imodels.util.data_util.get_clean_dataset("haberman", "imodels")
haberman_hs = (
    pd.DataFrame(X)
    .rename(columns=dict(enumerate(cols)))
    .assign(survival=y)
)
haberman_hs.head()

Unnamed: 0,Age_of_patient_at_time_of_operation,Patients_year_of_operation,Number_of_positive_axillary_nodes_detected,survival
0,30.0,6.0,1.0,1
1,30.0,4.0,3.0,1
2,30.0,7.0,0.0,1
3,31.0,1.0,2.0,1
4,31.0,7.0,4.0,1


In [118]:
# Summary statistics of the imodels copy of the Haberman data.
haberman_hs.describe()

Unnamed: 0,Age_of_patient_at_time_of_operation,Patients_year_of_operation,Number_of_positive_axillary_nodes_detected,survival
count,306.0,306.0,306.0,306.0
mean,52.457516,4.852941,4.026144,0.735294
std,10.803452,3.249405,7.189653,0.441899
min,30.0,0.0,0.0,0.0
25%,44.0,2.0,0.0,0.0
50%,52.0,5.0,1.0,1.0
75%,60.75,7.75,4.0,1.0
max,83.0,11.0,52.0,1.0


## Ionosphere
Link: https://archive.ics.uci.edu/ml/datasets/Ionosphere

Same: __TRUE__

Exact match for the paper.

In [127]:
# 34 feature columns named attr_0 .. attr_33, followed by the class label.
IONOSPHERE_COLS_TRANSLATE = {
    i: f"attr_{i}" for i in range(34)
}
IONOSPHERE_COLS_TRANSLATE[34] = "ionosphere"

# Collect the comma-separated fields of every non-blank line and build the
# frame once, instead of concatenating a one-row frame per line (quadratic).
with open(os.path.abspath("../data/classification/ionosphere.data"), "r") as f:
    ionosphere_rows = [line.strip().split(",") for line in f if line.strip()]

ionosphere = pd.DataFrame(ionosphere_rows).rename(IONOSPHERE_COLS_TRANSLATE, axis=1)
# The label becomes a boolean: True where the raw value is "g".
ionosphere["ionosphere"] = ionosphere["ionosphere"] == "g"
# NOTE(review): the attr_* columns are left as parsed strings; cast them with
# .astype(float) if numeric summaries of this frame are needed.
ionosphere.head()

Unnamed: 0,attr_0,attr_1,attr_2,attr_3,attr_4,attr_5,attr_6,attr_7,attr_8,attr_9,...,attr_25,attr_26,attr_27,attr_28,attr_29,attr_30,attr_31,attr_32,attr_33,ionosphere
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,True
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,False
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,True
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,False
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,True


In [126]:
# Fetch the "ionosphere" data set (pmlb source) through imodels and assemble
# the features and the target into a single frame with named columns.
X, y, cols = imodels.util.data_util.get_clean_dataset("ionosphere", "pmlb")
ionosphere_hs = (
    pd.DataFrame(X)
    .rename(columns=dict(enumerate(cols)))
    .assign(ionosphere=y)
)
ionosphere_hs.head()

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_25,X_26,X_27,X_28,X_29,X_30,X_31,X_32,X_33,ionosphere
0,1.0,0.0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1
1,1.0,0.0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,0
2,1.0,0.0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
3,1.0,0.0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,0
4,1.0,0.0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1


## Diabetes
Link: https://www.kaggle.com/datasets/mathchi/diabetes-data-set

Same: __TRUE__

Exact match for the paper. The rows are scrambled, but we can see that the data distribution is the same for all columns in both data sets.

Note: we found the data set on Kaggle rather than on UCI, which is where the authors of the Random Forest paper state that it comes from.

In [128]:
# Read the diabetes CSV (its first row holds the column names) from the
# local data directory.
diabetes_csv_path = os.path.abspath("../data/classification/diabetes.csv")
diabetes = pd.read_csv(diabetes_csv_path)
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [131]:
# Summary statistics of the locally loaded diabetes data.
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [130]:
# Fetch the "diabetes" data set (pmlb source) through imodels and assemble
# the features and the target into a single frame with named columns.
X, y, cols = imodels.util.data_util.get_clean_dataset("diabetes", "pmlb")
diabetes_hs = (
    pd.DataFrame(X)
    .rename(columns=dict(enumerate(cols)))
    .assign(diabetes=y)
)
diabetes_hs.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,diabetes
0,9.0,140.0,94.0,0.0,0.0,32.7,0.734,45.0,1
1,2.0,108.0,80.0,0.0,0.0,27.0,0.259,52.0,1
2,1.0,128.0,48.0,45.0,194.0,40.5,0.613,24.0,1
3,5.0,130.0,82.0,0.0,0.0,39.1,0.956,37.0,1
4,0.0,121.0,66.0,30.0,165.0,34.3,0.203,33.0,1


In [132]:
# Summary statistics of the imodels (pmlb) copy of the diabetes data.
diabetes_hs.describe()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## German credit
Link: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29

Same: __FALSE__

The data set we found has the same number of features and instances, and the features also coincide with the German names. However, the distribution of the majority of the variables is different. The following variables have the same distributions:
- duration
- amount/credit
- installment rate
- age
- number credits/existing credits

The target variable is the same. Most of the other variables seem to have a different range (e.g. a column in the data set we found ranges from 1 to 5, while the same column in the data set from the authors of HS ranges from 0 to 4). Even if we rescale the data set back, the distributions still differ.

In [157]:
# English names for the 21 positional fields of SouthGermanCredit.asc.
GERMAN_COL_NAMES = [
    "status", "duration", "credit_history", "purpose", "amount", "savings",
    "employment_duration", "installment_rate", "personal_status_sex",
    "other_debtors", "present_residence", "property", "age",
    "other_installment_plans", "housing", "number_credits", "job", "people_liable",
    "telephone", "foreign_worker", "credit_risk"]
# Position -> name mapping, kept under a separate name from the list so that
# one name is never bound to two different kinds of object.
GERMAN_COLS_TRANSLATE = dict(enumerate(GERMAN_COL_NAMES))

# Collect the space-separated fields of every non-blank line and build the
# frame once, instead of concatenating a one-row frame per line (quadratic).
with open(os.path.abspath("../data/classification/SouthGermanCredit.asc"), "r") as f:
    f.readline()  # skip header, which is in German
    german_rows = [line.strip().split(" ") for line in f if line.strip()]

german_credit = (
    pd.DataFrame(german_rows)
    .rename(GERMAN_COLS_TRANSLATE, axis=1)
    .astype(int)  # fields are parsed as strings; every column is cast to int
)
german_credit.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1


In [160]:
# Summary statistics of the locally parsed South German Credit data.
german_credit.describe()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.577,20.903,2.545,2.828,3271.248,2.105,3.384,2.973,2.682,1.145,...,2.358,35.542,2.675,1.928,1.407,2.904,1.845,1.404,1.963,0.7
std,1.257638,12.058814,1.08312,2.744439,2822.75176,1.580023,1.208306,1.118715,0.70808,0.477706,...,1.050209,11.35267,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856,0.458487
min,1.0,4.0,0.0,0.0,250.0,1.0,1.0,1.0,1.0,1.0,...,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,1.0,12.0,2.0,1.0,1365.5,1.0,3.0,2.0,2.0,1.0,...,1.0,27.0,3.0,2.0,1.0,3.0,2.0,1.0,2.0,0.0
50%,2.0,18.0,2.0,2.0,2319.5,1.0,3.0,3.0,3.0,1.0,...,2.0,33.0,3.0,2.0,1.0,3.0,2.0,1.0,2.0,1.0
75%,4.0,24.0,4.0,3.0,3972.25,3.0,5.0,4.0,3.0,1.0,...,3.0,42.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,1.0
max,4.0,72.0,4.0,10.0,18424.0,5.0,5.0,4.0,4.0,3.0,...,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,1.0


In [144]:
# Fetch the "german" data set (pmlb source) through imodels and assemble the
# features and the target into a single frame with named columns.
X, y, cols = imodels.util.data_util.get_clean_dataset("german", "pmlb")
german_credit_hs = (
    pd.DataFrame(X)
    .rename(columns=dict(enumerate(cols)))
    .assign(credit=y)
)
german_credit_hs.head()

Unnamed: 0,Status,Duration,Credit-history,Purpose,Credit,Savings-account,Employment,Installment-rate,Personal-status,Debtors,...,Property,Age,Installments,Housing,Existing-credits,Job,Liable-people,Telephone,Foreign,credit
0,2.0,42.0,4.0,6.0,7166.0,4.0,3.0,2.0,2.0,2.0,...,0.0,29.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1
1,2.0,18.0,4.0,6.0,1126.0,4.0,1.0,4.0,0.0,2.0,...,3.0,21.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1
2,1.0,24.0,4.0,3.0,4351.0,4.0,0.0,1.0,0.0,2.0,...,0.0,48.0,1.0,1.0,1.0,3.0,1.0,0.0,1.0,1
3,0.0,12.0,4.0,2.0,1200.0,4.0,0.0,4.0,0.0,2.0,...,0.0,23.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,1
4,2.0,12.0,4.0,6.0,1963.0,1.0,3.0,4.0,3.0,2.0,...,1.0,31.0,1.0,2.0,2.0,0.0,2.0,0.0,1.0,1


In [163]:
# Summary statistics of the imodels (pmlb) copy of the German credit data.
german_credit_hs.describe()

Unnamed: 0,Status,Duration,Credit-history,Purpose,Credit,Savings-account,Employment,Installment-rate,Personal-status,Debtors,...,Property,Age,Installments,Housing,Existing-credits,Job,Liable-people,Telephone,Foreign,credit
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.246,20.903,3.019,4.484,3271.258,1.685,1.448,2.973,1.878,1.866,...,1.486,35.546,0.908,1.071,1.407,1.274,1.155,0.596,0.963,0.7
std,0.927547,12.058814,1.174742,2.421075,2822.736876,1.239884,1.283333,1.118715,1.350904,0.445244,...,1.130966,11.375469,0.421561,0.531264,0.577654,0.946478,0.362086,0.490943,0.188856,0.458487
min,0.0,4.0,0.0,0.0,250.0,0.0,0.0,1.0,0.0,0.0,...,0.0,19.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,12.0,2.0,3.0,1365.5,1.0,0.0,2.0,0.0,2.0,...,1.0,27.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
50%,1.0,18.0,4.0,4.0,2319.5,1.0,1.0,3.0,3.0,2.0,...,1.0,33.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,2.0,24.0,4.0,6.0,3972.25,2.0,2.0,4.0,3.0,2.0,...,3.0,42.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
max,3.0,72.0,4.0,9.0,18424.0,4.0,4.0,4.0,3.0,2.0,...,3.0,75.0,2.0,2.0,4.0,3.0,2.0,1.0,1.0,1.0


## Juvenile
Link: NOT FOUND

Same: __FALSE__

## Recidivism
Link: NOT FOUND

Same: __FALSE__