
<center><img src='../img/data_explorer.png' alt='Data Explorer' height='300'/></center>


# Initial Exploration and Selection of Medical Datasets for Machine Learning Project

This notebook serves as an initial exploration of various medical datasets to identify the most relevant ones for a machine learning project. 

I will conduct a preliminary quick analysis to understand each dataset's structure, data quality, variable types, and any relevant attributes that may be critical for predictive modeling. By examining features such as data completeness, feature correlations, and target distribution. This exploratory phase will provide a foundation for a datasets selection and streamline the subsequent data preprocessing and modeling stages. Only one dataset will be selected for this project, but the remaining datasets are available for potential use in future researchs.

In [1]:
import os
import sys
import zipfile

import kagglehub
import pandas as pd
import pyreadstat
import requests

from ucimlrepo import fetch_ucirepo

# add "src" path
root_path = os.path.abspath(os.path.join(os.getcwd(), '..')) 
if os.path.exists(root_path) and root_path not in sys.path: 
    sys.path.append(root_path)
                        
from utils import datascience as ds

  from .autonotebook import tqdm as notebook_tqdm


# Breast Cancer Wisconsin (Diagnostic)

Wolberg, W., Mangasarian, O., Street, N., & Street, W. (1993). Breast Cancer Wisconsin (Diagnostic Dataset). [UCI Machine Learning Repository](https://doi.org/10.24432/C5DW2B).

In [2]:
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 
  
print(breast_cancer_wisconsin_diagnostic.metadata) 
display(X.head())
display(breast_cancer_wisconsin_diagnostic.variables) 
ds.get_cardinality(X)


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Categorical,,,,no
1,Diagnosis,Target,Categorical,,,,no
2,radius1,Feature,Continuous,,,,no
3,texture1,Feature,Continuous,,,,no
4,perimeter1,Feature,Continuous,,,,no
5,area1,Feature,Continuous,,,,no
6,smoothness1,Feature,Continuous,,,,no
7,compactness1,Feature,Continuous,,,,no
8,concavity1,Feature,Continuous,,,,no
9,concave_points1,Feature,Continuous,,,,no


pandas.DataFrame shape: (569, 30)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
radius1,456,80.140598,0,0.0,float64,Numeric - Continuous
texture1,479,84.182777,0,0.0,float64,Numeric - Continuous
perimeter1,522,91.739895,0,0.0,float64,Numeric - Continuous
area1,539,94.727592,0,0.0,float64,Numeric - Continuous
smoothness1,474,83.304042,0,0.0,float64,Numeric - Continuous
compactness1,537,94.376098,0,0.0,float64,Numeric - Continuous
concavity1,537,94.376098,0,0.0,float64,Numeric - Continuous
concave_points1,542,95.254833,0,0.0,float64,Numeric - Continuous
symmetry1,432,75.922671,0,0.0,float64,Numeric - Continuous
fractal_dimension1,499,87.697715,0,0.0,float64,Numeric - Continuous


# Breast Cancer Wisconsin (Original)

W. Wolberg. "Breast Cancer Wisconsin (Original)," UCI Machine Learning Repository, 1990. [Online]. Available: https://doi.org/10.24432/C5HP4Z.

In [3]:
 # fetch dataset 
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 
  
print(breast_cancer_wisconsin_original.metadata) 
display(X.head())
display(breast_cancer_wisconsin_original.variables) 
ds.get_cardinality(X)

{'uci_id': 15, 'name': 'Breast Cancer Wisconsin (Original)', 'repository_url': 'https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original', 'data_url': 'https://archive.ics.uci.edu/static/public/15/data.csv', 'abstract': 'Original Wisconsin Breast Cancer Database', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 699, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': ['Sample_code_number'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1990, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C5HP4Z', 'creators': ['WIlliam Wolberg'], 'intro_paper': None, 'additional_info': {'summary': "Samples arrive periodically as Dr. Wolberg reports his clinical cases. The database therefore reflects this chronological grouping of the data. This grouping information appears immediately below, having been removed fro

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1
2,3,1,1,1,2,2.0,3,1,1
3,6,8,8,1,3,4.0,3,7,1
4,4,1,1,3,2,1.0,3,1,1


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Sample_code_number,ID,Categorical,,,,no
1,Clump_thickness,Feature,Integer,,,,no
2,Uniformity_of_cell_size,Feature,Integer,,,,no
3,Uniformity_of_cell_shape,Feature,Integer,,,,no
4,Marginal_adhesion,Feature,Integer,,,,no
5,Single_epithelial_cell_size,Feature,Integer,,,,no
6,Bare_nuclei,Feature,Integer,,,,yes
7,Bland_chromatin,Feature,Integer,,,,no
8,Normal_nucleoli,Feature,Integer,,,,no
9,Mitoses,Feature,Integer,,,,no


pandas.DataFrame shape: (699, 9)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
Clump_thickness,10,1.430615,0,0.0,int64,Numeric - Discrete
Uniformity_of_cell_size,10,1.430615,0,0.0,int64,Numeric - Discrete
Uniformity_of_cell_shape,10,1.430615,0,0.0,int64,Numeric - Discrete
Marginal_adhesion,10,1.430615,0,0.0,int64,Numeric - Discrete
Single_epithelial_cell_size,10,1.430615,0,0.0,int64,Numeric - Discrete
Bare_nuclei,10,1.430615,16,2.288984,float64,Numeric - Discrete
Bland_chromatin,10,1.430615,0,0.0,int64,Numeric - Discrete
Normal_nucleoli,10,1.430615,0,0.0,int64,Numeric - Discrete
Mitoses,9,1.287554,0,0.0,int64,Categoric


# 2022 BRFSS (Behavioral Risk Factor Surveillance System) Survey Data and Documentation

https://www.cdc.gov/brfss/annual_data/annual_2020.html

https://www.cdc.gov/brfss/annual_data/annual_2022.html


In [4]:
# 2020 year

URL = 'https://www.cdc.gov/brfss/annual_data/2020/files/LLCP2020XPT.zip' 
DATA_RAW = '../data/raw/'
ZIP_FOLDER = 'LLCP2020XPT.zip'

unzip_folder = ZIP_FOLDER[:-4]
fname = ZIP_FOLDER[:-7] + '.XPT'
zip_folder_path = DATA_RAW + ZIP_FOLDER
unzip_folder_path = DATA_RAW + unzip_folder
file_path = unzip_folder_path + '/' + fname

# print(zip_folder_path)
# print(unzip_folder_path)
# print(file_path)

response = requests.get(URL)
with open(zip_folder_path, 'wb') as file:
    file.write(response.content)

with zipfile.ZipFile(zip_folder_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_folder_path) 

df_brfss_2020, meta = pyreadstat.read_xport(file_path, encoding='latin1')

display(df_brfss_2020)

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_RFPSA23,_CLNSCPY,_SGMSCPY,_SGMS10Y,_RFBLDS4,_STOLDNA,_VIRCOLN,_SBONTIM,_CRCREC1,_AIDTST4
0,1.0,1.0,01042020,01,04,2020,1100.0,2020000001,2.020000e+09,1.0,...,,1.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.0
1,1.0,1.0,02072020,02,07,2020,1200.0,2020000002,2.020000e+09,1.0,...,,,,,,,,2.0,,
2,1.0,1.0,01232020,01,23,2020,1100.0,2020000003,2.020000e+09,1.0,...,,1.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,2.0
3,1.0,1.0,01092020,01,09,2020,1100.0,2020000004,2.020000e+09,1.0,...,,,,,,,,,,2.0
4,1.0,1.0,01042020,01,04,2020,1100.0,2020000005,2.020000e+09,1.0,...,,,,,,,,,,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401953,72.0,11.0,02192021,02,19,2021,1100.0,2020004940,2.020005e+09,,...,,,,,,,,,,1.0
401954,72.0,11.0,02142021,02,14,2021,1100.0,2020004941,2.020005e+09,,...,,,,,,,,,,1.0
401955,72.0,11.0,02142021,02,14,2021,1100.0,2020004942,2.020005e+09,,...,,,,,,,,,,2.0
401956,72.0,11.0,03172021,03,17,2021,1100.0,2020004943,2.020005e+09,,...,1.0,1.0,3.0,3.0,1.0,3.0,3.0,2.0,1.0,1.0


In [5]:
# 2022 year

URL = 'https://www.cdc.gov/brfss/annual_data/2022/files/LLCP2022XPT.zip' 
DATA_RAW = '../data/raw/'
ZIP_FOLDER = 'LLCP2022XPT.zip'

unzip_folder = ZIP_FOLDER[:-4]
fname = ZIP_FOLDER[:-7] + '.XPT'
zip_folder_path = DATA_RAW + ZIP_FOLDER
unzip_folder_path = DATA_RAW + unzip_folder
file_path = unzip_folder_path + '/' + fname

response = requests.get(URL)
with open(zip_folder_path, 'wb') as file:
    file.write(response.content)

with zipfile.ZipFile(zip_folder_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_folder_path) 

df_brfss_2022, meta = pyreadstat.read_xport(file_path, encoding='latin1')

display(df_brfss_2022)

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_SMOKGRP,_LCSREC,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,1.0,02032022,02,03,2022,1100.0,2022000001,2.022000e+09,1.0,...,4.0,,2.0,0.0,1.0,0.0,1.0,1.0,2.0,2.0
1,1.0,1.0,02042022,02,04,2022,1100.0,2022000002,2.022000e+09,1.0,...,4.0,,2.0,0.0,1.0,0.0,1.0,2.0,2.0,2.0
2,1.0,1.0,02022022,02,02,2022,1100.0,2022000003,2.022000e+09,1.0,...,4.0,,2.0,0.0,1.0,0.0,1.0,,,2.0
3,1.0,1.0,02032022,02,03,2022,1100.0,2022000004,2.022000e+09,1.0,...,3.0,2.0,2.0,0.0,1.0,0.0,1.0,9.0,9.0,2.0
4,1.0,1.0,02022022,02,02,2022,1100.0,2022000005,2.022000e+09,1.0,...,4.0,,1.0,10.0,1.0,140.0,1.0,,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,78.0,11.0,12192022,12,19,2022,1100.0,2022001527,2.022002e+09,,...,4.0,,7.0,900.0,9.0,99900.0,9.0,,,1.0
445128,78.0,11.0,12212022,12,21,2022,1100.0,2022001528,2.022002e+09,,...,4.0,,2.0,0.0,1.0,0.0,1.0,,,1.0
445129,78.0,11.0,11292022,11,29,2022,1100.0,2022001529,2.022002e+09,,...,1.0,,7.0,900.0,9.0,99900.0,9.0,2.0,2.0,2.0
445130,78.0,11.0,12082022,12,08,2022,1100.0,2022001530,2.022002e+09,,...,4.0,,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


In [6]:
# cardinality and values of variables used to categorize or classify respondents (begin with "_")
i = 0
for column in df_brfss_2022.columns:
    if column.startswith('_'):
        print(df_brfss_2022[column].value_counts())
        print()
        i += 1

print(i)


_STATE
53.0    26152
36.0    17800
27.0    16821
39.0    16487
24.0    16418
48.0    14245
12.0    13393
55.0    11276
20.0    11247
25.0    11029
6.0     10952
23.0    10646
18.0    10466
51.0    10417
4.0     10185
26.0    10058
45.0    10037
49.0     9826
9.0      9784
8.0      9365
13.0     9236
19.0     8949
50.0     8811
34.0     8209
15.0     7747
31.0     7473
29.0     7438
46.0     7424
30.0     7048
33.0     6757
16.0     6280
44.0     5893
2.0      5865
40.0     5775
41.0     5756
22.0     5629
72.0     5509
5.0      5309
47.0     5266
54.0     4981
35.0     4758
42.0     4582
1.0      4506
37.0     4505
28.0     4239
38.0     4153
56.0     4142
17.0     4056
21.0     4023
10.0     3987
11.0     3237
32.0     3188
66.0     2266
78.0     1531
Name: count, dtype: int64

_PSU
2.022000e+09    54
2.022000e+09    54
2.022000e+09    54
2.022000e+09    54
2.022000e+09    54
                ..
2.022026e+09     1
2.022026e+09     1
2.022026e+09     1
2.022026e+09     1
2.022026e+09   

In [7]:
meta.column_labels

['STATE FIPS CODE',
 'FILE MONTH',
 'INTERVIEW DATE',
 'INTERVIEW MONTH',
 'INTERVIEW DAY',
 'INTERVIEW YEAR',
 'FINAL DISPOSITION',
 'ANNUAL SEQUENCE NUMBER',
 'PRIMARY SAMPLING UNIT',
 'CORRECT TELEPHONE NUMBER?',
 'PRIVATE RESIDENCE?',
 'DO YOU LIVE IN COLLEGE HOUSING?',
 'RESIDENT OF STATE',
 'CELLULAR TELEPHONE',
 'ARE YOU 18 YEARS OF AGE OR OLDER?',
 'ARE YOU MALE OR FEMALE?',
 'NUMBER OF ADULTS IN HOUSEHOLD',
 'ARE YOU MALE OR FEMALE?',
 'NUMBER OF ADULT MEN IN HOUSEHOLD',
 'NUMBER OF ADULT WOMEN IN HOUSEHOLD',
 'RESPONDENT SELECTION',
 'SAFE TIME TO TALK?',
 'CORRECT PHONE NUMBER?',
 'IS THIS A CELL PHONE?',
 'ARE YOU 18 YEARS OF AGE OR OLDER?',
 'ARE YOU MALE OR FEMALE?',
 'DO YOU LIVE IN A PRIVATE RESIDENCE?',
 'DO YOU LIVE IN COLLEGE HOUSING?',
 'DO YOU CURRENTLY LIVE IN  ____(STATE)___',
 'DO YOU ALSO HAVE A LANDLINE TELEPHONE?',
 'NUMBER OF ADULTS IN HOUSEHOLD',
 'SEX OF RESPONDENT',
 'GENERAL HEALTH',
 'NUMBER OF DAYS PHYSICAL HEALTH NOT GOOD',
 'NUMBER OF DAYS MENTAL HEA

# Kaggle Dataset: Indicators of Heart Disease (2022)

https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease/data

In [8]:
# Download latest version
path = kagglehub.dataset_download('kamilpytlak/personal-key-indicators-of-heart-disease')

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Lander\.cache\kagglehub\datasets\kamilpytlak\personal-key-indicators-of-heart-disease\versions\6


In [None]:
data_path = '../data/raw/heart_disease/2020/heart_2020_cleaned.csv'
df_heart_2020 = pd.read_csv(data_path)

display(df_heart_2020.head())
print(df_heart_2020['HeartDisease'].value_counts(), end='\n\n')
ds.get_cardinality(df_heart_2020)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


HeartDisease
No     292422
Yes     27373
Name: count, dtype: int64

pandas.DataFrame shape: (319795, 18)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
HeartDisease,2,0.000625,0,0.0,object,Binary
BMI,3604,1.126972,0,0.0,float64,Numeric - Discrete
Smoking,2,0.000625,0,0.0,object,Binary
AlcoholDrinking,2,0.000625,0,0.0,object,Binary
Stroke,2,0.000625,0,0.0,object,Binary
PhysicalHealth,31,0.009694,0,0.0,float64,Numeric - Discrete
MentalHealth,31,0.009694,0,0.0,float64,Numeric - Discrete
DiffWalking,2,0.000625,0,0.0,object,Binary
Sex,2,0.000625,0,0.0,object,Binary
AgeCategory,13,0.004065,0,0.0,object,Numeric - Discrete


In [None]:
data_path = '../data/raw/heart_disease/2022/heart_2022_no_nans.csv'
df_heart_2022_clean = pd.read_csv(data_path)

display(df_heart_2022_clean.head())
print(df_heart_2022_clean['HadHeartAttack'].value_counts(), end='\n\n')
ds.get_cardinality(df_heart_2022_clean)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


HadHeartAttack
No     232587
Yes     13435
Name: count, dtype: int64

pandas.DataFrame shape: (246022, 40)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
State,54,0.021949,0,0.0,object,Numeric - Discrete
Sex,2,0.000813,0,0.0,object,Binary
GeneralHealth,5,0.002032,0,0.0,object,Categoric
PhysicalHealthDays,31,0.0126,0,0.0,float64,Numeric - Discrete
MentalHealthDays,31,0.0126,0,0.0,float64,Numeric - Discrete
LastCheckupTime,4,0.001626,0,0.0,object,Categoric
PhysicalActivities,2,0.000813,0,0.0,object,Binary
SleepHours,23,0.009349,0,0.0,float64,Numeric - Discrete
RemovedTeeth,4,0.001626,0,0.0,object,Categoric
HadHeartAttack,2,0.000813,0,0.0,object,Binary


# Heart Disease

Janosi, A., Steinbrunn, W., Pfisterer, M., & Detrano, R. (1989). Heart Disease (Dataset). [UCI Machine Learning Repository](https://doi.org/10.24432/C52P4X).

In [12]:
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
  
print(heart_disease.metadata) 
display(X.head())
display(heart_disease.variables) 
ds.get_cardinality(X)

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,years,no
1,sex,Feature,Categorical,Sex,,,no
2,cp,Feature,Categorical,,,,no
3,trestbps,Feature,Integer,,resting blood pressure (on admission to the ho...,mm Hg,no
4,chol,Feature,Integer,,serum cholestoral,mg/dl,no
5,fbs,Feature,Categorical,,fasting blood sugar > 120 mg/dl,,no
6,restecg,Feature,Categorical,,,,no
7,thalach,Feature,Integer,,maximum heart rate achieved,,no
8,exang,Feature,Categorical,,exercise induced angina,,no
9,oldpeak,Feature,Integer,,ST depression induced by exercise relative to ...,,no


pandas.DataFrame shape: (303, 13)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
age,41,13.531353,0,0.0,int64,Numeric - Discrete
sex,2,0.660066,0,0.0,int64,Binary
cp,4,1.320132,0,0.0,int64,Categoric
trestbps,50,16.50165,0,0.0,int64,Numeric - Discrete
chol,152,50.165017,0,0.0,int64,Numeric - Continuous
fbs,2,0.660066,0,0.0,int64,Binary
restecg,3,0.990099,0,0.0,int64,Categoric
thalach,91,30.033003,0,0.0,int64,Numeric - Continuous
exang,2,0.660066,0,0.0,int64,Binary
oldpeak,40,13.20132,0,0.0,float64,Numeric - Discrete


# CDC Diabetes Health Indicators

https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators

In [13]:
# fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features 
y = cdc_diabetes_health_indicators.data.targets 

pd.options.display.max_colwidth = None
print(cdc_diabetes_health_indicators.metadata) 
display(X.head())
display(cdc_diabetes_health_indicators.variables) 
ds.get_cardinality(X)


{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1,1,1,40,1,0,0,0,0,1,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,25,1,0,0,1,0,0,...,0,1,3,0,0,0,0,7,6,1
2,1,1,1,28,0,0,0,0,1,0,...,1,1,5,30,30,1,0,9,4,8
3,1,0,1,27,0,0,0,1,1,1,...,1,0,2,0,0,0,0,11,3,6
4,1,1,1,24,0,0,0,1,1,1,...,1,0,2,3,0,0,0,11,5,4


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Integer,,Patient ID,,no
1,Diabetes_binary,Target,Binary,,0 = no diabetes 1 = prediabetes or diabetes,,no
2,HighBP,Feature,Binary,,0 = no high BP 1 = high BP,,no
3,HighChol,Feature,Binary,,0 = no high cholesterol 1 = high cholesterol,,no
4,CholCheck,Feature,Binary,,0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years,,no
5,BMI,Feature,Integer,,Body Mass Index,,no
6,Smoker,Feature,Binary,,Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes,,no
7,Stroke,Feature,Binary,,(Ever told) you had a stroke. 0 = no 1 = yes,,no
8,HeartDiseaseorAttack,Feature,Binary,,coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes,,no
9,PhysActivity,Feature,Binary,,physical activity in past 30 days - not including job 0 = no 1 = yes,,no


pandas.DataFrame shape: (253680, 21)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
HighBP,2,0.000788,0,0.0,int64,Binary
HighChol,2,0.000788,0,0.0,int64,Binary
CholCheck,2,0.000788,0,0.0,int64,Binary
BMI,84,0.033113,0,0.0,int64,Numeric - Discrete
Smoker,2,0.000788,0,0.0,int64,Binary
Stroke,2,0.000788,0,0.0,int64,Binary
HeartDiseaseorAttack,2,0.000788,0,0.0,int64,Binary
PhysActivity,2,0.000788,0,0.0,int64,Binary
Fruits,2,0.000788,0,0.0,int64,Binary
Veggies,2,0.000788,0,0.0,int64,Binary


In [14]:
y.value_counts()

Diabetes_binary
0                  218334
1                   35346
Name: count, dtype: int64

# Myocardial Infarction Complications

Golovenkin, S., Shulman, V., Rossiev, D., Shesternya, P., Nikulina, S., Orlova, Y., & Voino-Yasenetsky, V. (2020). Myocardial infarction complications [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C53P5M.



In [15]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
myocardial_infarction_complications = fetch_ucirepo(id=579) 
  
# data (as pandas dataframes) 
X = myocardial_infarction_complications.data.features 
y = myocardial_infarction_complications.data.targets 
  
print(myocardial_infarction_complications.metadata) 
display(X.head())
display(myocardial_infarction_complications.variables) 
ds.get_cardinality(X)

{'uci_id': 579, 'name': 'Myocardial infarction complications', 'repository_url': 'https://archive.ics.uci.edu/dataset/579/myocardial+infarction+complications', 'data_url': 'https://archive.ics.uci.edu/static/public/579/data.csv', 'abstract': 'Prediction of myocardial infarction complications', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1700, 'num_features': 111, 'feature_types': ['Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['FIBR_PREDS', 'PREDS_TAH', 'JELUD_TAH', 'FIBR_JELUD', 'A_V_BLOK', 'OTEK_LANC', 'RAZRIV', 'DRESSLER', 'ZSN', 'REC_IM', 'P_IM_STEN', 'LET_IS'], 'index_col': ['ID'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2020, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C53P5M', 'creators': ['S.E. Golovenkin', 'V.A. Shulman', 'D.A. Rossiev', 'P.A. Shesternya', 'S.Yu. Nikulina', 'Yu.V. Orlova', 'V.F. Voino-Yasenetsky'], 'intro_paper': {'ID'

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,DLIT_AG,...,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
0,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,2.0,...,3.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
3,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
4,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Integer,,Record ID (ID): Unique identifier. Cannot be related to participant. It can be used for reference only.,,no
1,AGE,Feature,Integer,Age,Age of patient.,,no
2,SEX,Feature,Binary,Sex,"0: female, 1: male",,no
3,INF_ANAM,Feature,Categorical,,Quantity of myocardial infarctions in the anamnesis. \n\n0: zero\n\n1: one\n\n2: two\n\n3: three and more,,yes
4,STENOK_AN,Feature,Categorical,,Exertional angina pectoris in the anamnesis. \n\n0: never\n\n1: during the last year \n\n2: one year ago\n\n3: two years ago\n\n4: three years ago\n\n5: 4-5 years ago,,yes
...,...,...,...,...,...,...,...
119,DRESSLER,Target,Binary,,Dressler syndrome,,no
120,ZSN,Target,Binary,,Chronic heart failure,,no
121,REC_IM,Target,Binary,,Relapse of the myocardial infarction,,no
122,P_IM_STEN,Target,Binary,,Post-infarction angina,,no


pandas.DataFrame shape: (1700, 111)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
AGE,62,3.647059,8,0.470588,float64,Numeric - Discrete
SEX,2,0.117647,0,0.0,int64,Binary
INF_ANAM,4,0.235294,4,0.235294,float64,Categoric
STENOK_AN,7,0.411765,106,6.235294,float64,Categoric
FK_STENOK,5,0.294118,73,4.294118,float64,Categoric
...,...,...,...,...,...,...
ANT_CA_S_n,2,0.117647,13,0.764706,float64,Binary
GEPAR_S_n,2,0.117647,17,1.0,float64,Binary
ASP_S_n,2,0.117647,17,1.0,float64,Binary
TIKL_S_n,2,0.117647,16,0.941176,float64,Binary


# Myocardial infarction complications

Golovenkin, S., Shulman, V., Rossiev, D., Shesternya, P., Nikulina, S., Orlova, Y., & Voino-Yasenetsky, V. (2020). Myocardial infarction complications [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C53P5M.

In [16]:
 
# fetch dataset 
mammographic_mass = fetch_ucirepo(id=161) 
  
# data (as pandas dataframes) 
X = mammographic_mass.data.features 
y = mammographic_mass.data.targets 
  
# metadata 
print(mammographic_mass.metadata) 
display(X)
# variable information 
print(mammographic_mass.variables) 
ds.get_cardinality(X)

{'uci_id': 161, 'name': 'Mammographic Mass', 'repository_url': 'https://archive.ics.uci.edu/dataset/161/mammographic+mass', 'data_url': 'https://archive.ics.uci.edu/static/public/161/data.csv', 'abstract': "Discrimination of benign and malignant mammographic masses based on BI-RADS attributes and the patient's age.", 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 961, 'num_features': 5, 'feature_types': ['Integer'], 'demographics': ['Age'], 'target_col': ['Severity'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2007, 'last_updated': 'Thu Mar 28 2024', 'dataset_doi': '10.24432/C53K6Z', 'creators': ['Matthias Elter'], 'intro_paper': {'ID': 448, 'type': 'NATIVE', 'title': 'The prediction of breast cancer biopsy outcomes using two CAD approaches that both emphasize an intelligible decision process.', 'authors': 'M. Elter, R. Schulz-Wendtland, T. Wittenberg', 'v

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density
0,5.0,67.0,3.0,5.0,3.0
1,4.0,43.0,1.0,1.0,
2,5.0,58.0,4.0,5.0,3.0
3,4.0,28.0,1.0,1.0,3.0
4,5.0,74.0,1.0,5.0,
...,...,...,...,...,...
956,4.0,47.0,2.0,1.0,3.0
957,4.0,56.0,4.0,5.0,3.0
958,4.0,64.0,4.0,5.0,3.0
959,5.0,66.0,4.0,5.0,3.0


       name     role     type demographic description units missing_values
0   BI-RADS  Feature  Integer        None        None  None            yes
1       Age  Feature  Integer         Age        None  None            yes
2     Shape  Feature  Integer        None        None  None            yes
3    Margin  Feature  Integer        None        None  None            yes
4   Density  Feature  Integer        None        None  None            yes
5  Severity   Target   Binary        None        None  None             no
pandas.DataFrame shape: (961, 5)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
BI-RADS,7,0.728408,2,0.208117,float64,Categoric
Age,73,7.596254,5,0.520291,float64,Numeric - Discrete
Shape,4,0.416233,31,3.225806,float64,Categoric
Margin,5,0.520291,48,4.994797,float64,Categoric
Density,4,0.416233,76,7.908429,float64,Categoric


# Mammographic Mass

Elter, M. (2007). Mammographic Mass [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C53K6Z.

In [17]:
# fetch dataset 
mammographic_mass = fetch_ucirepo(id=161) 
  
# data (as pandas dataframes) 
X = mammographic_mass.data.features 
y = mammographic_mass.data.targets 
  
# metadata 
print(mammographic_mass.metadata) 
display(X)
print(mammographic_mass.variables) 
ds.get_cardinality(X)

{'uci_id': 161, 'name': 'Mammographic Mass', 'repository_url': 'https://archive.ics.uci.edu/dataset/161/mammographic+mass', 'data_url': 'https://archive.ics.uci.edu/static/public/161/data.csv', 'abstract': "Discrimination of benign and malignant mammographic masses based on BI-RADS attributes and the patient's age.", 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 961, 'num_features': 5, 'feature_types': ['Integer'], 'demographics': ['Age'], 'target_col': ['Severity'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2007, 'last_updated': 'Thu Mar 28 2024', 'dataset_doi': '10.24432/C53K6Z', 'creators': ['Matthias Elter'], 'intro_paper': {'ID': 448, 'type': 'NATIVE', 'title': 'The prediction of breast cancer biopsy outcomes using two CAD approaches that both emphasize an intelligible decision process.', 'authors': 'M. Elter, R. Schulz-Wendtland, T. Wittenberg', 'v

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density
0,5.0,67.0,3.0,5.0,3.0
1,4.0,43.0,1.0,1.0,
2,5.0,58.0,4.0,5.0,3.0
3,4.0,28.0,1.0,1.0,3.0
4,5.0,74.0,1.0,5.0,
...,...,...,...,...,...
956,4.0,47.0,2.0,1.0,3.0
957,4.0,56.0,4.0,5.0,3.0
958,4.0,64.0,4.0,5.0,3.0
959,5.0,66.0,4.0,5.0,3.0


       name     role     type demographic description units missing_values
0   BI-RADS  Feature  Integer        None        None  None            yes
1       Age  Feature  Integer         Age        None  None            yes
2     Shape  Feature  Integer        None        None  None            yes
3    Margin  Feature  Integer        None        None  None            yes
4   Density  Feature  Integer        None        None  None            yes
5  Severity   Target   Binary        None        None  None             no
pandas.DataFrame shape: (961, 5)


Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
BI-RADS,7,0.728408,2,0.208117,float64,Categoric
Age,73,7.596254,5,0.520291,float64,Numeric - Discrete
Shape,4,0.416233,31,3.225806,float64,Categoric
Margin,5,0.520291,48,4.994797,float64,Categoric
Density,4,0.416233,76,7.908429,float64,Categoric


# Cardiotocography

Campos, D. & Bernardes, J. (2000). Cardiotocography [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C51S4N.

In [18]:
# fetch dataset 
cardiotocography = fetch_ucirepo(id=193) 
  
# data (as pandas dataframes) 
X = cardiotocography.data.features 
y = cardiotocography.data.targets 
  
# metadata 
print(cardiotocography.metadata) 
display(X.head())
print(cardiotocography.variables) 
ds.get_cardinality(X)



{'uci_id': 193, 'name': 'Cardiotocography', 'repository_url': 'https://archive.ics.uci.edu/dataset/193/cardiotocography', 'data_url': 'https://archive.ics.uci.edu/static/public/193/data.csv', 'abstract': 'The dataset consists of measurements of fetal heart rate (FHR) and uterine contraction (UC) features on cardiotocograms classified by expert obstetricians.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 2126, 'num_features': 21, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['CLASS', 'NSP'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2000, 'last_updated': 'Fri Mar 15 2024', 'dataset_doi': '10.24432/C51S4N', 'creators': ['D. Campos', 'J. Bernardes'], 'intro_paper': None, 'additional_info': {'summary': '2126 fetal cardiotocograms (CTGs) were automatically processed and the respective diagnostic features measured. The CTGs were also classified

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
0,120,0.0,0.0,0.0,0.0,0.0,0.0,73,0.5,43,...,64,62,126,2,0,120,137,121,73,1
1,132,0.006,0.0,0.006,0.003,0.0,0.0,17,2.1,0,...,130,68,198,6,1,141,136,140,12,0
2,133,0.003,0.0,0.008,0.003,0.0,0.0,16,2.1,0,...,130,68,198,5,1,141,135,138,13,0
3,134,0.003,0.0,0.008,0.003,0.0,0.0,16,2.4,0,...,117,53,170,11,0,137,134,137,13,1
4,132,0.007,0.0,0.008,0.0,0.0,0.0,16,2.4,0,...,117,53,170,9,0,137,136,138,11,1


        name     role        type demographic description units missing_values
0         LB  Feature     Integer        None        None  None             no
1         AC  Feature  Continuous        None        None  None             no
2         FM  Feature  Continuous        None        None  None             no
3         UC  Feature  Continuous        None        None  None             no
4         DL  Feature  Continuous        None        None  None             no
5         DS  Feature  Continuous        None        None  None             no
6         DP  Feature  Continuous        None        None  None             no
7       ASTV  Feature     Integer        None        None  None             no
8       MSTV  Feature  Continuous        None        None  None             no
9       ALTV  Feature     Integer        None        None  None             no
10      MLTV  Feature  Continuous        None        None  None             no
11     Width  Feature     Integer        None       

Unnamed: 0,Card,%_Card,NaN_Values,%_NaN_Values,Type,Class
LB,48,2.257761,0,0.0,int64,Numeric - Discrete
AC,20,0.940734,0,0.0,float64,Numeric - Discrete
FM,102,4.797742,0,0.0,float64,Numeric - Discrete
UC,16,0.752587,0,0.0,float64,Numeric - Discrete
DL,16,0.752587,0,0.0,float64,Numeric - Discrete
DS,2,0.094073,0,0.0,float64,Binary
DP,6,0.28222,0,0.0,float64,Categoric
ASTV,75,3.527752,0,0.0,int64,Numeric - Discrete
MSTV,57,2.681091,0,0.0,float64,Numeric - Discrete
ALTV,87,4.092192,0,0.0,int64,Numeric - Discrete


In [19]:
y['CLASS'].value_counts() 

CLASS
2     579
1     384
6     332
7     252
10    197
8     107
4      81
5      72
9      69
3      53
Name: count, dtype: int64

In [20]:
y['NSP'].value_counts() 

NSP
1    1655
2     295
3     176
Name: count, dtype: int64