In [2]:
#Imports initial packages
%pip install pandas numpy
import pandas as pd
import numpy as np


[notice] A new release of pip is available: 24.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [3]:
#Prints first 5 rows of our dataset
df = pd.read_csv('heart_disease_uci.csv')

df.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
#Point 1 - Shape
df.shape

print(f'We have {df.shape[0]} rows and {df.shape[1]} columns in our dataset')
print(f'Our total number of data points is {df.shape[0] * df.shape[1]} points')

We have 920 rows and 16 columns in our dataset
Our total number of data points is 14720 points


Within the dataset there are 920 rows (patients) and then 16 different columns, all representing different statistics relating to the patients. This gives us 14720 data points across the dataset.

The columns are 'id' = id/index of the row, 'age' = age of the patient, 'dataset' = what hospital the data was sourced from, 'cp' = what kind of chest pain that the patient is experiencing, 'trestbps' = resting blood pressure in mm Hgs when admitted to the hospital, 'chol' = cholesterol levels in the patient, 'fbs' = fasting blood sugar over 120 mg/dl, 'restecg' = resting electrocardiographic results, 'thalach' = maximum heart rate achieved, 'exang' = exercise induced angina, 'oldpeak' = ST depression induced by exercise relative to rest, 'slope' = the slope of peak exercise ST segment, 'ca' = number of major vessels affected by fluroscopy, 'thal' = defect type, and 'num' = predicted attribute.

In [5]:
#Point 2 - Column Names
print("Column Names:")
print(df.columns.tolist())

print(f"\n Total columns: {len(df.columns)}")

Column Names:
['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

 Total columns: 16


In [6]:
#Point 3 - Data Types
print("Data Types:")
print(df.dtypes)

print("\n" + "="*50)
print("Data Type Summary:")
print(df.dtypes.value_counts())

Data Types:
id            int64
age           int64
sex             str
dataset         str
cp              str
trestbps    float64
chol        float64
fbs          object
restecg         str
thalch      float64
exang        object
oldpeak     float64
slope           str
ca          float64
thal            str
num           int64
dtype: object

Data Type Summary:
str        6
float64    5
int64      3
object     2
Name: count, dtype: int64


In [7]:
# Point 4 - Head
print("First 5 Rows:")
df.head()

First 5 Rows:


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [8]:
# Point 5 - Tail
print("Last 5 Rows:")
df.tail()

Last 5 Rows:


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,,,,1


In [9]:
# Point 6 - Memory Usage
print("Memory Usage by Column:")
print(df.memory_usage(deep=True))

# Total memory in MB
total_memory_mb = df.memory_usage(deep=True).sum() / 1e6
print(f"\n Total Memory Usage: {total_memory_mb:.2f} MB")

Memory Usage by Column:
Index         132
id           7360
age          7360
sex         49148
dataset     53820
cp          56530
trestbps     7360
chol         7360
fbs         32760
restecg     53848
thalch       7360
exang       32900
oldpeak      7360
slope       43727
ca           7360
thal        41810
num          7360
dtype: int64

 Total Memory Usage: 0.42 MB


In [10]:
# Point 7 - Missing Values
print("Missing Values by Column:")

# Calculate raw counts and percentages
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

# Combine them into a single summary table
missing_summary = pd.DataFrame({
    'Missing Values': missing,
    'Percentage (%)': missing_pct
})

# Display the table, rounding percentages to 2 decimal places for cleanliness
print(missing_summary.round(2))

print("\n" + "="*50)

# Calculate total missing values
total_missing = missing.sum()
print(f" Total Missing Values: {total_missing}")

# Check condition
if total_missing == 0:
    print("Great! No missing values - complete dataset!")
else:
    print(f" {total_missing} missing values need attention")

Missing Values by Column:
          Missing Values  Percentage (%)
id                     0            0.00
age                    0            0.00
sex                    0            0.00
dataset                0            0.00
cp                     0            0.00
trestbps              59            6.41
chol                  30            3.26
fbs                   90            9.78
restecg                2            0.22
thalch                55            5.98
exang                 55            5.98
oldpeak               62            6.74
slope                309           33.59
ca                   611           66.41
thal                 486           52.83
num                    0            0.00

 Total Missing Values: 1759
 1759 missing values need attention


In [11]:
# Point 8 - Duplicates
duplicate_count = df.duplicated().sum()
duplicate_pct = (duplicate_count / len(df)) * 100

print(f" Duplicate Rows: {duplicate_count:,}")
print(f" Percentage: {duplicate_pct:.2f}%")

if duplicate_count > 0:
    print(f"\n Warning: {duplicate_pct:.2f}% of rows are duplicates!")
    print("   This needs investigation in Week 4 (Data Cleaning)")

 Duplicate Rows: 0
 Percentage: 0.00%


In [12]:
# Point 9 - Descriptive Statistics
print("Descriptive Statistics:")
df.describe().T

Descriptive Statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,920.0,460.5,265.725422,1.0,230.75,460.5,690.25,920.0
age,920.0,53.51087,9.424685,28.0,47.0,54.0,60.0,77.0
trestbps,861.0,132.132404,19.06607,0.0,120.0,130.0,140.0,200.0
chol,890.0,199.130337,110.78081,0.0,175.0,223.0,268.0,603.0
thalch,865.0,137.545665,25.926276,60.0,120.0,140.0,157.0,202.0
oldpeak,858.0,0.878788,1.091226,-2.6,0.0,0.5,1.5,6.2
ca,309.0,0.676375,0.935653,0.0,0.0,0.0,1.0,3.0
num,920.0,0.995652,1.142693,0.0,0.0,1.0,2.0,4.0


In [13]:
# Point 10 - Unique Values
print("Unique Values per Column:")
unique_counts = df.nunique().sort_values()
print(unique_counts)

Unique Values per Column:
sex           2
fbs           2
exang         2
restecg       3
slope         3
thal          3
cp            4
dataset       4
ca            4
num           5
age          50
oldpeak      53
trestbps     61
thalch      119
chol        217
id          920
dtype: int64


In [14]:
#Categorizes columns by their unique counts to determine cardinality
print("\n" + "="*50)
print("Feature Classification by Unique Values:")
print("="*50)

binary = unique_counts[unique_counts == 2].index.tolist()
low_cardinality = unique_counts[(unique_counts > 2) & (unique_counts <= 5)].index.tolist()
high_cardinality = unique_counts[unique_counts > 5].index.tolist()

print(f"\nðŸŸ¢ BINARY (2 values): {binary}")
print(f"\nðŸ”µ LOW CARDINALITY (3-5 values): {low_cardinality}")
print(f"\nðŸŸ£ HIGH CARDINALITY (>5 values): {high_cardinality}")


Feature Classification by Unique Values:

ðŸŸ¢ BINARY (2 values): ['sex', 'fbs', 'exang']

ðŸ”µ LOW CARDINALITY (3-5 values): ['restecg', 'slope', 'thal', 'cp', 'dataset', 'ca', 'num']

ðŸŸ£ HIGH CARDINALITY (>5 values): ['age', 'oldpeak', 'trestbps', 'thalch', 'chol', 'id']


## Part 2: Data Dictionary (20 points)

Complete the following data dictionary. For each column, you must:
1. **Research** the clinical meaning (these are standard cardiac assessment terms)
2. **Identify** the feature type (Continuous, Discrete, Categorical-Nominal, Categorical-Ordinal, Binary, Identifier)
3. **Document** the valid values/range you observe
4. **Note** any issues or questions

| Column | Description | Feature Type | Valid Values/Range | Notes/Issues |
|--------|-------------|--------------|-------------------|--------------|
| `id` | Unique id for each patient | identifier | | |
| `age` | Age of the patient in years | discrete | 28 - 77 | |
| `sex` | Male/Female | categorical | categorical nominal | |
| `dataset` | location of data collection | categorical nominal | | |
| `cp` | chest pain type (typical angina, atypical angina, non-anginal, asymptomatic) | categorical nominal | | |
| `trestbps` | resting blood pressure (resting blood pressure (in mm Hg on admission to the hospital) | continuous | 0 - 200 | |
| `chol` | serum cholesterol in mg/dl | continuous | 0 - 603 | |
| `fbs` | if fasting blood sugar > 120 mg/dl | binary | | |
| `restecg` | resting electrocardiographic results. Values: (normal, stt abnormality, lv hypertrophy) | categorical nominal | | |
| `thalch` | maximum heart rate achieved | continuous| 60 - 202 | |
| `exang` | exercise-induced angina (True/ False) | binary | | |
| `oldpeak` | ST depression induced by exercise relative to rest | continuous | -2.6 - 6.2 | |
| `slope` | the slope of the peak exercise ST segment | categorical nominal | | high amounts of null vals |
| `ca` | number of major vessels (0-3) colored by fluoroscopy | discrete | 0 - 3 | high amounts of null vals |
| `thal` | [normal; fixed defect; reversible defect] | categorical nominal | | high amounts of null vals |
| `num` | the predicted attribute | discrete | 0 - 4 | |
### Clinical Research Questions for Version B

Answer these questions based on your research (you may need to use Google):

**1. What is hypertension? According to current American Heart Association guidelines, what are the blood pressure thresholds for normal, elevated, Stage 1 hypertension, and Stage 2 hypertension?**

Your answer: Hypertension, also known as high blood pressure is when the force of your blood pressing against your arteries is too high. Hypertension can lead to heart issues, strokes heart disease, and various other medical issues. Normal blood pressure is considered to be when systolic mm Hg is less than 120 and diastolic mm Hg is less than 80. Elevated blood pressure is when systolic mm Hg is between 120-129 and diastolic mm Hg is less than 80. Stage 1 hypertension is when systolic mm Hg is between 130-139 and diastolic mm Hg is between 80-89. Stage 2 hypertension is when systolic mm Hg is 140 or higher and diastolic mm Hg is higher than 90.

---

**2. What is resting ECG (restecg)? What do "normal," "ST-T abnormality," and "left ventricular hypertrophy" findings indicate about heart health?**

Your answer: Resting ECG (restecg) is a resting 12 lead cardiography test that detects abnormalities such as arrhythmias, coronary heart disease, and other conditions. An ECG takes a snapshot of the electrical activity of your heart. A normal finding means that the timing and strength of a heart falls within the normal range, while this is a good finding it doesn't rule out all heart conditions. ST-T abnormalities are when there is a change in the ST segment and the T wave, with the heart muscle resetting in between beats. The most common causes for this are Ischemia where the heart isn't getting enough oxygen/blood due to clogged arteries, electrolyte imbalances due to calcium and potassium, or medication side effects. Left Ventricular Hypertrophy/LVH is when the muscle wall of the main pumping chamber/left ventricle becomes thicker/enlarged. The causes of this are most commonly high blood pressure where the increased pressure makes the heart have to work harder to pump blood causing the muscle to become larger, valve issues where blood is harder to squeeze through and move, and athletes heart where increased physical activity can cause the heart to become stronger due to long term endurance training (marathon runners). These ECG findings are all indicators that can help us determine what conditions patients may be facing. 

---

**3. What is fasting blood sugar (fbs)? Why is the threshold of 120 mg/dl clinically significant? What does elevated fasting blood sugar indicate about diabetes risk?**

Your answer: Fasting blood sugar is a test where patient's blood sugar is measured after they haven't consumed anything other than water for 8-12 hours. A normal range is considered to be less than 100 mg/dl, Prediabetes 100-125mg/dl, and Diabetes is considered to be 126 mg/dl. 120 mg/dl places a patient very firmly in the range of prediabetes, with symptoms most likely arising already. Elevated fasting blood sugar while not diabetes yet already causes damage to the body with the lining of blood vessels, neuropathy, retinopathy, kidney damage, and loss of beta cells already possibly occuring within the body. 

---

**4. What does the number of major vessels colored by fluoroscopy (ca) tell us? Why might having more blocked vessels indicate worse heart disease?**

Your answer: Fluoroscopy shows doctors how well blood actually flows throughout the body, in a healthy heart they should get a score of 3 meaning that the dye is flowing and apparent in all 3 major blood vessels (Left Anterior Descending (LAD), Left Circumflex (LCX), and Right Coronary Artery (RCA). Scores lower than 3 means that one is blocked the patient is at risk of heart disease/failure. Most of the time heart disease starts in one area/one vessel meaning that if multiple are blocked, the disease is very widespread or the patient may be undergoing multiple conditions. The body cannot function healthily if even one of these vessels is damaged/at risk.

---

In [15]:
## Part 3: Data Validation (15 points)

### 3.1 Blood Pressure Validation (5 points)

# Check for extreme outliers
impossible_high = df[df['trestbps'] > 250]
impossible_low = df[(df['trestbps'] < 60) & (df['trestbps'] != 0)]

print(f"Values > 250: {len(impossible_high)}")
print(f"Values < 60 (excluding 0): {len(impossible_low)}")

# Create a comparison table
cols_to_check = ['trestbps', 'chol', 'oldpeak', 'ca']

for col in cols_to_check:
    official_nulls = df[col].isnull().sum()
    zero_counts = (df[col] == 0).sum()
    print(f"Column: {col}")
    print(f"  - .isnull() reports: {official_nulls}")
    print(f"  - Zeros found:      {zero_counts}")
    print("-" * 25)


Values > 250: 0
Values < 60 (excluding 0): 0
Column: trestbps
  - .isnull() reports: 59
  - Zeros found:      1
-------------------------
Column: chol
  - .isnull() reports: 30
  - Zeros found:      172
-------------------------
Column: oldpeak
  - .isnull() reports: 62
  - Zeros found:      370
-------------------------
Column: ca
  - .isnull() reports: 611
  - Zeros found:      181
-------------------------


- Are there any impossible blood pressure values?
There are 0 impossible blood pressure values ie none over 250 or under 60.
- How should values of 0 be treated - as missing data or as valid values?
Values of 0 can be viewed by a case by case basis for interpretation, but in the case of "trestbps" they mean missing data due to them being an impossible value and 
- Does this match what `.isnull()` reports for this column?
For trestbps it does not match it because there are 59 cases of .isnull results and only one zero. This could be attributed to human error and a result of the dataset being compiled from 4 different hospitals.

Cholesterol

In [16]:
import pandas as pd
import numpy as np

# 1. Basic counts of missingness
official_nulls = df['chol'].isnull().sum()
zero_values = (df['chol'] == 0).sum()

# 2. Statistical summary of non-zero data
# We filter out zeros to get a true sense of the "real" distribution
real_chol = df[df['chol'] > 0]['chol']

print(f"--- Cholesterol Health Check ---")
print(f"Official NaN values: {official_nulls}")
print(f"Zero values (Hidden Nulls): {zero_values}")
print(f"Total 'Missing' Data: {official_nulls + zero_values}")
print(f"\n--- Distribution of Valid Data ---")
print(f"Min (Non-zero): {real_chol.min()}")
print(f"Max:            {real_chol.max()}")
print(f"Median:         {real_chol.median()}")

# 3. Identify extreme outliers (Physiologically suspicious)
# Values over 500 are rare and represent severe hypercholesterolemia
extreme_high = df[df['chol'] > 500]
print(f"\nRecords with Chol > 500: {len(extreme_high)}")

# 4. Final check: Does .isnull() match reality?
matches = (official_nulls == (official_nulls + zero_values))
print(f"\nDoes .isnull() report all missing data? {matches}")

--- Cholesterol Health Check ---
Official NaN values: 30
Zero values (Hidden Nulls): 172
Total 'Missing' Data: 202

--- Distribution of Valid Data ---
Min (Non-zero): 85.0
Max:            603.0
Median:         239.5

Records with Chol > 500: 4

Does .isnull() report all missing data? False


- Did you find any cholesterol values of 0?
There were 172 cholesterol values of 0 and 30 null values, giving us 202 instances of missing cholesterol.
- Is a cholesterol level of 0 clinically possible?
A cholesterol level of 0 is not clinically possible unless through very very rare circumstances with genetic disorders occurring.
- How many such impossible values exist?
The impossible values that exist within our dataset for cholesterol are the 172 values of 0 and then 4 records that have chol levels over 500.
- What would happen if you calculated the mean cholesterol without handling this?
- 