# Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the heart-disease dataset from a path relative to the notebook's working directory.
data = pd.read_csv("Data/heart_disease_uci.csv")
# Preview the first rows of the loaded frame.
data.head()

# Potential problems to explore
#### 1. Does age or gender affect the likelihood of heart disease?

#### 2. Which risk factors are most common in patients with heart disease?

#### 3. What are some of the metric patterns that raise the likelihood of heart disease in patients?

#### 4. Do men and women with heart disease tend to experience different types of chest pain?

# David Arzumanyan

In [None]:
# Dimensions of the frame as (rows, columns).
data.shape

In [None]:
# Replace the terse source column codes with descriptive names; the rest of
# the notebook refers to the columns by these new names.
column_labels = {
    "trestbps": "Resting_Blood_Pressure",
    "fbs": "Fasting_Blood_Sugar_High",
    "thalch": "Max_Heart_Rate_Achieved",
    "exang": "Exercise_Induced_Angina",
    "oldpeak": "ST_Depression_Exercise",
    "ca": "Major_Vessels",
    "thal": "Thalassemia_Type",
}
data = data.rename(columns=column_labels)

In [None]:
# Preview the first rows to confirm the new column names.
data.head()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Column dtypes and non-null counts before cleaning.
data.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Print the distinct values of each column of interest so that inconsistent
# or invalid entries (unexpected labels, odd numbers, NaN) stand out.
columns_to_inspect = [
    "sex",
    "dataset",
    "cp",
    "restecg",
    "Exercise_Induced_Angina",
    "ST_Depression_Exercise",
    "slope",
    "Major_Vessels",
    "Thalassemia_Type",
    "num",
]
for column in columns_to_inspect:
    print(pd.unique(data[column].values))

In [None]:
# Report how many rows lack a resting-ECG result, then discard those rows.
missing_restecg = data["restecg"].isna()
print(missing_restecg.sum())
data = data[~missing_restecg]

In [None]:
# Report how many rows lack an exercise-induced-angina value, then drop them.
print(np.sum(data["Exercise_Induced_Angina"].isna()))
# Pass `subset` as a list, matching the restecg cell; a bare string is only
# accepted by newer pandas releases.
data = data.dropna(subset=["Exercise_Induced_Angina"])

In [None]:
# Some ST_Depression_Exercise values are negative, which is unusual.
# Print how many there are (so the number is visible in the cell output)
# and remove those rows; rows with a missing value are kept.
negative_st = data["ST_Depression_Exercise"] < 0
print(negative_st.sum())
data = data.drop(data[negative_st].index)

In [None]:
# Fixing null "slope" values: first count how many missing slopes fall into
# each exercise-response profile.
max_hr = data["Max_Heart_Rate_Achieved"]
st_dep = data["ST_Depression_Exercise"]
angina = data["Exercise_Induced_Angina"]
slope_missing = data["slope"].isna()

# Healthy heart during exercise
healthy = (max_hr > 125) & (angina == False) & (st_dep == 0.0)
print((healthy & slope_missing).sum())

# Medium ischemia
medium = (max_hr > 115) & (max_hr <= 125) & (st_dep > 1) & (st_dep <= 3.5)
print((medium & slope_missing).sum())

# Significant ischemia
significant = (max_hr < 120) & (angina == True) & (st_dep > 3.5)
print((significant & slope_missing).sum())

In [None]:
# Fill missing "slope" values from the exercise-response profile of the row.
# Each rule pairs a row condition with the slope label written where the
# slope is currently missing. The ST-depression ranges of the three rules do
# not overlap, so at most one rule applies to any row.
max_hr = data["Max_Heart_Rate_Achieved"]
st_dep = data["ST_Depression_Exercise"]
angina = data["Exercise_Induced_Angina"]

slope_rules = [
    # Healthy heart slope
    ((max_hr > 125) & (angina == False) & (st_dep == 0.0), "upsloping"),
    # Medium ischemia slope
    ((max_hr > 115) & (max_hr <= 125) & (st_dep > 1) & (st_dep <= 3.5), "flat"),
    # Significant ischemia slope
    ((max_hr < 120) & (angina == True) & (st_dep > 3.5), "downsloping"),
]

for condition, slope_label in slope_rules:
    data.loc[condition & data["slope"].isna(), "slope"] = slope_label

In [None]:
# Print how many slope values are still missing (so the number is visible in
# the cell output), then drop those rows.
print(data["slope"].isna().sum())
data = data.dropna(subset=["slope"])

In [None]:
# Fill missing "Major_Vessels" (major vessels observed) values with -1 as a
# placeholder marking that the value is missing. The result is written back
# to "Major_Vessels" itself rather than to a separate, misspelled column.
# NOTE(review): -1 is a sentinel, not a measurement; exclude it when
# computing means or correlations on this column.
data["Major_Vessels"] = data["Major_Vessels"].fillna(-1)

In [None]:
# "Thalassemia_Type" (thallium stress test column): label missing results
# explicitly as "unknown".
thal_filled = data["Thalassemia_Type"].fillna("unknown")
data = data.assign(Thalassemia_Type=thal_filled)

In [None]:
# "Fasting_Blood_Sugar_High" — explanatory note only; this cell does not modify `data`.
'''A fasting blood sugar over 120 mg/dL suggests hyperglycemia, which may indicate Diabetes mellitus, or
Impaired glucose tolerance (pre-diabetes). We will leave null values to indicate not performed tests''' 

| Cholesterol level (mg/dL) | Classification  | Meaning                       |
| ------------------------- | --------------- | ----------------------------- |
| `< 200`                   | Desirable       | Normal range                  |
| `200–239`                 | Borderline high | Elevated risk                 |
| `≥ 240`                   | High            | Risk factor for heart disease |


In [None]:
# "chol": mark missing cholesterol readings with the placeholder -1.
# NOTE(review): -1 is a sentinel, not a measurement; it will be included in
# any mean, box plot or correlation computed on this column.
chol_filled = data["chol"].fillna(-1)
data = data.assign(chol=chol_filled)

In [None]:
# Keep Resting_Blood_Pressure numeric. Missing readings stay as NaN instead
# of being replaced by a text placeholder: a string in this column would turn
# it into object dtype, drop it from the numeric summaries and correlation
# heatmap, and break the box plot drawn for this feature.
data["Resting_Blood_Pressure"] = pd.to_numeric(data["Resting_Blood_Pressure"], errors="coerce")

In [None]:
# NaN values in "Fasting_Blood_Sugar_High" (originally "fbs") are intentionally left in place to flag tests that were not performed.
# Column dtypes and non-null counts after cleaning.
data.info()

#### 1. Does age or gender affect the likelihood of heart disease?

In [None]:
# Binary target column: 1 when "num" is above 0, otherwise 0.
disease_flag = data["num"].gt(0)
data["has_disease"] = disease_flag.astype(int)
data["has_disease"]

In [None]:
# Fraction of rows in each has_disease class.
data["has_disease"].value_counts(normalize=True)

In [None]:
# Stacked age histogram, coloured by heart-disease status.
ax = sns.histplot(data=data, x="age", hue="has_disease", multiple="stack")
ax.set_title("Age distribution by Heart Disease")

In [None]:
# Number of patients per sex, split by heart-disease status.
ax = sns.countplot(data=data, x="sex", hue="has_disease")
ax.set_title("Male/Female Heart Disease count")

In [None]:
# Share of patients with heart disease per sex (mean of the 0/1 flag).
ax = sns.barplot(data=data, x="sex", y="has_disease", estimator=np.mean)
ax.set_title("Male/Female Heart Disease ratio")

In [None]:
# Severity of heart disease based on gender, for patients with num > 0.
has_disease = data[data["num"] > 0]
ax = sns.boxplot(x="sex", y="num", data=has_disease)
# Label the axis after plotting: a label set beforehand can be overwritten
# by seaborn with the column name, depending on the seaborn version.
ax.set_ylabel("Severity of disease")

In [None]:
# Heart disease presence and severity based on age and gender
plt.figure(figsize=(9,5))
ax = sns.boxplot(x="num", y="age", data=data, hue="sex")
# Label the axis after plotting: a label set beforehand can be overwritten
# by seaborn with the column name, depending on the seaborn version.
ax.set_xlabel("Severity of disease")

In [None]:
# Mean age per severity level ("num"), one line per sex.
plt.figure(figsize=(9,5))
ax = sns.lineplot(x="num", y="age", data=data, hue="sex")
# Label the axis after plotting so the custom label does not depend on
# whether seaborn preserves a label set before the plot is drawn.
ax.set_xlabel("Severity of disease")

#### 4. Do men and women with heart disease tend to experience different types of chest pain?

| Value | Name                 | Meaning                                                                                                                                                | Typical Risk of Heart Disease                             |
| ----- | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------- |
| **1** | **Typical Angina**   | Classic chest pain caused by reduced blood flow to the heart (ischemia). Usually occurs with exertion or stress and relieved by rest or nitroglycerin. | **High**                                                  |
| **2** | **Atypical Angina**  | Chest discomfort that shares some features with angina but not all — may occur at rest or vary in character.                                           | **Moderate**                                              |
| **3** | **Non-anginal Pain** | Chest pain *not* related to heart muscle oxygen deprivation (e.g., due to anxiety, indigestion, or musculoskeletal causes).                            | **Low**                                                   |
| **4** | **Asymptomatic**     | No chest pain at all, but patient shows other signs of heart disease (like abnormal ECG or stress test results).                                       | **Often High** — especially in older or diabetic patients |


In [None]:
# Count of each chest pain type per sex, among patients with heart disease.
sns.countplot(data=has_disease, x="sex", hue="cp")

In [None]:
# Age distribution per chest pain type, split by sex, among patients with
# heart disease.
plt.figure(figsize=(9, 5))
sns.boxplot(data=has_disease, x="cp", y="age", hue="sex")

In [None]:
# Contingency table for the heatmap: chest pain type (rows) by sex (columns),
# counted over patients with heart disease.
cp_gender = pd.crosstab(index=has_disease["cp"], columns=has_disease["sex"])
cp_gender

In [None]:
# Annotated heatmap of the chest-pain-type by gender counts.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(cp_gender, annot=True, fmt="d", cmap="coolwarm", ax=ax)
ax.set_title("Chest pain types by gender (Patients with Heart Disease)")
ax.set_xlabel("Gender")
ax.set_ylabel("Chest Pain Type")

# Daniel Tapia

# Raghav Vaid

In [None]:
# Summary statistics for the numeric columns of the cleaned frame.
data.describe()

In [None]:
# Split patients by disease status inside this cell so it runs on its own
# instead of relying on a frame left over from an earlier section.
has_disease = data[data["has_disease"] == 1]
no_disease = data[data["has_disease"] == 0]

numeric_cols = data.select_dtypes(include=["number"]).columns

# Mean of every numeric column for each group, side by side.
comparison = pd.DataFrame({
    "With disease": has_disease[numeric_cols].mean(),
    "Without disease": no_disease[numeric_cols].mean()
})

comparison

'''
id: a unique identifier for each patient/record

age: age of patient (years)

Resting_Blood_Pressure: resting blood pressure (in mm Hg) measured
when the patient is at rest

chol: serum cholesterol (in mg/dL)

Max_Heart_Rate_Achieved: max heart rate achieved during exercise

ST_Depression_Exercise: how much the ST segment goes down during exercise 
compared to rest (ST segment is the flat part)

Major_Vessels: number of major vessels (0-3) colored by fluoroscopy
''' 

In [None]:
# Preview the first rows of the frame used for the plots in this section.
data.head()

In [None]:
# Features compared between patients with and without heart disease.
# (seaborn, matplotlib and pandas are already imported in the first cell.)
features = ["age", "Resting_Blood_Pressure", "chol", "Max_Heart_Rate_Achieved", "ST_Depression_Exercise", "Major_Vessels"]

# Draw one box plot per feature, split by has_disease.
for col in features:
    # Coerce to numeric so a column holding text placeholders is still drawn
    # as a numeric distribution; non-numeric entries become NaN and are
    # left out of the box.
    values = pd.to_numeric(data[col], errors="coerce")
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.boxplot(x=data["has_disease"], y=values, ax=ax)
    ax.set_xlabel("has_disease")
    ax.set_ylabel(col)
    ax.set_title(f"{col} vs Heart Disease")
    plt.show()

In [None]:
# Pairwise correlations between the int64/float64 columns, shown as an
# annotated heatmap.
numeric_data = data.select_dtypes(include=["int64", "float64"])
correlations = numeric_data.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlations with Heart Disease")
plt.show()

# Conclusion

Based on the data analysis and heatmap correlations, the key factors that increase the likelihood of heart disease are:

- Higher `ST_Depression_Exercise` (originally `oldpeak`; greater ST depression)
- Lower `Max_Heart_Rate_Achieved` (originally `thalch`; lower max heart rate)
- Slightly older age

Other factors like cholesterol and resting blood pressure show weak or inconsistent relationships. This aligns with known medical trends, where exercise-induced stress test results are strong indicators of cardiovascular risk.