# Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
# Load the heart-disease CSV (path is relative to the working directory)
# and preview the first five rows.
data = pd.read_csv("Data/heart_disease_uci.csv")
data.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [3]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(920, 16)

In [4]:
# Number of missing values in each column
data.isna().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [5]:
# Per-column dtype and non-null count summary
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [6]:
# Count the rows that are exact duplicates of an earlier row
data.duplicated().sum()

np.int64(0)

In [7]:
# Checking for inconsistent or invalid entries:
# print the distinct values of each column of interest, in order of first appearance.
columns_to_inspect = [
    "sex", "dataset", "cp", "restecg", "exang",
    "oldpeak", "slope", "ca", "thal", "num",
]
for column in columns_to_inspect:
    print(pd.unique(data[column].values))

['Male' 'Female']
['Cleveland' 'Hungary' 'Switzerland' 'VA Long Beach']
['typical angina' 'asymptomatic' 'non-anginal' 'atypical angina']
['lv hypertrophy' 'normal' 'st-t abnormality' nan]
[False True nan]
[ 2.3  1.5  2.6  3.5  1.4  0.8  3.6  0.6  3.1  0.4  1.3  0.   0.5  1.6
  1.   1.2  0.2  1.8  3.2  2.4  2.   2.5  2.2  2.8  3.   3.4  6.2  4.
  5.6  2.9  0.1  2.1  1.9  4.2  0.9  1.1  3.8  0.7  0.3  4.4  5.   nan
 -1.1 -1.5 -0.1 -2.6 -0.7 -2.  -1.   1.7 -0.8 -0.5 -0.9  3.7]
['downsloping' 'flat' 'upsloping' nan]
[ 0.  3.  2.  1. nan]
['fixed defect' 'normal' 'reversable defect' nan]
[0 2 1 3 4]


In [10]:
# Report how many rows have no restecg value, then discard those rows
restecg_missing = data["restecg"].isna()
print(restecg_missing.sum())
data = data.dropna(subset=["restecg"])

2


In [9]:
# Report how many rows have no exang value, then discard those rows.
# `subset` is passed as a list, like the other dropna call in this notebook;
# a bare string label is only accepted by newer pandas releases.
print(np.sum(data["exang"].isna()))
data = data.dropna(subset=["exang"])

55


In [52]:
# Some oldpeak values are negative, which is unusual.
# Only a small number of rows are affected, so we remove them.
# The count is printed explicitly: a bare expression is only displayed when it
# is the last statement of a cell, so it would otherwise be silently discarded.
negative_oldpeak = data["oldpeak"] < 0
print(np.sum(negative_oldpeak))
# NaN oldpeak values compare False against 0, so those rows are kept here.
data = data.drop(data[negative_oldpeak].index)

In [11]:
# Fixing null "slope" values:
# count how many slopes are missing inside each heuristic group.

# Healthy heart during exercise
healthy_rows = (
    (data["thalch"] > 125)
    & (data["exang"] == False)
    & (data["oldpeak"] == 0.0)
)
print(data.loc[healthy_rows, "slope"].isna().sum())

# Medium ischemia
medium_rows = (
    ((data["thalch"] > 115) & (data["thalch"] <= 125))
    & ((data["oldpeak"] > 1) & (data["oldpeak"] <= 3.5))
)
print(data.loc[medium_rows, "slope"].isna().sum())

# Significant ischemia
significant_rows = (
    (data["thalch"] < 120)
    & (data["exang"] == True)
    & (data["oldpeak"] > 3.5)
)
print(data.loc[significant_rows, "slope"].isna().sum())

155
2
0


In [12]:
# Fill missing slope values from heuristic rules.
# Each rule pairs a row condition (built from thalch / exang / oldpeak, which
# are not modified here) with the slope label to assign.
slope_rules = [
    # Healthy heart slope
    (
        (data["thalch"] > 125)
        & (data["exang"] == False)
        & (data["oldpeak"] == 0.0),
        "upsloping",
    ),
    # Medium ischemia slope
    (
        (data["thalch"] > 115)
        & (data["thalch"] <= 125)
        & (data["oldpeak"] > 1)
        & (data["oldpeak"] <= 3.5),
        "flat",
    ),
    # Significant ischemia slope
    (
        (data["thalch"] < 120)
        & (data["exang"] == True)
        & (data["oldpeak"] > 3.5),
        "downsloping",
    ),
]

# The "slope is still missing" mask is re-evaluated for every rule, so a value
# written by an earlier rule is never overwritten by a later one.
for condition, label in slope_rules:
    data.loc[condition & data["slope"].isna(), "slope"] = label

In [13]:
# TODO: decide how to handle the rows whose slope is still missing (shown below)
data.loc[data["slope"].isna()]

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
326,327,37,Male,Hungary,atypical angina,130.0,283.0,False,st-t abnormality,98.0,False,0.0,,,,0
337,338,39,Male,Hungary,atypical angina,130.0,,False,normal,120.0,False,0.0,,,,0
338,339,39,Male,Hungary,atypical angina,190.0,241.0,False,normal,106.0,False,0.0,,,,0
366,367,43,Female,Hungary,atypical angina,120.0,266.0,False,normal,118.0,False,0.0,,,,0
370,371,44,Female,Hungary,asymptomatic,120.0,218.0,False,st-t abnormality,115.0,False,0.0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,908,58,Male,VA Long Beach,non-anginal,150.0,219.0,False,st-t abnormality,118.0,True,0.0,,,,2
909,910,68,Male,VA Long Beach,non-anginal,134.0,254.0,True,normal,151.0,True,0.0,,,normal,0
913,914,62,Male,VA Long Beach,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,,,,1
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [14]:
# TODO: handle the missing values in "ca".
# dropna=False keeps NaN as its own row in the counts.
data["ca"].value_counts(dropna=False)

ca
NaN    554
0.0    181
1.0     67
2.0     41
3.0     20
Name: count, dtype: int64

# Potential problems to explore
#### 1. Does age or gender affect the likelihood of heart disease?

#### 2. Which risk factors are most common in patients with heart disease?

#### 3. Which patterns in the measured metrics raise the likelihood of heart disease in patients?

# David Arzumanyan

# Daniel Tapia

# Raghav Vaid