In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

Note: Data is from the UCI Machine Learning Repository:

Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [2]:
# Source: UCI heart-disease dataset (Cleveland subset)
# https://archive.ics.uci.edu/ml/datasets/heart+disease
heart = pd.read_csv(filepath_or_buffer='processed.cleveland.data.csv')

In [3]:
# Peek at the first five records to sanity-check the load.
heart.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


- age: age in years
- sex: 1=male, 0=female
- cp: chest pain type
 - Value 1: typical angina
 - Value 2: atypical angina
 - Value 3: non-anginal pain
 - Value 4: asymptomatic
- trestbps: resting blood pressure (in mm Hg on admission to the hospital)
- chol: serum cholesterol in mg/dl
- fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg: resting electrocardiographic results
 - Value 0: normal
 - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
 - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
- thalach: maximum heart rate achieved in an exercise test
- exang: exercise induced angina (1 = yes; 0 = no)
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
 - Value 1: upsloping
 - Value 2: flat
 - Value 3: downsloping
- ca: number of major vessels (0-3) colored by fluoroscopy
- thal: 
 - Value 3: normal
 - Value 6: fixed defect
 - Value 7: reversible defect
- heart_disease: diagnosis of heart disease (angiographic disease status)
 - Value 0: < 50% diameter narrowing
 - Value 1: > 50% diameter narrowing
"\[This field\] refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0)."


In [4]:
# Summary statistics for every column, numeric or not.
heart.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
unique,,,,,,,,,,,,5.0,4.0,
top,,,,,,,,,,,,0.0,3.0,
freq,,,,,,,,,,,,176.0,166.0,
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,,,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,,,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,,,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,,,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,,,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,,,2.0


In [5]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age              303 non-null float64
sex              303 non-null float64
cp               303 non-null float64
trestbps         303 non-null float64
chol             303 non-null float64
fbs              303 non-null float64
restecg          303 non-null float64
thalach          303 non-null float64
exang            303 non-null float64
oldpeak          303 non-null float64
slope            303 non-null float64
ca               303 non-null object
thal             303 non-null object
heart_disease    303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB
None


In [6]:
# Distinct raw values stored in the `ca` column.
heart['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

In [7]:
# Distinct raw values stored in the `thal` column.
heart['thal'].unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

In [8]:
# Swap every '?' placeholder for a real NaN so pandas counts it as missing.
heart = heart.replace(to_replace='?', value=np.nan)

In [9]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age              303 non-null float64
sex              303 non-null float64
cp               303 non-null float64
trestbps         303 non-null float64
chol             303 non-null float64
fbs              303 non-null float64
restecg          303 non-null float64
thalach          303 non-null float64
exang            303 non-null float64
oldpeak          303 non-null float64
slope            303 non-null float64
ca               299 non-null object
thal             301 non-null object
heart_disease    303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB
None


In [10]:
# Cast `ca` from object (strings plus NaN) to a numeric float column.
heart['ca'] = heart['ca'].astype(float)

In [11]:
# Re-check the dtypes: `ca` should now be reported as float64.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age              303 non-null float64
sex              303 non-null float64
cp               303 non-null float64
trestbps         303 non-null float64
chol             303 non-null float64
fbs              303 non-null float64
restecg          303 non-null float64
thalach          303 non-null float64
exang            303 non-null float64
oldpeak          303 non-null float64
slope            303 non-null float64
ca               299 non-null float64
thal             301 non-null object
heart_disease    303 non-null int64
dtypes: float64(12), int64(1), object(1)
memory usage: 33.2+ KB


In [12]:
# cp: chest pain type
#   1 = typical angina, 2 = atypical angina,
#   3 = non-anginal pain, 4 = asymptomatic
# Assign the result back to the column rather than calling
# heart.cp.replace(..., inplace=True). An in-place call on a column selected
# from the frame is chained assignment: recent pandas warns about it, and
# under Copy-on-Write it no longer updates `heart` at all.
heart['cp'] = heart['cp'].replace({
    1.0: 'typical angina',
    2.0: 'atypical angina',
    3.0: 'non-anginal pain',
    4.0: 'asymptomatic',
})

In [13]:
# Verify the recode: `cp` should now be an object (text) column.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age              303 non-null float64
sex              303 non-null float64
cp               303 non-null object
trestbps         303 non-null float64
chol             303 non-null float64
fbs              303 non-null float64
restecg          303 non-null float64
thalach          303 non-null float64
exang            303 non-null float64
oldpeak          303 non-null float64
slope            303 non-null float64
ca               299 non-null float64
thal             301 non-null object
heart_disease    303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB


In [14]:
# sex: 1 = male, 0 = female
# Assign the result back to the column rather than calling
# heart.sex.replace(..., inplace=True). An in-place call on a column selected
# from the frame is chained assignment: recent pandas warns about it, and
# under Copy-on-Write it no longer updates `heart` at all.
heart['sex'] = heart['sex'].replace({0.0: 'female', 1.0: 'male'})

In [15]:
# Summarise every column again after the recodes above.
heart.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
unique,,2,4,,,,,,,,,,3.0,
top,,male,asymptomatic,,,,,,,,,,3.0,
freq,,206,144,,,,,,,,,,166.0,
mean,54.438944,,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,,0.937294
std,9.038662,,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,,1.228536
min,29.0,,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,,0.0
25%,48.0,,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,,0.0
50%,56.0,,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,,0.0
75%,61.0,,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,,2.0


In [16]:
# slope: the slope of the peak exercise ST segment, recoded to text labels
#   1 -> upsloping, 2 -> flat, 3 -> downsloping
heart['slope'] = heart['slope'].replace(
    {1.0: 'upsloping', 2.0: 'flat', 3.0: 'downsloping'}
)

In [17]:
# Store slope as an ordered categorical (upsloping < flat < downsloping) so
# pandas treats it as a fixed set of categories with a defined order rather
# than as free-form text.
heart['slope'] = pd.Categorical(
    heart['slope'],
    categories=['upsloping', 'flat', 'downsloping'],
    ordered=True,
)

In [18]:
# Preview the first five rows with the recoded sex, cp and slope columns.
heart.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,male,typical angina,145.0,233.0,1.0,2.0,150.0,0.0,2.3,downsloping,0.0,6.0,0
1,67.0,male,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,flat,3.0,3.0,2
2,67.0,male,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,flat,2.0,7.0,1
3,37.0,male,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,downsloping,0.0,3.0,0
4,41.0,female,atypical angina,130.0,204.0,0.0,2.0,172.0,0.0,1.4,upsloping,0.0,3.0,0


In [19]:
# Integer codes behind the categorical, in category order:
# 0 = upsloping, 1 = flat, 2 = downsloping.
heart['slope'].cat.codes

0      2
1      1
2      1
3      2
4      0
5      0
6      2
7      0
8      1
9      2
10     1
11     1
12     1
13     0
14     0
15     0
16     2
17     0
18     0
19     0
20     1
21     0
22     1
23     0
24     1
25     1
26     0
27     2
28     0
29     1
      ..
273    1
274    0
275    1
276    1
277    1
278    0
279    1
280    1
281    0
282    1
283    0
284    0
285    2
286    1
287    1
288    0
289    2
290    1
291    0
292    2
293    0
294    1
295    0
296    1
297    1
298    1
299    1
300    1
301    1
302    0
Length: 303, dtype: int8

In [20]:
# Summarise all columns; `slope` is now a categorical, so it gets
# count/unique/top/freq instead of numeric statistics.
heart.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303,299.0,301.0,303.0
unique,,2,4,,,,,,,,3,,3.0,
top,,male,asymptomatic,,,,,,,,upsloping,,3.0,
freq,,206,144,,,,,,,,142,,166.0,
mean,54.438944,,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,,0.672241,,0.937294
std,9.038662,,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,,0.937438,,1.228536
min,29.0,,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,,0.0,,0.0
25%,48.0,,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,,0.0,,0.0
50%,56.0,,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,,0.0,,0.0
75%,61.0,,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,,1.0,,2.0


In [21]:
# With no `include` argument, describe() summarises only the numeric columns.
heart.describe()

Unnamed: 0,age,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,heart_disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,303.0
mean,54.438944,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,0.672241,0.937294
std,9.038662,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.937438,1.228536
min,29.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0
25%,48.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,0.0,0.0
50%,56.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,0.0,0.0
75%,61.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,1.0,2.0
max,77.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0
