In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Variable definitions in the Dataset

- **age**: age of the patient (in years)
- **sex**: sex of the patient
    - 0: female
    - 1: male
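- **cp**: Chest pain type (note: the rows displayed by `heart_df.head()` contain `cp` values from 0 to 3, so the 1–4 codes listed here may be offset by one; verify against the data source)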
    - 1: typical angina
    - 2: atypical angina
    - 3: non-anginal pain
    - 4: asymptomatic
- **trtbps**: resting blood pressure (in mm Hg)
- **chol**: serum cholesterol (in mg/dl)
- **fbs**: fasting blood sugar > 120 mg/dl
    - 0: False
    - 1: True
- **restecg**: resting electrocardiographic results
    - 0: hypertrophy
    - 1: normal
    - 2: having ST-T wave abnormality
- **thalachh**: maximum heart rate achieved
- **exng**: exercise induced angina
    - 0: no
    - 1: yes
- **oldpeak**: ST depression induced by exercise relative to rest
- **slp**: the slope of the peak exercise ST segment
    - 0: downsloping
    - 1: flat
    - 2: upsloping
- **caa**: number of major vessels (0-3)
- **thall**: thallium stress test result
    - 1: fixed defect
    - 2: normal
    - 3: reversible defect
- **output**: the predicted attribute
    - 0: less chance of heart attack
    - 1: more chance of heart attack

In [4]:
# Resolve the dataset location. A copy of heart.csv in the working directory is
# preferred so the notebook runs on any machine; the original author's absolute
# path is kept as a fallback so the existing setup keeps working.
HEART_CSV_CANDIDATES = (
    Path('heart.csv'),
    Path('/Users/daniel/Document/github/heart_attack_analysis/heart.csv'),
)
heart_csv_path = next(
    (candidate for candidate in HEART_CSV_CANDIDATES if candidate.exists()),
    # Nothing found: hand the last candidate to read_csv so it raises its
    # usual FileNotFoundError with a concrete path in the message.
    HEART_CSV_CANDIDATES[-1],
)

heart_df = pd.read_csv(heart_csv_path)
heart_df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Print the per-column dtype and non-null count, plus overall memory usage.
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [6]:
# Report the dataset dimensions as a (rows, columns) tuple.
dataset_shape = heart_df.shape
print('Shape of dataset:', dataset_shape)

Shape of dataset: (303, 14)


In [7]:
# Display the column labels of the loaded frame.
heart_df.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [8]:
# Count the distinct (non-null) values of every column in a single vectorized
# call rather than looping over the columns and appending to a list.
unique_counts = heart_df.nunique()

# `unique` stays available as a plain list, in column order, for any later
# cell that still reads it.
unique = unique_counts.tolist()

# One row per column, labelled exactly as before.
unique_counts.to_frame(name='Unique Variables')

Unnamed: 0,Unique Variables
age,41
sex,2
cp,4
trtbps,49
chol,152
fbs,2
restecg,3
thalachh,91
exng,2
oldpeak,40


In [10]:
# Count missing values per column with a single vectorized reduction instead of
# a per-column Python loop.
missing_counts = heart_df.isnull().sum()

# `null` stays available as a plain list, in column order, for any later
# cell that still reads it.
null = missing_counts.tolist()

# One row per column, labelled exactly as before.
missing_counts.to_frame(name='Missing Values')

Unnamed: 0,Missing Values
age,0
sex,0
cp,0
trtbps,0
chol,0
fbs,0
restecg,0
thalachh,0
exng,0
oldpeak,0


In [11]:
# Feature groups used throughout the analysis.
# Categorical: integer-coded features with a small set of levels.
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
# Numerical: continuous measurements.
num_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
# Target: kept as a one-element list so it can be used like the other groups.
target = ['output']

# Labels are padded so the three lists line up in the printed output.
for group_label, group_columns in (
    ('Categorical columns:', cat_cols),
    ('Numerical columns:  ', num_cols),
    ('Target:             ', target),
):
    print(group_label, group_columns)

Categorical columns: ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
Numerical columns:   ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
Target:              ['output']


In [12]:
# Summary statistics for the numerical features, shown with one feature per row.
numeric_summary = heart_df[num_cols].describe()
numeric_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.366337,9.082101,29.0,47.5,55.0,61.0,77.0
trtbps,303.0,131.623762,17.538143,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.264026,51.830751,126.0,211.0,240.0,274.5,564.0
thalachh,303.0,149.646865,22.905161,71.0,133.5,153.0,166.0,202.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


In [13]:
# Pairwise correlation between the numerical features (DataFrame.corr defaults
# to Pearson). The resulting matrix is symmetric, so no transpose is needed.
num_cols_df = heart_df[num_cols].corr()
num_cols_df

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak
age,1.0,0.279351,0.213678,-0.398522,0.210013
trtbps,0.279351,1.0,0.123174,-0.046698,0.193216
chol,0.213678,0.123174,1.0,-0.00994,0.053952
thalachh,-0.398522,-0.046698,-0.00994,1.0,-0.344187
oldpeak,0.210013,0.193216,0.053952,-0.344187,1.0
