In [1]:
import pandas as pd
import numpy as np

In [2]:
path = 'data/cardio_train.csv'
df = pd.read_csv(path, 
                 delimiter=';',
                 index_col='id', header=0)

### Description of the different variables that will be analyzed in this notebook
- Age is in days.
- Gender: 1 - women, 2 - men
- Height: cm
- Weight: km
- ap_hi: Systolic Blood Pressure
- ap_lo: Diastolic Blood Pressure
- Cholesterol: 1-normal, 2-above normal, 3-well above normal
- Gluc: normal, 2: above normal, 3: well above normal
- Smoke - Binary
- Alch - Binary
- Cardio - Binary
*For the binary variables 0 is no and 1 is yes*


# Data Inspection

In [3]:
df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
age            70000 non-null int64
gender         70000 non-null int64
height         70000 non-null int64
weight         70000 non-null float64
ap_hi          70000 non-null int64
ap_lo          70000 non-null int64
cholesterol    70000 non-null int64
gluc           70000 non-null int64
smoke          70000 non-null int64
alco           70000 non-null int64
active         70000 non-null int64
cardio         70000 non-null int64
dtypes: float64(1), int64(11)
memory usage: 6.9 MB


In [5]:
df.shape

(70000, 12)

In [6]:
df.isna().values.any() #Check to see if there are any NA Values

False

In [7]:
#How many unactive smokers are there in the dataset?
unactive_smokers = df.loc[(df.active == 0) & 
                                  (df.smoke == 1)]
len(unactive_smokers)

1007

In [16]:
#What percent of CV patients are smokers in this data set

CV_pts = df.loc[(df.cardio == 1)]
smoke_cv_pts = CV_pts.loc[(CV_pts.smoke == 1)]

len(smoke_cv_pts) / len(df.loc[(df.cardio == 1)])

0.08373595585922983

In [23]:
# What percent of CV patients are unactive v unactive
active_pt = CV_pts.loc[(CV_pts.smoke == 1)]
unactive_pt = CV_pts.loc[(CV_pts.smoke == 0)]

unactive_pct = len(unactive_pt) / len(CV_pts) * 100
active_pvt = len(active_pt) / len(CV_pts) * 100

print(f'{unactive_pct} % of patients with Cardiovascular disease are unactive and {active_pvt} % of patients with cardiovascular disease are active')

91.62640441407703 % of patients with Cardiovascular disease are unactive and 8.373595585922983 % of patients with cardiovascular disease are active


In [25]:
#What percent of unactive smokers have cardio disease?

pct = len(unactive_smokers.loc[(df.cardio == 1)]) / len(unactive_smokers) * 100

print(f'{pct}% of patients with Cardiovascular diseasea are both unactive and smokers.')

55.41211519364448% of patients with Cardiovascular diseasea are both unactive and smokers.
