In [1]:
# Import depndencies 
import pandas as pd 
import numpy as np 

In [2]:
# Loading the files 
file_path = "s3://cardiovasular-data/raw/cardio_train.csv"

In [4]:
# Read the data file and store it in a Pandas DataFrame.
cardio_df = pd.read_csv(file_path, delimiter=";")
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [5]:
# Formatting the year as the whole number
cardio_df['New_age'] = cardio_df['age'].map("{:.0f}".format)

In [6]:
# Format the age /365
cardio_df['New_age'] = cardio_df['age'] / 365

# Make the whole number
cardio_df['New_age'] = cardio_df['New_age'].map("{:.0f}".format)

# replace 'age' column.
cardio_df.drop('age', axis=1, inplace=True)

# Rearrange the columns
cardio_df = cardio_df[['id', 'New_age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 
                                    'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']] 

# Rename the new 'age' column 
cardio_df = cardio_df.rename(columns = {"New_age":"Age"}) 
cardio_df.head()

Unnamed: 0,id,Age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0


In [7]:
# Check data types
cardio_df.dtypes

id               int64
Age             object
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [8]:
cardio_df.head(10)

Unnamed: 0,id,Age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0
5,8,60,1,151,67.0,120,80,2,2,0,0,0,0
6,9,61,1,157,93.0,130,80,3,1,0,0,1,0
7,12,62,2,178,95.0,130,90,3,3,0,0,1,1
8,13,48,1,158,71.0,110,70,1,1,0,0,1,0
9,14,54,1,164,68.0,110,60,1,1,0,0,0,0


In [9]:
# To make the data clean remove  outliers because large set of data in the important catagoreis 
# Note I need to do research to understand alco, ap_lo, ap_hi , cardio in this schedule what does it mean .?

In [10]:
# ap hi maximum heart rate is around 220 beats per minute and the minumum is 100
#  Increasingly, experts pin an ideal resting heart rate at between 50 to 70 beats per minute

# Removing  outliers in the 'ap_hi' coulmn that are above 220
cardio_df.drop(cardio_df[cardio_df['ap_hi'] > 220].index, inplace = True)

# Remove outliers in the 'ap_hi' coulmn that are below 100
cardio_df.drop(cardio_df[cardio_df['ap_hi'] < 100].index, inplace = True)

In [11]:
# ap lo maximum heart rate is around 150 beats per minute and the minumum is 65
# Removing  outliers in the 'ap_lo' coulmn that are above  150
cardio_df.drop(cardio_df[cardio_df['ap_lo'] > 140].index, inplace = True)
# Remove outliers in the 'ap_lo' coulmn that are below 65
cardio_df.drop(cardio_df[cardio_df['ap_lo'] < 60].index, inplace = True)

In [12]:
cardio_df.head(10)

Unnamed: 0,id,Age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0
5,8,60,1,151,67.0,120,80,2,2,0,0,0,0
6,9,61,1,157,93.0,130,80,3,1,0,0,1,0
7,12,62,2,178,95.0,130,90,3,3,0,0,1,1
8,13,48,1,158,71.0,110,70,1,1,0,0,1,0
9,14,54,1,164,68.0,110,60,1,1,0,0,0,0


In [13]:
# Converting  'height' column from cm to feet each cm = 0.032808399 foot

cardio_df['height'] = round(cardio_df['height']*0.0328084, 2)

In [14]:
# Converting 'weight' from kg to lb  1kg = 2.20462 lbs
cardio_df['weight'] = round(cardio_df['weight'] * 2.20462, 1)

In [15]:
cardio_df.head(10)

Unnamed: 0,id,Age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,5.51,136.7,110,80,1,1,0,0,1,0
1,1,55,1,5.12,187.4,140,90,3,1,0,0,1,1
2,2,52,1,5.41,141.1,130,70,3,1,0,0,0,1
3,3,48,2,5.54,180.8,150,100,1,1,0,0,1,1
4,4,48,1,5.12,123.5,100,60,1,1,0,0,0,0
5,8,60,1,4.95,147.7,120,80,2,2,0,0,0,0
6,9,61,1,5.15,205.0,130,80,3,1,0,0,1,0
7,12,62,2,5.84,209.4,130,90,3,3,0,0,1,1
8,13,48,1,5.18,156.5,110,70,1,1,0,0,1,0
9,14,54,1,5.38,149.9,110,60,1,1,0,0,0,0


In [16]:
cardio_df.shape

(67524, 13)

In [17]:
cardio_df.to_csv("cardio_train_output.csv",header=False, index=False)