In [1]:
# Import dependencies
import pandas as pd 
import numpy as np 
import os
from functools import reduce

In [2]:
# Location of the first input file. The preview below shows it carries
# id, age, gender, height, weight, cardiovascular_disease and BMI columns.
file_path1 = "Resources/CleanedObj.csv"

# Read the CSV into a pandas DataFrame and preview its first rows.
df1 = pd.read_csv(file_path1)
df1.head()

Unnamed: 0,id,age,gender,height,weight,cardiovascular_disease,BMI
0,0,50,Female,5.51,136.7,No,23
1,1,55,Male,5.12,187.4,Yes,46
2,2,52,Male,5.41,141.1,Yes,25
3,3,48,Female,5.54,180.8,Yes,40
4,4,48,Male,5.12,123.5,No,20


In [3]:
# Location of the second input file. The preview below shows it carries
# id plus the smoke, alco, active and cardio flags.
file_path2 = "Resources/subjective_feature_cardio.csv"

# Read the CSV into a pandas DataFrame and preview its first rows.
df2 = pd.read_csv(file_path2)
df2.head()

Unnamed: 0,id,smoke,alco,active,cardio
0,0,0,0,1,0
1,1,0,0,1,1
2,2,0,0,0,1
3,3,0,0,1,1
4,4,0,0,0,0


In [4]:
# Location of the third input file. The preview below shows it carries
# id, ap_hi, ap_lo, cholesterol and glucose, and that its ids are not
# contiguous (e.g. 2 is followed by 4, then 8).
file_path3 = "Resources/Cleaned_exami_cardio.csv"

# Read the CSV into a pandas DataFrame and preview its first rows.
df3 = pd.read_csv(file_path3)
df3.head()

Unnamed: 0,id,ap_hi,ap_lo,cholesterol,glucose
0,0,110,80,normal,normal
1,1,140,90,well above normal,normal
2,2,130,70,well above normal,normal
3,4,100,60,normal,normal
4,8,120,80,above normal,above normal


In [5]:
# Frames to be merged, in merge order; each one has an 'id' column.
data_frames = [df1, df2,df3]

In [6]:
# Combine every frame in data_frames on the shared 'id' key. The join is
# outer, so an id missing from one frame is kept and padded with NaN.
def _merge_on_id(left, right):
    """Outer-join two frames on their common 'id' column."""
    return pd.merge(left, right, how='outer', on=['id'])


df_merged = reduce(_merge_on_id, data_frames)
                                            

In [7]:
# Preview the merged frame; NaN appears where an id has no row in one of the inputs.
df_merged.head()

Unnamed: 0,id,age,gender,height,weight,cardiovascular_disease,BMI,smoke,alco,active,cardio,ap_hi,ap_lo,cholesterol,glucose
0,0,50.0,Female,5.51,136.7,No,23.0,0,0,1,0,110.0,80.0,normal,normal
1,1,55.0,Male,5.12,187.4,Yes,46.0,0,0,1,1,140.0,90.0,well above normal,normal
2,2,52.0,Male,5.41,141.1,Yes,25.0,0,0,0,1,130.0,70.0,well above normal,normal
3,3,48.0,Female,5.54,180.8,Yes,40.0,0,0,1,1,,,,
4,4,48.0,Male,5.12,123.5,No,20.0,0,0,0,0,100.0,60.0,normal,normal


In [8]:
# One-hot encode gender so the ML step receives numbers instead of strings
# (otherwise: "ValueError: could not convert string to float").
# dtype=int is passed explicitly because pandas >= 2.0 defaults to bool
# columns, which would turn the 1/0 flags into True/False downstream.
dummy = pd.get_dummies(df_merged['gender'], dtype=int)
dummy.head()

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,0,1
3,1,0
4,0,1


In [9]:
# Append the one-hot gender columns (Female, Male) to the right of the merged frame.
df_merged2 = pd.concat([df_merged, dummy], axis="columns")
df_merged2.head()

Unnamed: 0,id,age,gender,height,weight,cardiovascular_disease,BMI,smoke,alco,active,cardio,ap_hi,ap_lo,cholesterol,glucose,Female,Male
0,0,50.0,Female,5.51,136.7,No,23.0,0,0,1,0,110.0,80.0,normal,normal,1,0
1,1,55.0,Male,5.12,187.4,Yes,46.0,0,0,1,1,140.0,90.0,well above normal,normal,0,1
2,2,52.0,Male,5.41,141.1,Yes,25.0,0,0,0,1,130.0,70.0,well above normal,normal,0,1
3,3,48.0,Female,5.54,180.8,Yes,40.0,0,0,1,1,,,,,1,0
4,4,48.0,Male,5.12,123.5,No,20.0,0,0,0,0,100.0,60.0,normal,normal,0,1


In [10]:
# Collapse gender into a single binary column: 1 = Male / 0 = Female.
# The text column and the redundant 'Female' indicator are removed, and the
# remaining 'Male' indicator takes over the 'gender' name.
df_merged2 = (
    df_merged2
    .drop(columns=['gender', 'Female'])
    .rename(columns={"Male": "gender"})
)

In [11]:
# Preview: 'gender' is now the last column and holds the binary Male indicator.
df_merged2.head()

Unnamed: 0,id,age,height,weight,cardiovascular_disease,BMI,smoke,alco,active,cardio,ap_hi,ap_lo,cholesterol,glucose,gender
0,0,50.0,5.51,136.7,No,23.0,0,0,1,0,110.0,80.0,normal,normal,0
1,1,55.0,5.12,187.4,Yes,46.0,0,0,1,1,140.0,90.0,well above normal,normal,1
2,2,52.0,5.41,141.1,Yes,25.0,0,0,0,1,130.0,70.0,well above normal,normal,1
3,3,48.0,5.54,180.8,Yes,40.0,0,0,1,1,,,,,0
4,4,48.0,5.12,123.5,No,20.0,0,0,0,0,100.0,60.0,normal,normal,1


In [12]:
# Inspect the raw cholesterol labels before encoding them as
#   normal = 0 / above normal = 1 / well above normal = 2
# NaN entries are ids that have no cholesterol value after the outer merge.

df_merged2['cholesterol']

0                   normal
1        well above normal
2        well above normal
3                      NaN
4                   normal
               ...        
69995         above normal
69996               normal
69997                  NaN
69998               normal
69999               normal
Name: cholesterol, Length: 70000, dtype: object

In [13]:
# Label -> ordinal code lookup for the cholesterol column
def cholestrol_transform(x):
    """Return the ordinal code for a cholesterol label.

    'normal' -> 0, 'above normal' -> 1, 'well above normal' -> 2.
    Any other value (including NaN and non-string inputs) yields None.
    """
    codes = {
        'normal': 0,
        'above normal': 1,
        'well above normal': 2,
    }
    # Non-strings can never equal one of the labels, so short-circuit them;
    # this also keeps unhashable inputs from reaching the dict lookup.
    return codes.get(x) if isinstance(x, str) else None

In [14]:
# Swap the text cholesterol labels for their 0 / 1 / 2 codes so the column is
# ML-ready. The encoded values are added as a new trailing column, the text
# column is removed, and the new column then takes over the 'cholesterol' name.
df_merged2 = (
    df_merged2
    .assign(cholestrol_transform=df_merged2['cholesterol'].apply(cholestrol_transform))
    .drop(columns=['cholesterol'])
    .rename(columns={"cholestrol_transform": "cholesterol"})
)

In [15]:
# Preview: 'cholesterol' is now numeric and sits in the last column.
df_merged2.head()

Unnamed: 0,id,age,height,weight,cardiovascular_disease,BMI,smoke,alco,active,cardio,ap_hi,ap_lo,glucose,gender,cholesterol
0,0,50.0,5.51,136.7,No,23.0,0,0,1,0,110.0,80.0,normal,0,0.0
1,1,55.0,5.12,187.4,Yes,46.0,0,0,1,1,140.0,90.0,normal,1,2.0
2,2,52.0,5.41,141.1,Yes,25.0,0,0,0,1,130.0,70.0,normal,1,2.0
3,3,48.0,5.54,180.8,Yes,40.0,0,0,1,1,,,,0,
4,4,48.0,5.12,123.5,No,20.0,0,0,0,0,100.0,60.0,normal,1,0.0


In [16]:
# Inspect the raw glucose labels before encoding them as
#   normal = 0 / above normal = 1 / well above normal = 2
# NaN entries are ids that have no glucose value after the outer merge.

df_merged2['glucose']

0        normal
1        normal
2        normal
3           NaN
4        normal
          ...  
69995    normal
69996    normal
69997       NaN
69998    normal
69999    normal
Name: glucose, Length: 70000, dtype: object

In [17]:
# Label -> ordinal code for the glucose column
def glucose_transform(x):
    """Return the ordinal code for a glucose label.

    'normal' -> 0, 'above normal' -> 1, 'well above normal' -> 2.
    Any other value (including NaN) yields None.

    The glucose column carries the same three labels as cholesterol, so all
    of them are handled here; mapping only 'normal' would leave elevated
    readings as None, which a later zero-fill would relabel as normal.
    """
    if x == 'normal':
        return 0
    if x == 'above normal':
        return 1
    if x == 'well above normal':
        return 2
    return None

In [18]:
# Encode glucose as an ordinal feature so the column is ML-ready:
#   normal = 0 / above normal = 1 / well above normal = 2
# The glucose mapping is spelled out here so this cell does not rely on the
# cholesterol helper; labels outside the mapping (and NaN) become NaN.
df_merged2['glucose_transform'] = df_merged2['glucose'].map({
    'normal': 0,
    'above normal': 1,
    'well above normal': 2,
})

# Remove the text column and let the encoded one take over the 'glucose' name.
df_merged2 = (
    df_merged2
    .drop(columns=['glucose'])
    .rename(columns={"glucose_transform": "glucose"})
)

In [19]:
# Preview: 'glucose' is now numeric and sits in the last column.
df_merged2.head()

Unnamed: 0,id,age,height,weight,cardiovascular_disease,BMI,smoke,alco,active,cardio,ap_hi,ap_lo,gender,cholesterol,glucose
0,0,50.0,5.51,136.7,No,23.0,0,0,1,0,110.0,80.0,0,0.0,0.0
1,1,55.0,5.12,187.4,Yes,46.0,0,0,1,1,140.0,90.0,1,2.0,0.0
2,2,52.0,5.41,141.1,Yes,25.0,0,0,0,1,130.0,70.0,1,2.0,0.0
3,3,48.0,5.54,180.8,Yes,40.0,0,0,1,1,,,0,,
4,4,48.0,5.12,123.5,No,20.0,0,0,0,0,100.0,60.0,1,0.0,0.0


In [20]:
# Replace NaN with 0 in every feature column so the ML step sees no gaps.
# 'id' and 'cardiovascular_disease' are deliberately not in the list.
# NOTE(review): 0 is also the code for 'normal' cholesterol/glucose and is not
# a plausible blood-pressure reading, so filled rows cannot be told apart from
# real values afterwards - confirm this is acceptable for the model.
df_merged2 = df_merged2.fillna(
    dict.fromkeys(
        [
            'age', 'gender', 'height', 'weight', 'BMI',
            'smoke', 'alco', 'active', 'cardio',
            'ap_hi', 'ap_lo', 'cholesterol', 'glucose',
        ],
        0,
    )
)

In [21]:
# Put the columns in their final order: id, then the numeric features, then the
# lifestyle flags, with the 'cardio' target last. Columns not listed here
# (e.g. 'cardiovascular_disease') are left out of the result.
df_merged2 = df_merged2.loc[:, [
    'id', 'age', 'gender', 'height', 'weight', 'BMI',
    'ap_hi', 'ap_lo', 'cholesterol', 'glucose',
    'smoke', 'alco', 'active', 'cardio',
]]

In [22]:
# Preview the final, fully numeric frame before it is written out.
df_merged2.head()

Unnamed: 0,id,age,gender,height,weight,BMI,ap_hi,ap_lo,cholesterol,glucose,smoke,alco,active,cardio
0,0,50.0,0,5.51,136.7,23.0,110.0,80.0,0.0,0.0,0,0,1,0
1,1,55.0,1,5.12,187.4,46.0,140.0,90.0,2.0,0.0,0,0,1,1
2,2,52.0,1,5.41,141.1,25.0,130.0,70.0,2.0,0.0,0,0,0,1
3,3,48.0,0,5.54,180.8,40.0,0.0,0.0,0.0,0.0,0,0,1,1
4,4,48.0,1,5.12,123.5,20.0,100.0,60.0,0.0,0.0,0,0,0,0


In [24]:
# Write the merged, encoded dataset into the Resources folder without the index.
# A forward-slash path is used, matching the read_csv cells: a backslash is only
# a path separator on Windows, and elsewhere it would create a file literally
# named "Resources\Merged.csv" in the working directory.
df_merged2.to_csv("Resources/Merged.csv", index=False)