In [1]:
#import dependancies
import pandas as pd

from sqlalchemy import create_engine
from config import dbConnectionString

In [3]:
# Create SQLEngine -- use "pip install psycopg2-binary" if ModuleError occurs
engine = create_engine(dbConnectionString)

In [8]:
#load in datasets from sql database
df_exam = pd.read_sql_query('select * from "examinCardio"',con=engine)
df_obj = pd.read_sql_query('select * from "objCardio"',con=engine)
df_subj = pd.read_sql_query('select * from "subjCardio"',con=engine)

In [9]:
#convert age from days to years
df_obj['age'] = df_obj['age'] / 365
df_obj['age'] = df_obj['age'].map("{:.1f}".format)
df_obj

Unnamed: 0,id,age,gender,height,weight,cardiovascular
0,0,50.4,Female,168.0,62.0,False
1,1,55.4,Male,156.0,85.0,True
2,2,51.7,Male,165.0,64.0,True
3,3,48.3,Female,169.0,82.0,True
4,4,47.9,Male,156.0,56.0,False
...,...,...,...,...,...,...
69995,99993,52.7,Female,168.0,76.0,False
69996,99995,61.9,Male,158.0,126.0,True
69997,99996,52.2,Female,183.0,105.0,True
69998,99998,61.5,Male,163.0,72.0,True


In [10]:
#removing outliers - based on medically salient information
# Removing  outliers in the 'weight' coulmn that less than 30 Kg
df_obj.drop(df_obj[df_obj['weight'] < 30].index, inplace = True)
# Removing  outliers in the 'height' coulmn that are less than  100
df_obj.drop(df_obj[df_obj['height'] < 100].index, inplace = True)
# Removing  outliers in the 'height' coulmn that are greater than  210
df_obj.drop(df_obj[df_obj['height'] > 210].index, inplace = True)

df_obj

Unnamed: 0,id,age,gender,height,weight,cardiovascular
0,0,50.4,Female,168.0,62.0,False
1,1,55.4,Male,156.0,85.0,True
2,2,51.7,Male,165.0,64.0,True
3,3,48.3,Female,169.0,82.0,True
4,4,47.9,Male,156.0,56.0,False
...,...,...,...,...,...,...
69995,99993,52.7,Female,168.0,76.0,False
69996,99995,61.9,Male,158.0,126.0,True
69997,99996,52.2,Female,183.0,105.0,True
69998,99998,61.5,Male,163.0,72.0,True


In [11]:
# Remove outliers in the 'ap_hi' coulmn that are less than 60
df_exam.drop(df_exam[df_exam['ap_hi'] < 60].index, inplace = True)
# Remove outliers in the 'ap_lo' coulmn that are above 300
df_exam.drop(df_exam[df_exam['ap_hi'] > 300].index, inplace = True)

# Remove outliers in the 'ap_lo' coulmn that are below 10
df_exam.drop(df_exam[df_exam['ap_lo'] < 10].index, inplace = True)
# # Remove outliers in the 'ap_lo' coulmn that are greater than 100
df_exam.drop(df_exam[df_exam['ap_lo'] >= 100].index, inplace = True)

df_exam

Unnamed: 0,id,ap_hi,ap_lo,cholestrol,gloucose
0,0,110,80,normal,normal
1,1,140,90,well above normal,normal
2,2,130,70,well above normal,normal
4,4,100,60,normal,normal
5,8,120,80,above normal,above normal
...,...,...,...,...,...
69995,99993,120,80,normal,normal
69996,99995,140,90,above normal,above normal
69997,99996,180,90,well above normal,normal
69998,99998,135,80,normal,well above normal


In [12]:
#create BMI datapoint
#BMI = (weight x weight)/(height)

df_obj["BMI"] = (df_obj['weight']*df_obj['weight'])/ df_obj['height']

df_obj['BMI'] = df_obj['BMI'].map("{:.0f}".format)
df_obj

Unnamed: 0,id,age,gender,height,weight,cardiovascular,BMI
0,0,50.4,Female,168.0,62.0,False,23
1,1,55.4,Male,156.0,85.0,True,46
2,2,51.7,Male,165.0,64.0,True,25
3,3,48.3,Female,169.0,82.0,True,40
4,4,47.9,Male,156.0,56.0,False,20
...,...,...,...,...,...,...,...
69995,99993,52.7,Female,168.0,76.0,False,34
69996,99995,61.9,Male,158.0,126.0,True,100
69997,99996,52.2,Female,183.0,105.0,True,60
69998,99998,61.5,Male,163.0,72.0,True,32


In [16]:
# Merge all data together
merged_cardio_df = pd.merge(pd.merge(df_obj, df_exam, on=['id'],how='inner') ,df_subj, on=['id'], how='inner')
merged_cardio_df

Unnamed: 0,id,age,gender,height,weight,cardiovascular,BMI,ap_hi,ap_lo,cholestrol,gloucose,smoke,alchohol,active
0,0,50.4,Female,168.0,62.0,False,23,110,80,normal,normal,False,False,True
1,1,55.4,Male,156.0,85.0,True,46,140,90,well above normal,normal,False,False,True
2,2,51.7,Male,165.0,64.0,True,25,130,70,well above normal,normal,False,False,False
3,4,47.9,Male,156.0,56.0,False,20,100,60,normal,normal,False,False,False
4,8,60.0,Male,151.0,67.0,False,30,120,80,above normal,above normal,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63911,99993,52.7,Female,168.0,76.0,False,34,120,80,normal,normal,True,False,True
63912,99995,61.9,Male,158.0,126.0,True,100,140,90,above normal,above normal,False,False,True
63913,99996,52.2,Female,183.0,105.0,True,60,180,90,well above normal,normal,False,True,False
63914,99998,61.5,Male,163.0,72.0,True,32,135,80,normal,well above normal,False,False,False


In [18]:
merged_cardio_df.to_sql('cardioData',engine, if_exists="replace")