In [1]:
import zipfile
from google.colab import drive

drive.mount('/content/drive/')
zip_ref = zipfile.ZipFile("/content/drive/MyDrive/cardiovascular disease.zip", 'r')
zip_ref.extractall("/Cardiovascular disease")
zip_ref.close()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
#Define the goal of the datascience project
#Explore the data
#Clean the data
#Model the data
#Evaluate the model
#Deploy the model

In [3]:
import pandas as pd
import numpy as np
import math

In [4]:
df  =  pd.read_csv('/Cardiovascular disease/cardio_train.csv', sep=';')
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [5]:
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [6]:
#Level 1 cleaning
#Rename ambigous feature attributes
df.rename({'ap_hi':'systolic_bp', 'ap_lo':'diastolic_bp', 'gluc':'glucose',
           'alco':'alcohol', 'active':'physical_activity','cardio':'cardiovascular_disease' }, axis=1, inplace=True)

df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'systolic_bp',
       'diastolic_bp', 'cholesterol', 'glucose', 'smoke', 'alcohol',
       'physical_activity', 'cardiovascular_disease'],
      dtype='object')

In [7]:
#Careful inspection reveals that the age has been converted to days.
#To make it more readable, we shall convert it to years

def ConvertAgeToYears(age):
  return math.floor(age / 365)

df.age = df.age.apply(ConvertAgeToYears)

In [8]:
#Convert height from cm to m

def ConvertToMeter(h):
  return (h / 100)

df.height = df.height.apply(ConvertToMeter)

In [9]:
df.age

0        50
1        55
2        51
3        48
4        47
         ..
69995    52
69996    61
69997    52
69998    61
69999    56
Name: age, Length: 70000, dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      70000 non-null  int64  
 1   age                     70000 non-null  int64  
 2   gender                  70000 non-null  int64  
 3   height                  70000 non-null  float64
 4   weight                  70000 non-null  float64
 5   systolic_bp             70000 non-null  int64  
 6   diastolic_bp            70000 non-null  int64  
 7   cholesterol             70000 non-null  int64  
 8   glucose                 70000 non-null  int64  
 9   smoke                   70000 non-null  int64  
 10  alcohol                 70000 non-null  int64  
 11  physical_activity       70000 non-null  int64  
 12  cardiovascular_disease  70000 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 6.9 MB


In [11]:
#We also need to cast certain attributes from integers to strings
#['sex','smoke','alcohol','physical activity', 'cardiovascular_disease']

def CastToString(v):
  return str(v)
  
df.smoke = df.smoke.apply(CastToString)
df.alcohol = df.alcohol.apply(CastToString)
df.physical_activity = df.physical_activity.apply(CastToString)
df.cardiovascular_disease = df.cardiovascular_disease.apply(CastToString)
df.gender = df.gender.apply(CastToString)


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      70000 non-null  int64  
 1   age                     70000 non-null  int64  
 2   gender                  70000 non-null  object 
 3   height                  70000 non-null  float64
 4   weight                  70000 non-null  float64
 5   systolic_bp             70000 non-null  int64  
 6   diastolic_bp            70000 non-null  int64  
 7   cholesterol             70000 non-null  int64  
 8   glucose                 70000 non-null  int64  
 9   smoke                   70000 non-null  object 
 10  alcohol                 70000 non-null  object 
 11  physical_activity       70000 non-null  object 
 12  cardiovascular_disease  70000 non-null  object 
dtypes: float64(2), int64(6), object(5)
memory usage: 6.9+ MB


In [13]:
#Summary of each numerical attribute

df.describe()

Unnamed: 0,id,age,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,52.840671,1.643592,74.20569,128.817286,96.630414,1.366871,1.226457
std,28851.302323,6.766774,0.082101,14.395757,154.011419,188.47253,0.68025,0.57227
min,0.0,29.0,0.55,10.0,-150.0,-70.0,1.0,1.0
25%,25006.75,48.0,1.59,65.0,120.0,80.0,1.0,1.0
50%,50001.5,53.0,1.65,72.0,120.0,80.0,1.0,1.0
75%,74889.25,58.0,1.7,82.0,140.0,90.0,2.0,1.0
max,99999.0,64.0,2.5,200.0,16020.0,11000.0,3.0,3.0


In [14]:
#Data analysis
#looking for correlations via standard correlation coefficient
#NB correlation does not equate to causation
corr_matrix = df.corr()
corr_matrix['cholesterol'].sort_values(ascending=False)

cholesterol     1.000000
glucose         0.451578
age             0.154012
weight          0.141768
diastolic_bp    0.024019
systolic_bp     0.023778
id              0.006106
height         -0.050226
Name: cholesterol, dtype: float64

In [15]:
corr_matrix['glucose'].sort_values(ascending=False)


glucose         1.000000
cholesterol     0.451578
weight          0.106857
age             0.098388
systolic_bp     0.011841
diastolic_bp    0.010806
id              0.002467
height         -0.018595
Name: glucose, dtype: float64

In [16]:
corr_matrix['systolic_bp'].sort_values(ascending=False)

systolic_bp     1.000000
weight          0.030702
cholesterol     0.023778
age             0.020854
diastolic_bp    0.016086
glucose         0.011841
height          0.005488
id              0.003356
Name: systolic_bp, dtype: float64

In [17]:
df.head()

Unnamed: 0,id,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoke,alcohol,physical_activity,cardiovascular_disease
0,0,50,2,1.68,62.0,110,80,1,1,0,0,1,0
1,1,55,1,1.56,85.0,140,90,3,1,0,0,1,1
2,2,51,1,1.65,64.0,130,70,3,1,0,0,0,1
3,3,48,2,1.69,82.0,150,100,1,1,0,0,1,1
4,4,47,1,1.56,56.0,100,60,1,1,0,0,0,0


In [18]:
#Experimenting with Attribute Combinations
df['BMI'] = df ['weight'] / df['height'] / df ['height']

In [22]:
corr_matrix = df.corr()
corr_matrix['BMI'].sort_values(ascending=False)

BMI             1.000000
weight          0.762009
cholesterol     0.146270
glucose         0.101390
age             0.085404
diastolic_bp    0.035345
systolic_bp     0.024852
id             -0.001377
height         -0.290642
Name: BMI, dtype: float64

In [23]:
df.isnull().value_counts()

id     age    gender  height  weight  systolic_bp  diastolic_bp  cholesterol  glucose  smoke  alcohol  physical_activity  cardiovascular_disease  BMI  
False  False  False   False   False   False        False         False        False    False  False    False              False                   False    70000
dtype: int64

In [24]:
#Handling Categorical Attributes
df_cat = df[['gender','smoke','alcohol','physical_activity']]
df_cat.head(5)

Unnamed: 0,gender,smoke,alcohol,physical_activity
0,2,0,0,1
1,1,0,0,1
2,1,0,0,0
3,2,0,0,1
4,1,0,0,0


In [25]:
label = df['cardiovascular_disease']

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, label, test_size=0.2, random_state=2)

In [27]:
X_train.head(3)

Unnamed: 0,id,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoke,alcohol,physical_activity,cardiovascular_disease,BMI
59452,84886,56,1,1.67,74.0,150,80,1,1,0,0,0,1,26.533759
65441,93409,39,1,1.58,89.0,190,1000,1,1,0,0,0,1,35.651338
51371,73267,58,1,1.68,127.0,150,90,3,3,0,0,1,0,44.997166


In [28]:
df1 = X_train.drop(["cardiovascular_disease","id"], axis=1)
X_test = X_test.drop(["cardiovascular_disease","id"], axis=1)


In [30]:
#Handle both categorical and numerical transformations at the same time
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [31]:
num_pipeline = Pipeline([('std_scaler', StandardScaler()),
 ])

In [32]:
df1_num = df1.drop(['gender','smoke','alcohol','physical_activity'], axis=1)
num_attribs = list(df1_num)
cat_attribs = ['gender','smoke','alcohol','physical_activity']
full_pipeline = ColumnTransformer([
 ('num', num_pipeline, num_attribs),
 ('cat', OneHotEncoder(), cat_attribs),
 ])
df1_prepared = full_pipeline.fit_transform(df1)

In [33]:
#Developing the model using a Random Forest Classifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
param_grid=[{'n_estimators':[50, 60, 70], 'max_features':[2, 4, 6]}]

grid_search = GridSearchCV(model, param_grid,cv=5, scoring='balanced_accuracy', return_train_score=True
)

grid_search.fit(df1, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid=[{'max_features': [2, 4, 6],
                          'n_estimators': [50, 60, 70]}],
             return_train_score=True, scoring='balanced_accuracy')

In [34]:
#To obtain the best training parameters
grid_search.best_estimator_

RandomForestClassifier(max_features=2, n_estimators=70)

In [35]:
#Evaluating the model on the test set
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)

In [36]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
confusion_matrix(y_test,final_predictions)


array([[5033, 1996],
       [2073, 4898]])

In [40]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test,final_predictions)

0.7093293682856883

In [38]:
 from sklearn.metrics import classification_report
 print(classification_report(y_test, final_predictions, labels=[0, 1]))

  mask &= (ar1 != a)
  mask &= (ar1 != a)


              precision    recall  f1-score   support

           0       0.71      0.72      0.71      7029
           1       0.71      0.70      0.71      6971

   micro avg       0.71      0.71      0.71     14000
   macro avg       0.71      0.71      0.71     14000
weighted avg       0.71      0.71      0.71     14000



  mask &= (ar1 != a)
  mask &= (ar1 != a)
