In [1]:
# Data source: https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset/

In [2]:
import sklearn
import pandas
import numpy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

In [4]:
# Load the semicolon-separated dataset file into a DataFrame

dataframe = pandas.read_csv('cardio_train.csv', sep=';')

In [5]:
# Drop the 'id' column, since it's just ID values.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time is a no-op rather than
# a KeyError for the already-removed column.

dataframe = dataframe.drop(columns='id', errors='ignore')

In [6]:
# Convert the dataset into a numpy array and shuffle its rows in place.
# A fixed seed makes the shuffle -- and therefore the train/validation/test
# split built from it -- identical on every run of the notebook.

dataset = dataframe.to_numpy()
numpy.random.default_rng(42).shuffle(dataset)

In [7]:
# Split the data into training, validation, test sets (80%, 10%, 10%)

# The first 80% of the (already shuffled) rows go to training.
train_set, rest = numpy.split(dataset, [int(len(dataset) * 0.8)])

# array_split halves the remainder even when it holds an odd number of rows,
# where numpy.split(rest, 2) would raise ValueError; for an even row count the
# two halves are the same as before.
valid_set, test_set = numpy.array_split(rest, 2)

In [8]:
# Function to create our feature and label sets (input 2D numpy array)

def getFeaturesAndLabels(data):
    """Separate every row of ``data`` into a feature list and label value(s).

    For each row, the values at positions 0-10 become that row's feature
    list, with the value at position 1 replaced by ``value % 2`` (turning the
    1/2 gender coding into 1/0). Every value at position 11 or later is
    appended to the flat label list.

    Args:
        data: iterable of rows, each row itself an iterable of numbers
            (e.g. a 2D numpy array).

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a list with one list
        per row, ``labels`` is a flat list.
    """
    features, labels = [], []

    for row in data:
        row_features = []
        for position, entry in enumerate(row):
            if position >= 11:
                # Anything past the eleven feature columns is a label
                labels.append(entry)
                continue
            # We want 0/1 instead of 1/2 for male/female because it should be categorical
            row_features.append(entry % 2 if position == 1 else entry)
        features.append(row_features)

    return features, labels

In [9]:
# Turn each split (train, validation, test -- in that order) into its
# (features, labels) pair
(
    (train_features, train_labels),
    (valid_features, valid_labels),
    (test_features, test_labels),
) = map(getFeaturesAndLabels, (train_set, valid_set, test_set))

In [10]:
# Display the DataFrame to inspect its columns and a sample of its rows
dataframe

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [12]:
# Baseline Model

In [13]:
# Baseline: a DummyClassifier with default settings, trained on the training
# split. fit() returns the estimator, so construction and training are chained.
dummy = DummyClassifier().fit(train_features, train_labels)

# Baseline predictions for the validation split
predictions = dummy.predict(valid_features)

# Classification report
# *********************

target_names = ['False', 'True']
y_true, y_pred = valid_labels, predictions

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.00000   0.00000   0.00000      3543
        True    0.49386   1.00000   0.66118      3457

    accuracy                        0.49386      7000
   macro avg    0.24693   0.50000   0.33059      7000
weighted avg    0.24389   0.49386   0.32653      7000



In [14]:
# Testing Different Classifiers

In [15]:
# K-Neighbors classifier with default settings, trained on the training split
# (fit() returns the estimator, so it can be chained onto the constructor)

model = KNeighborsClassifier().fit(train_features, train_labels)

# Predicted labels for the validation split
predictions = model.predict(valid_features)

# Classification report
# *********************

target_names = ['False', 'True']
y_true, y_pred = valid_labels, predictions

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.68023   0.68868   0.68443      3543
        True    0.67682   0.66821   0.67249      3457

    accuracy                        0.67857      7000
   macro avg    0.67853   0.67845   0.67846      7000
weighted avg    0.67855   0.67857   0.67853      7000



In [16]:
# Logistic Regression classifier (iteration cap raised to 300), trained on
# the training split; fit() hands back the fitted estimator

model = LogisticRegression(max_iter=300).fit(train_features, train_labels)

# Predicted labels for the validation split
predictions = model.predict(valid_features)

# Classification report
# *********************

target_names = ['False', 'True']
y_true, y_pred = valid_labels, predictions

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.68789   0.71973   0.70345      3543
        True    0.69845   0.66532   0.68148      3457

    accuracy                        0.69286      7000
   macro avg    0.69317   0.69252   0.69246      7000
weighted avg    0.69310   0.69286   0.69260      7000



In [17]:
# Gaussian Naive Bayes classifier with default settings, trained on the
# training split; fit() hands back the fitted estimator
model = GaussianNB().fit(train_features, train_labels)

# Predicted labels for the validation split
predictions = model.predict(valid_features)

# Classification report
# *********************

target_names = ['False', 'True']
y_true, y_pred = valid_labels, predictions

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.56463   0.88400   0.68911      3543
        True    0.71714   0.30142   0.42444      3457

    accuracy                        0.59629      7000
   macro avg    0.64088   0.59271   0.55677      7000
weighted avg    0.63995   0.59629   0.55840      7000



In [18]:
# Random Forest classifier with default settings, trained on the training
# split; fit() hands back the fitted estimator
model = RandomForestClassifier().fit(train_features, train_labels)

# Predicted labels for the validation split
predictions = model.predict(valid_features)

# Classification report
# *********************

target_names = ['False', 'True']
y_true, y_pred = valid_labels, predictions

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.72020   0.72650   0.72334      3543
        True    0.71716   0.71073   0.71393      3457

    accuracy                        0.71871      7000
   macro avg    0.71868   0.71862   0.71864      7000
weighted avg    0.71870   0.71871   0.71869      7000



In [19]:
# Support Vector Classifier with default settings, trained on the training
# split; fit() hands back the fitted estimator
model = SVC().fit(train_features, train_labels)

# Predicted labels for the validation split
predictions = model.predict(valid_features)

# Classification report
# *********************

target_names = ['False', 'True']
y_true, y_pred = valid_labels, predictions

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.60612   0.65453   0.62939      3543
        True    0.61437   0.56407   0.58815      3457

    accuracy                        0.60986      7000
   macro avg    0.61024   0.60930   0.60877      7000
weighted avg    0.61019   0.60986   0.60902      7000



In [20]:
# Decision Tree classifier with default settings, trained on the training
# split; fit() hands back the fitted estimator
model = DecisionTreeClassifier().fit(train_features, train_labels)

# Predicted labels for the validation split
predictions = model.predict(valid_features)

# Classification report
# *********************

target_names = ['False', 'True']
y_true, y_pred = valid_labels, predictions

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.64481   0.62715   0.63586      3543
        True    0.62831   0.64594   0.63700      3457

    accuracy                        0.63643      7000
   macro avg    0.63656   0.63654   0.63643      7000
weighted avg    0.63666   0.63643   0.63642      7000



In [21]:
# Choosing the Random Forest Classifier

In [22]:
# Create the Random Forest classifier that is kept as the final model.
# random_state fixes the estimator's internal randomness so that refitting on
# the same training data yields the same forest (and the same pickled model).
forest = RandomForestClassifier(criterion="gini", random_state=42)

# Fit the model on the training split
forest.fit(train_features, train_labels)

# Get the array of predicted values for the validation split
predictions = forest.predict(valid_features)

# Generate a classification report
# ********************************

y_true = valid_labels
y_pred = predictions

# Display names for the report rows, given in the order of the sorted labels
target_names = ['False', 'True']

print('Classification Report:')
print('**********************')
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

Classification Report:
**********************
              precision    recall  f1-score   support

       False    0.71848   0.72537   0.72191      3543
        True    0.71575   0.70871   0.71221      3457

    accuracy                        0.71714      7000
   macro avg    0.71711   0.71704   0.71706      7000
weighted avg    0.71713   0.71714   0.71712      7000



In [23]:
# Saving the Model for Future Use

In [24]:
# Pickle is used to serialize objects
import pickle

filename = "model.sav"

# Write the fitted forest to disk. The context manager flushes and closes the
# file handle even if pickle.dump raises.
with open(filename, "wb") as model_file:
    pickle.dump(forest, model_file)

In [25]:
# Getting Statistics

In [26]:
# Reload the persisted model; the context manager closes the file handle
# as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("model.sav", "rb") as model_file:
    saved_model = pickle.load(model_file)

# Evaluate the reloaded model on the test split
predictions = saved_model.predict(test_features)
y_true = test_labels
y_pred = predictions
target_names = ['False', 'True']
print(classification_report(y_true, y_pred, target_names=target_names, digits=5))

              precision    recall  f1-score   support

       False    0.71356   0.72438   0.71893      3494
        True    0.72111   0.71021   0.71562      3506

    accuracy                        0.71729      7000
   macro avg    0.71734   0.71730   0.71728      7000
weighted avg    0.71734   0.71729   0.71727      7000



In [28]:
# Inspect the raw test split array
test_set

array([[1.4353e+04, 1.0000e+00, 1.5000e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [2.0310e+04, 1.0000e+00, 1.5800e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [1.4455e+04, 1.0000e+00, 1.6400e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       ...,
       [1.8064e+04, 1.0000e+00, 1.8800e+02, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [1.6614e+04, 1.0000e+00, 1.5200e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [1.6661e+04, 2.0000e+00, 1.6800e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

In [29]:
# Display the DataFrame again (same view as shown earlier in the notebook)
dataframe

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1
