In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, classification_report

import plotly.express as px

In [4]:
# Load the raw dataset; the file uses ";" as its field separator.
cardio_df = pd.read_csv(
    "Data/cardio_data.csv",
    sep=";",
)

In [5]:
# Preview the first rows (head() shows 5 by default).
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [6]:
# Column dtypes and non-null counts of the freshly loaded frame.
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [7]:
# Class balance of the target column "cardio".
cardio_df["cardio"].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [8]:
# Drop "id" (presumably just a row identifier, not a predictor).
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to cardio_df, re-running the cell would otherwise raise KeyError once
# the column is already gone.
cardio_df = cardio_df.drop(columns="id", errors="ignore")

In [9]:
# Re-inspect the schema to confirm the "id" column was removed.
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.4 MB


In [10]:
# Number of distinct values per column; low counts point to categorical/binary columns.
cardio_df.nunique()

age            8076
gender            2
height          109
weight          287
ap_hi           153
ap_lo           157
cholesterol       3
gluc              3
smoke             2
alco              2
active            2
cardio            2
dtype: int64

In [43]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cardio_df.drop(columns="cardio"),
    cardio_df["cardio"],
    test_size=0.2,
    stratify=cardio_df["cardio"],
    random_state=42,
)

In [44]:
train_df = X_train
train_df["cardio"] = y_train

In [45]:
test_df = X_test
test_df["cardio"] = y_test

In [46]:
# Feature standardizer (scikit-learn defaults: remove the mean, scale to unit variance).
scaler = StandardScaler()

In [47]:
# Fit the scaler on the training split only and rebuild a DataFrame from the
# returned array. index= preserves the original row labels; without it the
# frame gets a fresh RangeIndex and no longer lines up with y_train in any
# label-based pandas operation (concat, join, boolean masks, ...).
X_train = pd.DataFrame(
    data=scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [48]:
# Apply the already-fitted scaler to the test split (transform only, no
# refit). index= preserves the original row labels so the scaled frame stays
# aligned with y_test in label-based pandas operations.
X_test = pd.DataFrame(
    data=scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [49]:
# Take the training labels from train_df, whose "cardio" column was never passed through the scaler.
y_train=train_df["cardio"]

In [50]:
# Take the test labels from test_df, whose "cardio" column was never passed through the scaler.
y_test = test_df["cardio"]

In [33]:
# Sanity-check the scaled test features (head() shows 5 rows by default).
# NOTE(review): this cell's saved execution count (33) is lower than that of
# the cells above it (43-50), so the stored output may be stale - re-run.
X_test.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,-0.032755,-0.73236,-1.135796,-1.023103,-0.055968,-0.060952,-0.539408,-0.396073,-0.310319,-0.236614,0.492259
1,0.655774,-0.73236,-0.528034,-1.05785,0.005932,-0.035121,-0.539408,-0.396073,-0.310319,-0.236614,0.492259
2,-1.761997,1.365449,1.295251,0.957492,-0.055968,-0.086783,0.931371,-0.396073,-0.310319,-0.236614,0.492259
3,-0.276888,1.365449,1.538356,-0.849366,-0.055968,-0.035121,-0.539408,-0.396073,-0.310319,-0.236614,0.492259
4,-0.526302,-0.73236,0.322832,0.47103,-0.055968,-0.086783,-0.539408,-0.396073,-0.310319,-0.236614,0.492259


In [34]:
# Sanity-check the test labels (head() shows 5 rows by default).
# NOTE(review): the saved output shows float values around -1/+1, while the
# classification report further down lists classes 0 and 1; the output here
# comes from an earlier execution (count 34) and should be refreshed.
y_test.head()

0   -0.999393
1   -0.999393
2   -0.999393
3    1.000607
4    1.000607
Name: cardio, dtype: float64

In [51]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
lr_model = LogisticRegression()

In [52]:
# Train on the standardized training features.
lr_model.fit(X_train, y_train)

In [53]:
# Hard class predictions for the held-out test split.
y_pred_lr = lr_model.predict(X_test)

In [55]:
# Per-class precision/recall/F1 on the test split (classification_report returns a string, hence print).
# NOTE(review): the saved output shows 1.00 for every metric. A perfect score
# on real-world data usually means the target leaked into the features -
# verify that X_train / X_test do not contain the "cardio" column.
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7004
           1       1.00      1.00      1.00      6996

    accuracy                           1.00     14000
   macro avg       1.00      1.00      1.00     14000
weighted avg       1.00      1.00      1.00     14000



In [62]:
# Confusion matrix for logistic regression; in scikit-learn rows are true classes, columns are predicted classes.
cf_matrix_lr = confusion_matrix(y_test, y_pred_lr)

In [63]:
# Heatmap of the confusion matrix. scikit-learn puts true classes on rows
# (image y axis) and predictions on columns (image x axis); label the axes and
# add a title so the figure can be read on its own.
fig = px.imshow(
    cf_matrix_lr,
    text_auto=True,
    labels={"x": "Predicted class", "y": "True class", "color": "Count"},
    title="Logistic regression - confusion matrix (test split)",
)
fig.show()

In [65]:
# Decision tree with default hyper-parameters. scikit-learn permutes the
# candidate features at every split, so without a fixed random_state two runs
# can produce different trees; reuse the seed of the train/test split (42).
DTree = DecisionTreeClassifier(random_state=42)

In [66]:
# Train the decision tree on the standardized training features.
DTree.fit(X_train, y_train)

In [71]:
# Linear classifier trained with stochastic gradient descent. The optimizer
# shuffles the training data each epoch by default, so fix random_state to
# make the fit reproducible; reuse the seed of the train/test split (42).
SGDC = SGDClassifier(random_state=42)

In [72]:
# Train the SGD classifier on the standardized training features.
SGDC.fit(X_train, y_train)