In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix,classification_report
import plotly.express as px

In [2]:
cardio_df = pd.read_csv("Data/cardio_data.csv", sep = ";")

In [3]:
cardio_df.head(5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [5]:
cardio_df["cardio"].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [6]:
cardio_df = cardio_df.drop("id", axis = 1, inplace = False)

In [7]:
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.4 MB


In [8]:
cardio_df.nunique()

age            8076
gender            2
height          109
weight          287
ap_hi           153
ap_lo           157
cholesterol       3
gluc              3
smoke             2
alco              2
active            2
cardio            2
dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    cardio_df.drop("cardio", axis=1, inplace=False),
    cardio_df["cardio"],
    test_size = 0.2,
    stratify = cardio_df["cardio"],
    random_state = 42
)

In [10]:
train_df = X_train
train_df["cardio"] = y_train

In [11]:
test_df = X_test
test_df['cardio'] = y_test

In [12]:
scaler = StandardScaler()

In [13]:
X_train = pd.DataFrame(data=scaler.fit_transform(X_train), columns=X_train.columns)

In [14]:
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns)

In [15]:
y_train = train_df["cardio"]

In [16]:
y_test = test_df["cardio"]

In [17]:
X_test.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,-0.032755,-0.73236,-1.135796,-1.023103,-0.055968,-0.060952,-0.539408,-0.396073,-0.310319,-0.236614,0.492259,-0.999393
1,0.655774,-0.73236,-0.528034,-1.05785,0.005932,-0.035121,-0.539408,-0.396073,-0.310319,-0.236614,0.492259,-0.999393
2,-1.761997,1.365449,1.295251,0.957492,-0.055968,-0.086783,0.931371,-0.396073,-0.310319,-0.236614,0.492259,-0.999393
3,-0.276888,1.365449,1.538356,-0.849366,-0.055968,-0.035121,-0.539408,-0.396073,-0.310319,-0.236614,0.492259,1.000607
4,-0.526302,-0.73236,0.322832,0.47103,-0.055968,-0.086783,-0.539408,-0.396073,-0.310319,-0.236614,0.492259,1.000607


In [18]:
y_test.head(5)

18682    0
40992    0
38068    0
12096    1
17791    1
Name: cardio, dtype: int64

In [19]:
Ir_model = LogisticRegression()

In [20]:
Ir_model.fit(X_train, y_train)

In [21]:
y_pred_Ir=Ir_model.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred_Ir))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7004
           1       1.00      1.00      1.00      6996

    accuracy                           1.00     14000
   macro avg       1.00      1.00      1.00     14000
weighted avg       1.00      1.00      1.00     14000



In [27]:
DTC_model = DecisionTreeClassifier()

In [28]:
DTC.fit(X_train, y_train)

In [33]:
y_pred_DTC=DTC.predict(X_test)

In [37]:
print(classification_report(y_test, y_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7004
           1       1.00      1.00      1.00      6996

    accuracy                           1.00     14000
   macro avg       1.00      1.00      1.00     14000
weighted avg       1.00      1.00      1.00     14000



In [43]:
fc_matrix_DTC = confusion_matrix(y_test, y_pred_DTC)

In [44]:
model_dtc = DecisionTreeClassifier()
grid_params_dtc = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random']
    'max_depth' : [None, 5, 10, 15, 20],
    'min_samples_split' : [2, 4, 8, 16],
}
grid_dtc = GridSearchCV(model_dtc, grid_params, cv=5, verbose=1, scoring='recall')
grid_dtc.fit(X_train_res, y_train_res)

SyntaxError: invalid syntax (3047123663.py, line 5)

In [0]:
model_mlp = MLPClassifier()
grid_params_mlp = {
    'solver': ['lbfgs','sgd', 'adam'],
    'activation' : ['identity', 'logistic', 'tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate' : ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init' : [0.0001, 0.001, 0.01]
}

In [0]:
# Create the tree
tree = DecisionTreeClassifier(max_depth=6, class_weight='balanced')
tree.fit(X_train,y_train)

#create array of probabilities
y_test_predict_proba = tree1.predict_proba(X_test)

# calc confusion matrix
y_test_predict = tree.predict(X_test[columns])
print("Confusion Matrix Tree : \n", confusion_matrix(y_test, y_test_predict),"\n")
print("The precision for Tree is ",precision_score(y_test, y_test_predict)) 
print("The recall for Tree is ",recall_score(y_test, y_test_predict),"\n")  