In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the heart-disease dataset from a CSV in the working directory and
# print the first rows (DataFrame.head() defaults to five) as a sanity check.
df = pd.read_csv("heart_disease.csv")
print(df.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  


In [4]:
# List every column name so the feature/label choice below can refer to them.
print(df.columns)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


In [6]:
# Pearson correlation of each column with the label. np.corrcoef returns a
# 2x2 matrix; the off-diagonal entry [0, 1] is the target-vs-column coefficient.
# The label column itself is included, so its line trivially reports 1.0.
target_values = df['target']
for column in df.columns:
    coefficient = np.corrcoef(target_values, df[column])[0, 1]
    print(f"corr of target and {column} = {coefficient}")

corr of target and age = -0.22543871587483727
corr of target and sex = -0.28093657550176654
corr of target and cp = 0.4337982615068933
corr of target and trestbps = -0.14493112849775144
corr of target and chol = -0.08523910513756902
corr of target and fbs = -0.028045760272712827
corr of target and restecg = 0.13722950287377336
corr of target and thalach = 0.42174093381067435
corr of target and exang = -0.4367570833533018
corr of target and oldpeak = -0.4306960016873683
corr of target and slope = 0.34587707824172526
corr of target and ca = -0.3917239923512519
corr of target and thal = -0.34402926803830985
corr of target and target = 1.0


In [8]:
# Feature matrix: drop the label itself plus the six features whose
# correlation with `target` (printed above) is weakest in absolute value.
x = df.drop(
    columns=['age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'target'],
)
# Label vector: the `target` column.
y = df["target"]

In [10]:
# Partition the rows into a training set (80%) and a test set (the remainder).
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=123456,
)

In [16]:
def evaluate_model(model, features=None, labels=None):
    """Print a confusion matrix and classification report for a fitted model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    features, labels : optional
        Data to evaluate on. When both are omitted, the notebook-level
        ``x_test`` / ``y_test`` split is used, which is what a plain
        ``evaluate_model(model)`` call has always done. Supplying them lets
        the same report be produced for any other split (e.g. the training
        data, to check for overfitting).

    Raises
    ------
    ValueError
        If only one of ``features`` / ``labels`` is supplied.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    from sklearn.metrics import confusion_matrix, classification_report

    # Mixing caller-supplied features with the global labels (or vice versa)
    # would silently score against the wrong rows, so reject it up front.
    if (features is None) != (labels is None):
        raise ValueError("features and labels must be provided together")

    if features is None:
        features, labels = x_test, y_test

    y_prediction = model.predict(features)

    # confusion_matrix is called as (true labels, predicted labels).
    cm = confusion_matrix(labels, y_prediction)
    print(cm)

    print(classification_report(labels, y_prediction))

In [26]:
def random_forest(n_estimators=50, random_state=123456):
    """Fit a random forest on the training split and print its evaluation.

    Parameters
    ----------
    n_estimators : int, default 50
        Number of trees in the forest.
    random_state : int or None, default 123456
        Seed passed to ``RandomForestClassifier``. Without a seed the forest
        is built differently on every run, so the printed metrics are not
        reproducible. The default reuses the seed value given to
        ``train_test_split`` in this notebook; pass ``None`` to get the
        unseeded behaviour back.

    Returns
    -------
    None
        Results are printed by ``evaluate_model``.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )

    # Train on the notebook-level training split.
    model.fit(x_train, y_train)

    evaluate_model(model)

In [27]:
# Fit the random forest on the training split and print its confusion matrix
# and classification report for the test split.
random_forest()

[[20  4]
 [ 5 32]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.82        24
           1       0.89      0.86      0.88        37

    accuracy                           0.85        61
   macro avg       0.84      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61



In [29]:
def decision_tree(random_state=123456):
    """Fit a decision tree on the training split and print its evaluation.

    Parameters
    ----------
    random_state : int or None, default 123456
        Seed passed to ``DecisionTreeClassifier``. scikit-learn permutes the
        candidate features at every split, so when several splits are equally
        good the chosen one, and therefore the printed metrics, can change
        between runs unless the seed is fixed. The default reuses the seed
        value given to ``train_test_split`` in this notebook; pass ``None``
        to get the unseeded behaviour back.

    Returns
    -------
    None
        Results are printed by ``evaluate_model``.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(random_state=random_state)

    # Train on the notebook-level training split.
    model.fit(x_train, y_train)

    evaluate_model(model)

In [30]:
# Fit the decision tree on the training split and print its confusion matrix
# and classification report for the test split.
decision_tree()

[[16  8]
 [10 27]]
              precision    recall  f1-score   support

           0       0.62      0.67      0.64        24
           1       0.77      0.73      0.75        37

    accuracy                           0.70        61
   macro avg       0.69      0.70      0.70        61
weighted avg       0.71      0.70      0.71        61

