### Import necessary modules and libraries

In [66]:
# Prerequisite (run once per environment): %pip install scikit-fuzzy

In [67]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import skfuzzy as fuzz
from skfuzzy import control as ctrl
import pandas as pd
import time

## Random Forest Classification Implementation

### Read the data

In [68]:
# Load the heart-diagnosis dataset; the path is relative to the working directory.
DATASET_CSV = "heart_diagnosis_dataset.csv"
df = pd.read_csv(DATASET_CSV)

### Get shape, head and describe


In [69]:
# (rows, columns) of the freshly loaded frame.
df.shape

(1025, 11)

In [70]:
# Preview the first five rows to check the column names and value formats.
df.head()

Unnamed: 0,age,sex,chest_pain_type,blood_pressure,chol,blood_sugar_ov120,restecg,max_heart_rate,oldpeak_st_depression,thal,diagnosis
0,52,1,0,125,212,0,1,168,1.0,3,0
1,53,1,0,140,203,1,0,155,3.1,3,0
2,70,1,0,145,174,0,1,125,2.6,3,0
3,61,1,0,148,203,0,1,161,0.0,3,0
4,62,0,0,138,294,1,1,106,1.9,2,0


In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,age,sex,chest_pain_type,blood_pressure,chol,blood_sugar_ov120,restecg,max_heart_rate,oldpeak_st_depression,thal,diagnosis
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,1.071512,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,1.175053,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.8,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.8,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,6.2,3.0,1.0


### Drop undesired columns


In [72]:
# Discard the measurements that are not used by either model below.
unused_columns = [
    "chest_pain_type",
    "restecg",
    "max_heart_rate",
    "oldpeak_st_depression",
    "thal",
]
df = df.drop(columns=unused_columns)

### Dealing with missing data

In [73]:
# Count the missing entries in each column.
# The recorded output is 0 for every column: no missing values.
df.isna().sum()

age                  0
sex                  0
blood_pressure       0
chol                 0
blood_sugar_ov120    0
diagnosis            0
dtype: int64

### One-Hot Encoding

In [74]:
# One-hot encode "blood_sugar_ov120" with pandas get_dummies.
# Passing the whole frame with `columns=` replaces the source column by the
# indicator columns "blood_sugar_ov120_0" / "blood_sugar_ov120_1" in one step.
# dtype=int pins the indicators to integer 0/1; without it the dtype depends on
# the pandas version (boolean True/False on pandas 2.x).
df = pd.get_dummies(
    df,
    columns=["blood_sugar_ov120"],
    prefix="blood_sugar_ov120",
    dtype=int,
)

### Re-ordering columns

In [75]:
# Put the features first and the "diagnosis" label last.
column_order = [
    "age",
    "sex",
    "blood_pressure",
    "chol",
    "blood_sugar_ov120_0",
    "blood_sugar_ov120_1",
    "diagnosis",
]
df = df.loc[:, column_order]

In [76]:
# Display the frame after encoding and re-ordering (pandas truncates long frames).
df

Unnamed: 0,age,sex,blood_pressure,chol,blood_sugar_ov120_0,blood_sugar_ov120_1,diagnosis
0,52,1,125,212,1,0,0
1,53,1,140,203,0,1,0
2,70,1,145,174,1,0,0
3,61,1,148,203,1,0,0
4,62,0,138,294,0,1,0
...,...,...,...,...,...,...,...
1020,57,1,150,126,0,1,1
1021,60,1,125,258,1,0,0
1022,47,1,110,275,1,0,0
1023,57,1,150,126,0,1,1


### Defining inputs and outputs

In [77]:
# Target: the "diagnosis" column the classifier learns to predict.
# Inputs: every other column (age, sex, blood_pressure, chol and the two
# blood-sugar indicator columns).
target_column = "diagnosis"
y = df[target_column]
X = df.drop(columns=[target_column])

### Train Test Split

In [78]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): rows 1020 and 1023 in the preview above are identical; if the
# dataset contains many repeated rows, the hold-out score may be optimistic — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Train the Random Forest classifier and predict "diagnosis" for the test set

In [79]:
# Seed the forest so its randomised training (and therefore the reported
# scores) is reproducible, matching the seeded train/test split.
rfc = RandomForestClassifier(random_state=42)

# perf_counter() is the clock intended for measuring short elapsed intervals.
start_time = time.perf_counter()
rfc.fit(X_train, y_train)  # training
# Elapsed training time in seconds, kept as a 4-decimal string for printing.
rfc_training_time = "%.4f" % (time.perf_counter() - start_time)

In [80]:
# Time the prediction on the held-out rows with perf_counter(), which is the
# clock intended for short intervals like this one.
start_time = time.perf_counter()
ypred = rfc.predict(X_test)  # predicting
# The name is spelled "rfx" (not "rfc") because the results cell prints it under this name.
rfx_predict_time = "%.4f" % (time.perf_counter() - start_time)

### Random Forest evaluation and success metrics

In [81]:
def cross_val(model):
    """Return the mean 10-fold cross-validation score of ``model``.

    The folds are drawn from the notebook-level training split
    (``X_train`` / ``y_train``); the per-fold scores are averaged into one number.
    """
    fold_scores = cross_val_score(model, X_train, y_train, cv=10)
    return fold_scores.mean()

### Calculate the accuracy score

In [82]:
# Hold-out accuracy on the test split, timed with perf_counter() because the
# call is very short.
start_time = time.perf_counter()
rfc_acscore = accuracy_score(y_test, ypred)
rfc_acc_evaluation_time = "%.4f" % (time.perf_counter() - start_time)

# Mean 10-fold cross-validation score on the training split (see cross_val).
start_time = time.perf_counter()
rfc_cv = cross_val(rfc)
rfc_cv_evaluation_time = "%.4f" % (time.perf_counter() - start_time)

### Random Forest Classification Results


In [83]:
# Report the scores and timings collected above, one labelled line each.
rfc_report = [
    ("Random Forest Accuracy Score: ", rfc_acscore),
    ("Random Forest Cross Validation Score: ", rfc_cv),
    ("Random Forest Training Time (s): ", rfc_training_time),
    ("Random Forest Prediction Time (s): ", rfx_predict_time),
    ("Random Forest Evaluation Time (s): ", rfc_acc_evaluation_time),
    ("Random Forest Cross Validation Evaluation Time (s): ", rfc_cv_evaluation_time),
]
for label, value in rfc_report:
    print(label, value)

Random Forest Accuracy Score:  0.9805194805194806
Random Forest Cross Validation Score:  0.963810641627543
Random Forest Training Time (s):  0.1824
Random Forest Prediction Time (s):  0.0282
Random Forest Evaluation Time (s):  0.0011
Random Forest Cross Validation Evaluation Time (s):  1.9522


## Fuzzy Logic Implementation

In [84]:
# The fuzzy system only uses age, blood pressure and cholesterol, so the
# blood-sugar indicator columns and "sex" are removed here.
not_used_by_fuzzy = ["blood_sugar_ov120_0", "blood_sugar_ov120_1", "sex"]
df = df.drop(columns=not_used_by_fuzzy)

In [85]:
# Display the reduced frame that feeds the fuzzy system (pandas truncates long frames).
df

Unnamed: 0,age,blood_pressure,chol,diagnosis
0,52,125,212,0
1,53,140,203,0
2,70,145,174,0
3,61,148,203,0
4,62,138,294,0
...,...,...,...,...
1020,57,150,126,1
1021,60,125,258,0
1022,47,110,275,0
1023,57,150,126,1


### Memberships

In [86]:
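# Intended ranges for the fuzzy terms:
# "age" low: 30-45 / average: 45-55 / high: 55-75
# "bloodpressure (mm Hg)" low: 90-120 / average: 120-140 / high: >140
# "cholesterol (mg/dL)" low: 120-200 / average: 200-240 / high: >240
# NOTE: the terms are generated by automf(3, 'quant') further down, which spaces
# them automatically over each universe, so they may not match these ranges exactly.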

### Defining variables

### Antecedent (input/sensor) variable for fuzzy control system

In [87]:
# Integer universes of discourse for the three inputs (upper bounds inclusive).
# NOTE(review): df.describe() above reports values outside these universes
# (age 29-77, blood pressure up to 200, cholesterol up to 564); presumably the
# simulation clips such inputs to the universe bounds — confirm.
age_universe = np.arange(30, 76)            # 30 .. 75
bloodpressure_universe = np.arange(90, 181)  # 90 .. 180
cholesterol_universe = np.arange(120, 341)   # 120 .. 340

age = ctrl.Antecedent(age_universe, 'age')
bloodpressure = ctrl.Antecedent(bloodpressure_universe, 'bloodpressure')
cholesterol = ctrl.Antecedent(cholesterol_universe, 'cholesterol')

### Consequent (output/control) variable for fuzzy control system

In [88]:
# Output variable: a risk score on an integer universe from 0 to 100 inclusive.
risk_universe = np.arange(0, 101)
risk = ctrl.Consequent(risk_universe, 'risk')

### Auto membership function to populate the fuzzy variables with terms

In [89]:
# Give every fuzzy variable three automatically generated terms; the rules
# below refer to them as 'low', 'average' and 'high'.
for fuzzy_variable in (age, bloodpressure, cholesterol, risk):
    fuzzy_variable.automf(3, 'quant')

### Fuzzy control system rules connecting antecedents to consequents

In [90]:
# Rule base: each rule ANDs one term per input (age, bloodpressure, cholesterol)
# and maps that combination to a risk term.
#
# NOTE(review): as written, the rule base has three kinds of irregularities:
#   * exact duplicates: 10/11, 20/24, 21/22, 30/37, 33/34, 35/36;
#   * identical antecedents mapped to different risk terms:
#     2/5, 4/8, 13/16, 15/19, 17/23, 25/38, 26/29, 28/32;
#   * no rule covers bloodpressure 'average' together with cholesterol 'high'
#     (for any age term).
# Confirm whether these are intentional before relying on the risk scores.
# All 38 names are kept because the ControlSystem cell lists each of them.

# --- age 'low' ---
risk_rule1 = ctrl.Rule(age['low'] & bloodpressure['low'] & cholesterol['low'], risk['low'])
risk_rule2 = ctrl.Rule(age['low'] & bloodpressure['low'] & cholesterol['average'], risk['low'])
risk_rule3 = ctrl.Rule(age['low'] & bloodpressure['average'] & cholesterol['low'], risk['low'])
risk_rule4 = ctrl.Rule(age['low'] & bloodpressure['average'] & cholesterol['average'], risk['low'])
risk_rule5 = ctrl.Rule(age['low'] & bloodpressure['low'] & cholesterol['average'], risk['average'])
risk_rule6 = ctrl.Rule(age['low'] & bloodpressure['high'] & cholesterol['low'], risk['average'])
risk_rule7 = ctrl.Rule(age['low'] & bloodpressure['low'] & cholesterol['high'], risk['average'])
risk_rule8 = ctrl.Rule(age['low'] & bloodpressure['average'] & cholesterol['average'], risk['high'])
risk_rule9 = ctrl.Rule(age['low'] & bloodpressure['high'] & cholesterol['average'], risk['high'])
risk_rule10 = ctrl.Rule(age['low'] & bloodpressure['high'] & cholesterol['high'], risk['high'])
risk_rule11 = ctrl.Rule(age['low'] & bloodpressure['high'] & cholesterol['high'], risk['high'])
# --- age 'average' ---
risk_rule12 = ctrl.Rule(age['average'] & bloodpressure['low'] & cholesterol['low'], risk['low'])
risk_rule13 = ctrl.Rule(age['average'] & bloodpressure['low'] & cholesterol['average'], risk['low'])
risk_rule14 = ctrl.Rule(age['average'] & bloodpressure['average'] & cholesterol['low'], risk['low'])
risk_rule15 = ctrl.Rule(age['average'] & bloodpressure['average'] & cholesterol['average'], risk['average'])
risk_rule16 = ctrl.Rule(age['average'] & bloodpressure['low'] & cholesterol['average'], risk['average'])
risk_rule17 = ctrl.Rule(age['average'] & bloodpressure['high'] & cholesterol['low'], risk['average'])
risk_rule18 = ctrl.Rule(age['average'] & bloodpressure['low'] & cholesterol['high'], risk['high'])
risk_rule19 = ctrl.Rule(age['average'] & bloodpressure['average'] & cholesterol['average'], risk['high'])
risk_rule20 = ctrl.Rule(age['average'] & bloodpressure['high'] & cholesterol['average'], risk['high'])
risk_rule21 = ctrl.Rule(age['average'] & bloodpressure['high'] & cholesterol['high'], risk['high'])
risk_rule22 = ctrl.Rule(age['average'] & bloodpressure['high'] & cholesterol['high'], risk['high'])
risk_rule23 = ctrl.Rule(age['average'] & bloodpressure['high'] & cholesterol['low'], risk['high'])
risk_rule24 = ctrl.Rule(age['average'] & bloodpressure['high'] & cholesterol['average'], risk['high'])
# --- age 'high' ---
risk_rule25 = ctrl.Rule(age['high'] & bloodpressure['low'] & cholesterol['low'], risk['low'])
risk_rule26 = ctrl.Rule(age['high'] & bloodpressure['low'] & cholesterol['average'], risk['average'])
risk_rule27 = ctrl.Rule(age['high'] & bloodpressure['average'] & cholesterol['low'], risk['average'])
risk_rule28 = ctrl.Rule(age['high'] & bloodpressure['average'] & cholesterol['average'], risk['average'])
risk_rule29 = ctrl.Rule(age['high'] & bloodpressure['low'] & cholesterol['average'], risk['high'])
risk_rule30 = ctrl.Rule(age['high'] & bloodpressure['high'] & cholesterol['low'], risk['high'])
risk_rule31 = ctrl.Rule(age['high'] & bloodpressure['low'] & cholesterol['high'], risk['high'])
risk_rule32 = ctrl.Rule(age['high'] & bloodpressure['average'] & cholesterol['average'], risk['high'])
risk_rule33 = ctrl.Rule(age['high'] & bloodpressure['high'] & cholesterol['average'], risk['high'])
risk_rule34 = ctrl.Rule(age['high'] & bloodpressure['high'] & cholesterol['average'], risk['high'])
risk_rule35 = ctrl.Rule(age['high'] & bloodpressure['high'] & cholesterol['high'], risk['high'])
risk_rule36 = ctrl.Rule(age['high'] & bloodpressure['high'] & cholesterol['high'], risk['high'])
risk_rule37 = ctrl.Rule(age['high'] & bloodpressure['high'] & cholesterol['low'], risk['high'])
risk_rule38 = ctrl.Rule(age['high'] & bloodpressure['low'] & cholesterol['low'], risk['high'])

### Build the fuzzy control system from the rules

In [91]:
# Collect all 38 rules, in definition order, and build the control system from them.
all_risk_rules = [
    risk_rule1, risk_rule2, risk_rule3, risk_rule4, risk_rule5,
    risk_rule6, risk_rule7, risk_rule8, risk_rule9, risk_rule10,
    risk_rule11, risk_rule12, risk_rule13, risk_rule14, risk_rule15,
    risk_rule16, risk_rule17, risk_rule18, risk_rule19, risk_rule20,
    risk_rule21, risk_rule22, risk_rule23, risk_rule24, risk_rule25,
    risk_rule26, risk_rule27, risk_rule28, risk_rule29, risk_rule30,
    risk_rule31, risk_rule32, risk_rule33, risk_rule34, risk_rule35,
    risk_rule36, risk_rule37, risk_rule38,
]
risk_ctrl = ctrl.ControlSystem(all_risk_rules)

### Create a simulation that computes results from the control system

In [92]:
# Simulation object for the control system: inputs are set on it, compute() is
# called, and the risk output is read back from it in the loop below.
risk_sim = ctrl.ControlSystemSimulation(risk_ctrl)

### Creating an empty column in dataset

In [93]:
# Add a float "risk" column filled with NaN; the loop below fills in the scores.
df["risk"] = np.nan

### Applying the fuzzy rules to every row of the dataset and saving the risk outputs in the newly created "risk" column

In [94]:
# Run every row through the fuzzy control system and store the resulting
# risk score.  perf_counter() is the clock intended for elapsed-time measurement.
start_time = time.perf_counter()

risk_scores = []
# Iterate over the three input columns directly instead of building a row
# Series with df.iloc[i] for every single lookup.
for patient_age, patient_bp, patient_chol in zip(
        df["age"], df["blood_pressure"], df["chol"]):
    risk_sim.input['age'] = int(patient_age)
    risk_sim.input['bloodpressure'] = int(patient_bp)
    risk_sim.input['cholesterol'] = int(patient_chol)

    risk_sim.compute()

    risk_scores.append(float(risk_sim.output["risk"]))

# Assign the whole column at once.  Writing through df["risk"].values[i] only
# works while .values is a writable view of the column, which is not the case
# under pandas Copy-on-Write.
df["risk"] = risk_scores
implementation_time = "%.4f" % (time.perf_counter() - start_time)

### Fuzzy Logic Results

In [95]:
# Wall-clock time, in seconds, spent scoring every row with the fuzzy system.
print("Fuzzy Logic Implementation Time (sec):", implementation_time)

Fuzzy Logic Implementation Time (sec): 1.4306


In [96]:
# Show the frame with the new "risk" column next to the recorded diagnosis.
print("Dataset after risk calculations:")
df

Dataset after risk calculations:


Unnamed: 0,age,blood_pressure,chol,diagnosis,risk
0,52,125,212,0,56.771128
1,53,140,203,0,56.429065
2,70,145,174,0,54.308804
3,61,148,203,0,55.547003
4,62,138,294,0,55.023842
...,...,...,...,...,...
1020,57,150,126,1,42.592105
1021,60,125,258,0,56.140231
1022,47,110,275,0,48.097526
1023,57,150,126,1,42.592105


In [97]:
# Summary statistics of the computed risk scores.
print("Fuzzy Logic 'Risk' column details:")
df["risk"].describe()

Fuzzy Logic 'Risk' column details:


count    1025.000000
mean       55.128841
std         7.339916
min        33.337867
25%        50.661215
50%        54.808805
75%        56.947607
max        83.125154
Name: risk, dtype: float64