In [1]:
import pandas as pd
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder

In [17]:
# Loading the dataset
df = pd.read_csv('../healthcare_dataset.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [18]:
# Filternig and handling missing or unwanted values
df = df.drop(columns=['id'])
df['bmi'] = df['bmi'].fillna(df['bmi'].median())
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [19]:
# Label encoding categorical features
data_categorical = df.select_dtypes(exclude=['number'])

encoder = LabelEncoder()
for col in data_categorical.columns.tolist():
    df[col] = encoder.fit_transform(df[col])

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,0,61.0,0,0,1,3,0,202.21,28.1,2,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [20]:
# Defining the features and target variables
x = df.drop(columns=['stroke'])
y = df['stroke']

In [26]:
# Calculating anova test
f_values, p_values = f_classif(x, y)
anova_results = pd.DataFrame({'Features': x.columns, 'F values': f_values, 'P values': p_values})
anova_results.sort_values(by=['F values','P values'], ascending=[False, True])

Unnamed: 0,Features,F values,P values
1,age,326.916568,7.030778e-71
3,heart_disease,94.698406,3.451927e-22
7,avg_glucose_level,90.50387,2.7678109999999998e-21
2,hypertension,84.953542,4.3675569999999995e-20
4,ever_married,60.66723,8.128659e-15
8,bmi,6.669168,0.009837071
5,work_type,5.340019,0.02088088
9,smoking_status,4.043033,0.04440649
6,Residence_type,1.220842,0.2692476
0,gender,0.407266,0.5233891


In [33]:
top_res = anova_results.head()
print("Anova test top 5 features are :\n", top_res.to_string(index=False))

Anova test top 5 features are :
      Features   F values     P values
       gender   0.407266 5.233891e-01
          age 326.916568 7.030778e-71
 hypertension  84.953542 4.367557e-20
heart_disease  94.698406 3.451927e-22
 ever_married  60.667230 8.128659e-15
