In [1]:
import pandas as pd
import numpy as np

In [2]:
stroke = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
stroke["stroke"].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [4]:
from sklearn.utils import resample

stroke_class_0 = stroke[stroke['stroke'] == 0]
stroke_class_1 = stroke[stroke['stroke'] == 1]

stroke_class_1_upsampled = resample(stroke_class_1,
                                    replace=True, 
                                    n_samples=len(stroke_class_0), 
                                    random_state=42)  

stroke_upsampled = pd.concat([stroke_class_0, stroke_class_1_upsampled])
stroke = stroke_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,40513,Female,21.0,0,0,No,Private,Urban,90.16,28.9,smokes,0
1,13861,Female,52.0,1,0,Yes,Self-employed,Urban,233.29,48.9,never smoked,1
2,12857,Male,55.0,0,0,Yes,Self-employed,Rural,73.57,28.0,smokes,0
3,38673,Female,51.0,0,0,Yes,Private,Rural,105.63,32.8,never smoked,0
4,71673,Female,79.0,0,0,Yes,Private,Urban,110.85,24.1,formerly smoked,1


In [5]:
stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9722 entries, 0 to 9721
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 9722 non-null   int64  
 1   gender             9722 non-null   object 
 2   age                9722 non-null   float64
 3   hypertension       9722 non-null   int64  
 4   heart_disease      9722 non-null   int64  
 5   ever_married       9722 non-null   object 
 6   work_type          9722 non-null   object 
 7   Residence_type     9722 non-null   object 
 8   avg_glucose_level  9722 non-null   float64
 9   bmi                8750 non-null   float64
 10  smoking_status     9722 non-null   object 
 11  stroke             9722 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 911.6+ KB


In [6]:
stroke = stroke.set_index("id")

In [7]:
stroke.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,9722.0,9722.0,9722.0,9722.0,8750.0,9722.0
mean,54.916692,0.174861,0.122094,118.379963,29.506651,0.5
std,22.238642,0.379868,0.327411,55.194973,7.187328,0.500026
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,41.0,0.0,0.0,78.1125,24.7,0.0
50%,59.0,0.0,0.0,96.715,28.6,0.5
75%,74.0,0.0,0.0,144.9,33.2,1.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [8]:
stroke["stroke"].value_counts()

stroke
0    4861
1    4861
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(stroke, test_size=0.2, random_state=42)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ])

In [13]:
from sklearn.compose import ColumnTransformer

num_attribs = ["age","avg_glucose_level","bmi"]
cat_attribs = ["gender","hypertension","heart_disease","Residence_type","smoking_status", "ever_married", "work_type"]

preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [14]:
X_train = preprocess_pipeline.fit_transform(train_data[num_attribs + cat_attribs])
X_train



array([[ 0.58948836, -0.70070967, -1.07163301, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.17189936, -0.48434886,  0.29916428, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.12709852, -0.86840739, -0.9226333 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.12732519, -1.00679871,  1.26766236, ...,  0.        ,
         0.        ,  0.        ],
       [-0.66493536, -0.87274908, -0.2670346 , ...,  0.        ,
         1.        ,  0.        ],
       [-1.42654975, -0.44455005,  0.26936434, ...,  1.        ,
         0.        ,  0.        ]])

In [15]:
y_train = train_data["stroke"].copy()

In [16]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

In [17]:
from sklearn.metrics import mean_squared_error

X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])
y_test = test_data["stroke"].copy()
y_pred = forest_clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

0.08781846196678006

In [18]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.9880431294313043

In [19]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

forest_scores = cross_val_score(forest_clf, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [0.12926528 0.11337315 0.1478205  0.10140403 0.10755521 0.09485478
 0.11337315 0.1076244  0.0717496  0.08787496]
Mean: 0.1074895040309519
Standard deviation: 0.020071798421559962
