# Stroke Data Preprocessing

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw stroke dataset from its CSV file into a DataFrame
raw_csv_path = 'data/healthcare-dataset-stroke-data.csv'
stroke_df = pd.read_csv(raw_csv_path)

## Data Preprocessing

### Deal with nulls

In the output of the following cell, we see that all features except `bmi` are free of nulls.

We're going to replace these null values with the median value for `bmi`.

In [3]:
# Per-column null counts, computed once and reused for the summary line below
null_counts = stroke_df.isnull().sum()
print(null_counts)
print(stroke_df.shape)

# Total null cells divided by the row count. This is an upper bound on the share
# of rows containing a null, because a row with several nulls is counted once per null.
null_pct = round(100 * null_counts.sum() / len(stroke_df), 3)
print('Maximum percentage of rows with null values in dataset: ' + str(null_pct) + '%')

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
(5110, 12)
Maximum percentage of rows with null values in dataset: 3.933%


In [4]:
# Fill missing `bmi` values with the median of the observed `bmi` values
# (`Series.median` skips nulls by default).
# `fillna` targets missing entries directly and avoids the `np.NaN` alias,
# which was removed in NumPy 2.0 (only `np.nan` remains).
bmi_median = stroke_df['bmi'].median()
stroke_df['bmi'] = stroke_df['bmi'].fillna(bmi_median)

# Confirm that no nulls remain and that the frame's shape is unchanged
print(stroke_df.isnull().sum())
print(stroke_df.shape)

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
(5110, 12)


### One hot encoding

Splits each categorical feature into a set of binary indicator columns, one per category.

In [5]:
# Names of every column whose dtype is `object` (the string-valued features)
str_cols = [
    col_name
    for col_name, col_dtype in stroke_df.dtypes.items()
    if col_dtype == 'object'
]

# Expand each of those columns into one indicator column per distinct value
stroke_df = pd.get_dummies(stroke_df, columns=str_cols)

### Split data

The data is split into training and testing sets, with a ratio of roughly 2/3 to 1/3 respectively (`test_size=0.33`).

In [6]:
# The target is the `stroke` column; every remaining column is a feature
y = stroke_df['stroke']
x = stroke_df.drop(columns='stroke')

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Persist each split to its own CSV file, omitting the index column
split_outputs = {
    'data/x_train.csv': x_train,
    'data/x_test.csv': x_test,
    'data/y_train.csv': y_train,
    'data/y_test.csv': y_test,
}
for csv_path, split_data in split_outputs.items():
    split_data.to_csv(csv_path, index=False)