In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
import warnings
# Suppress every Python warning for the rest of the session (presumably to keep cell outputs uncluttered).
# NOTE(review): this also hides deprecation warnings raised by pandas / scikit-learn; confirm that is intended.
warnings.filterwarnings("ignore")

# Questions

1. Does age have an impact on the likelihood of a stroke?
2. Does a person's body mass index (BMI) contribute to the likelihood of having a stroke?
3. Do a person's glucose levels influence the likelihood of a stroke?
4. Is smoking a contributing factor to having a stroke?
5. Do high stress levels increase the chance of a stroke?
6. Does marriage or workload contribute to high blood pressure?

# Dataset description

In this section we will load the data, inspect missing values, and remove any unnecessary features. The dataset contains 11 columns (an identifier plus 10 features) and one binary target.

- id: identification number of the person.
- gender: gender of the person.
- age: age of the person.
- hypertension: does the person have hypertension?
- heart_disease: does the person have heart disease?
- ever_married: marital status of the person.
- work_type: work status of the person.
- Residence_type: residence type of the person.
- avg_glucose_level: average glucose level of the person.
- bmi: body mass index of the person.
- smoking_status: smoking status of the person.
- stroke: did the person suffer a stroke? (target)

# Data Cleaning

In [2]:
# Load the raw stroke dataset; the relative path means the CSV is looked up in the current working directory.
df=pd.read_csv("healthcare-dataset-stroke-data.csv")
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


The id column is only a per-row identifier, much like the row index of the dataset, so we can drop it without losing any meaningful information.

In [3]:
# Remove the identifier column; it carries no predictive information.
df = df.drop(columns="id")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# One column is the target, so the feature count is the column count minus one.
n_instances, n_columns = df.shape
print (f' We have {n_instances} instances with {n_columns-1} features and 1 target variable')

 We have 5110 instances with 10 features and 1 target variable


In [5]:
# Keep the current column names as a plain Python list.
df_cols = df.columns.tolist()
print(df_cols)

['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']


In [6]:
# Show each column's dtype and non-null count; a non-null count below the row total reveals missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [7]:
# Count distinct values per column; low counts point to columns that are categorical in nature.
df.nunique()

gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

In [8]:
# Every column that is not float64 (text and 0/1 integer columns alike) is
# treated as categorical and converted column by column.
categorical = df.select_dtypes(exclude="float64").columns
df = df.astype({name: "category" for name in categorical})
df.dtypes

gender               category
age                   float64
hypertension         category
heart_disease        category
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke               category
dtype: object

In [9]:
# Collect the distinct values of the three columns whose labels get tidied
# later, in the order they are first encountered in the data.
smoking_status_val, work_type_val, married_val = (
    list(df[column].unique())
    for column in ('smoking_status', 'work_type', 'ever_married')
)
print(smoking_status_val, work_type_val, married_val, sep="\n")

['formerly smoked', 'never smoked', 'smokes', 'Unknown']
['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked']
['Yes', 'No']


In [10]:
# Display-friendly replacement labels. Each list is ordered to line up, item by item,
# with the raw values: 'formerly smoked', 'never smoked', 'smokes', 'Unknown'.
new_smoking=["Ex-Smoker","Never Smoked","Current Smoker","Unknown"]
# Raw values, in order: 'Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'.
new_work_type=["Private","Self-Employed","Government","Child","Unemployed"]
# Raw values, in order: 'Yes', 'No' (so 1 = has been married, 0 = has not).
new_married=[1,0]

In [11]:
# Explicit old -> new label mappings. Spelling out every pair keeps the renaming
# correct no matter in which order the values happen to appear in the data;
# pairing two lists by position would silently mislabel categories if that
# order ever changed.
smoking_rename = {
    'formerly smoked': 'Ex-Smoker',
    'never smoked': 'Never Smoked',
    'smokes': 'Current Smoker',
    'Unknown': 'Unknown',
}
work_rename = {
    'Private': 'Private',
    'Self-employed': 'Self-Employed',
    'Govt_job': 'Government',
    'children': 'Child',
    'Never_worked': 'Unemployed',
}
# 1 = has been married, 0 = has not.
married_rename = {'Yes': 1, 'No': 0}
print(smoking_rename)
print(work_rename)
print(married_rename)

{'formerly smoked': 'Ex-Smoker', 'never smoked': 'Never Smoked', 'smokes': 'Current Smoker', 'Unknown': 'Unknown'}
{'Private': 'Private', 'Self-employed': 'Self-Employed', 'Govt_job': 'Government', 'children': 'Child', 'Never_worked': 'Unemployed'}
{'Yes': 1, 'No': 0}


In [12]:
# Apply the old -> new label mapping to the categories of each tidied column.
for column, mapping in {
    "smoking_status": smoking_rename,
    "work_type": work_rename,
    "ever_married": married_rename,
}.items():
    df[column] = df[column].cat.rename_categories(mapping)

In [13]:
# Binary 0/1 columns become booleans; age becomes a whole number
# (the float -> int64 cast discards any fractional part).
boolean_cols = ["hypertension", "heart_disease", "ever_married", "stroke"]
dtype_plan = {column: "bool_" for column in boolean_cols}
dtype_plan["age"] = "int64"
df = df.astype(dtype_plan)
df.dtypes

gender               category
age                     int64
hypertension             bool
heart_disease            bool
ever_married             bool
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                   bool
dtype: object

In [14]:
def missing(df):
    """Summarise the missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'No. Of Missing Instances' (number of null entries) and
        'Percentage Missing' (null entries divided by the number of rows,
        i.e. a fraction between 0 and 1, not multiplied by 100).
        Columns with the most missing values come first.
    """
    null_mask = df.isnull()
    per_column_count = null_mask.sum().sort_values(ascending=False)
    per_column_share = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    return pd.concat(
        [per_column_count, per_column_share],
        axis=1,
        keys=['No. Of Missing Instances', 'Percentage Missing'],
    )

In [15]:
# Tabulate the null count and null fraction of every column before imputing.
missing(df)

Unnamed: 0,No. Of Missing Instances,Percentage Missing
bmi,201,0.039335
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0
avg_glucose_level,0,0.0
smoking_status,0,0.0


In [16]:
# bmi is the only column with gaps and the dataset is small, so the missing
# entries are filled with the column median rather than dropping those rows.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
df['bmi'] = imputer.fit_transform(df[['bmi']])
# Re-check: no column should report missing values any more.
missing(df)

Unnamed: 0,No. Of Missing Instances,Percentage Missing
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0
avg_glucose_level,0,0.0
bmi,0,0.0
smoking_status,0,0.0


In [17]:
#tidying column headers
# The new names are listed in the same order as the existing columns.
tidycols=['Gender', 'Age', 'Hypertension', 'Heart Disease', 
          'Marital Status', 'Work Type', 'Residence Type', 'Average Glucose Level', 
          'BMI', 'Smoking Status', 'Stroke']

# Assign the flat list itself. A nested list (a list containing this list) would make
# pandas build a one-level MultiIndex, after which df["Age"] returns a DataFrame
# instead of a Series and ordinary column access breaks downstream.
df.columns=tidycols
df

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Marital Status,Work Type,Residence Type,Average Glucose Level,BMI,Smoking Status,Stroke
0,Male,67,False,True,True,Private,Urban,228.69,36.6,Ex-Smoker,True
1,Female,61,False,False,True,Self-Employed,Rural,202.21,28.1,Never Smoked,True
2,Male,80,False,True,True,Private,Rural,105.92,32.5,Never Smoked,True
3,Female,49,False,False,True,Private,Urban,171.23,34.4,Current Smoker,True
4,Female,79,True,False,True,Self-Employed,Rural,174.12,24.0,Never Smoked,True
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80,True,False,True,Private,Urban,83.75,28.1,Never Smoked,False
5106,Female,81,False,False,True,Self-Employed,Urban,125.20,40.0,Never Smoked,False
5107,Female,35,False,False,True,Self-Employed,Rural,82.99,30.6,Never Smoked,False
5108,Male,51,False,False,True,Private,Rural,166.29,25.6,Ex-Smoker,False


In [18]:
# Persist the cleaned frame to the working directory; index=False leaves the row index out of the file.
df.to_csv("stroke_dataset_clean.csv",index=False)