# Import Library

In [84]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load Dataset

In [85]:
df = pd.read_csv("./data/stroke_dataset.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Data Exploration

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [87]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,5110.0,36517.829354,21161.721625,67.0,17741.25,36932.0,54682.0,72940.0
age,5110.0,43.226614,22.612647,0.08,25.0,45.0,61.0,82.0
hypertension,5110.0,0.097456,0.296607,0.0,0.0,0.0,0.0,1.0
heart_disease,5110.0,0.054012,0.226063,0.0,0.0,0.0,0.0,1.0
avg_glucose_level,5110.0,106.147677,45.28356,55.12,77.245,91.885,114.09,271.74
bmi,4909.0,28.893237,7.854067,10.3,23.5,28.1,33.1,97.6
stroke,5110.0,0.048728,0.21532,0.0,0.0,0.0,0.0,1.0


In [88]:
df["work_type"].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [89]:
df["Residence_type"].unique()

array(['Urban', 'Rural'], dtype=object)

In [90]:
df["smoking_status"].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [91]:
df.groupby('gender')['age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,2994.0,43.757395,21.966561,0.08,27.0,44.0,61.0,82.0
Male,2115.0,42.483385,23.484066,0.08,22.0,46.0,61.0,82.0
Other,1.0,26.0,,26.0,26.0,26.0,26.0,26.0


# Data Cleaning

In [92]:
df.drop("id", axis = 1, inplace=True)

In [93]:
df = df.drop(df[df['gender'] == 'Other'].index)

In [94]:
df.duplicated().sum()

0

In [95]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [96]:
df['bmi'].fillna(df['bmi'].mean(),inplace=True)
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

# Data Pre-processing

Label Encoding

In [97]:
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['ever_married'] = label_encoder.fit_transform(df['ever_married'])
df['work_type'] = label_encoder.fit_transform(df['work_type'])
df['Residence_type'] = label_encoder.fit_transform(df['Residence_type'])
df['smoking_status'] = label_encoder.fit_transform(df['smoking_status'])

In [98]:
df.sample(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
4121,1,79.0,0,0,1,3,1,113.41,35.0,2,0
3684,0,7.0,0,0,0,4,0,89.38,19.0,0,0
892,0,27.0,0,0,0,2,0,104.33,20.1,2,0
252,0,70.0,0,0,1,2,0,69.04,35.9,1,0
4632,0,53.0,0,0,1,2,0,102.0,32.4,2,0
3065,0,61.0,0,0,1,2,1,125.74,32.6,0,0
5049,0,41.0,0,0,1,2,0,91.04,24.5,2,0
586,0,42.0,0,0,0,2,0,139.77,27.7,0,0
628,0,20.0,0,0,0,0,0,73.0,20.8,2,0
4810,1,53.0,0,0,1,2,1,142.64,27.8,3,0


Scaling

In [99]:
df.nunique()

gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3978
bmi                   419
smoking_status          4
stroke                  2
dtype: int64

In [100]:
scaler = MinMaxScaler()

for col_name in df.select_dtypes(include=["float64", "int64"]).columns:
    if df[col_name].nunique() > 5:
        df[col_name] = scaler.fit_transform(df[[col_name]])

In [101]:
df.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,0.816895,0,1,1,2,1,0.801265,0.30126,1,1
1,0,0.743652,0,0,1,3,0,0.679023,0.212996,2,1
2,1,0.975586,0,1,1,2,0,0.234512,0.254296,2,1
3,0,0.597168,0,0,1,2,1,0.536008,0.27606,3,1
4,0,0.963379,1,0,1,3,0,0.549349,0.15693,2,1


Feature and Target Separation

In [102]:
X = df.drop("stroke",axis=1)
y = df['stroke']

Train-Test Split

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [105]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
802,1,0.963379,0,0,1,3,0,0.265534,0.208477,1
3927,0,0.755859,0,0,1,2,1,0.153264,0.297824,0
2337,0,0.255371,0,0,0,2,0,0.020312,0.268041,2
3910,1,0.377441,0,0,1,0,1,0.048841,0.230241,1
1886,0,0.377441,0,0,0,2,0,0.020820,0.109966,2
...,...,...,...,...,...,...,...,...,...,...
4427,0,0.743652,0,0,1,2,0,0.410950,0.223368,2
466,0,0.743652,1,0,1,2,0,0.530560,0.571592,3
3092,0,0.013184,0,0,0,4,1,0.194627,0.085911,0
3773,0,0.462891,0,0,1,0,0,0.179162,0.128293,2


In [104]:
df.to_csv("./data/data_cleaned.csv", index=False)