# Import Libraries

In [60]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load Dataset

In [35]:
# Location of the raw stroke records, relative to the notebook's working directory.
dataset_path = "./data/stroke_dataset.csv"
df = pd.read_csv(dataset_path)

# Show the first five rows as a quick check that the columns parsed sensibly.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Data Exploration

In [36]:
# Print the column names, non-null counts and dtypes of the loaded frame;
# a non-null count below the row count flags a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [37]:
# Summary statistics of the numeric columns, transposed so that each
# column becomes one row (easier to read for many columns).
numeric_summary = df.describe()
numeric_summary.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,5110.0,36517.829354,21161.721625,67.0,17741.25,36932.0,54682.0,72940.0
age,5110.0,43.226614,22.612647,0.08,25.0,45.0,61.0,82.0
hypertension,5110.0,0.097456,0.296607,0.0,0.0,0.0,0.0,1.0
heart_disease,5110.0,0.054012,0.226063,0.0,0.0,0.0,0.0,1.0
avg_glucose_level,5110.0,106.147677,45.28356,55.12,77.245,91.885,114.09,271.74
bmi,4909.0,28.893237,7.854067,10.3,23.5,28.1,33.1,97.6
stroke,5110.0,0.048728,0.21532,0.0,0.0,0.0,0.0,1.0


In [38]:
# Distinct categories present in the work_type column.
pd.unique(df["work_type"])

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [39]:
# Distinct categories present in the Residence_type column.
pd.unique(df["Residence_type"])

array(['Urban', 'Rural'], dtype=object)

In [40]:
# Distinct categories present in the smoking_status column.
pd.unique(df["smoking_status"])

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [41]:
# Age distribution per gender; the `count` column also shows how many
# rows each gender value has.
age_by_gender = df.groupby('gender')['age']
age_by_gender.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,2994.0,43.757395,21.966561,0.08,27.0,44.0,61.0,82.0
Male,2115.0,42.483385,23.484066,0.08,22.0,46.0,61.0,82.0
Other,1.0,26.0,,26.0,26.0,26.0,26.0,26.0


# Data Cleaning

In [43]:
# Drop the `id` column, which is only a record identifier (see the preview above).
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["id"], errors="ignore")

In [44]:
# Remove the rows whose gender is 'Other' (a single row, per the groupby
# summary above) so that gender becomes a two-level feature.
other_gender_rows = df.index[df['gender'] == 'Other']
df = df.drop(index=other_gender_rows)

In [45]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

0

In [46]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [47]:
# Fill the missing BMI values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['bmi'] selection: that chained in-place form
# is deprecated in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
# NOTE(review): the mean is computed over every row, including rows that the
# train/test split at the end of the notebook later assigns to the test set.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

# Confirm that no column has missing values left.
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

# Data Pre-processing

Label Encoding

In [None]:
# Integer-encode every string-valued column.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> label mapping of every column stays available afterwards via
# label_encoders[col].classes_ / .inverse_transform. Re-fitting one shared
# encoder would keep only the mapping of the last column.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

label_encoders = {}
for col_name in categorical_cols:
    label_encoder = LabelEncoder()
    df[col_name] = label_encoder.fit_transform(df[col_name])
    label_encoders[col_name] = label_encoder
# After the loop `label_encoder` is the encoder fitted on 'smoking_status',
# the last column in the list.

In [52]:
# Spot-check ten random rows after encoding. The seed makes the displayed
# rows identical on every run (same seed as the train/test split).
df.sample(n=10, random_state=42)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
4026,0,64.0,0,0,1,0,1,76.12,38.2,1,0
4056,1,9.0,0,0,0,4,1,69.52,24.2,0,0
4424,0,7.0,0,0,0,4,0,69.47,18.9,0,0
4760,1,49.0,0,0,1,2,0,62.64,27.0,2,0
796,0,29.0,0,0,0,2,1,60.26,20.4,2,0
1197,0,14.0,0,0,0,2,0,84.46,21.8,0,0
2414,0,52.0,1,0,1,2,1,155.86,27.2,3,0
4206,0,12.0,0,0,0,4,0,111.08,23.2,2,0
464,1,25.0,0,0,1,2,0,95.59,25.1,2,0
3697,0,59.0,1,0,1,2,0,78.28,31.0,1,0


Scaling

In [53]:
# Number of distinct values per column; the scaling cell below uses this
# count to decide which numeric columns get scaled.
df.nunique()

gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3978
bmi                   419
smoking_status          4
stroke                  2
dtype: int64

In [57]:
# Min-max scale the continuous numeric columns (MinMaxScaler's default
# output range is [0, 1]).
# A numeric column is treated as continuous when it has more than
# MAX_CATEGORICAL_LEVELS distinct values; the binary flags and the
# label-encoded categoricals have at most that many and are left as they are.
MAX_CATEGORICAL_LEVELS = 5

numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
continuous_cols = [
    col_name
    for col_name in numeric_cols
    if df[col_name].nunique() > MAX_CATEGORICAL_LEVELS
]

# Fit ONE scaler on all selected columns together. MinMaxScaler scales each
# column independently, so the resulting values match a per-column re-fit,
# but the fitted scaler now holds the min/max of every scaled column instead
# of only the last one, so it can be reused on new data or inverted.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split at the end of the notebook, so test rows influence the scaling range.
scaler = MinMaxScaler()
if continuous_cols:  # fit_transform rejects a frame with zero columns
    df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

In [58]:
# First five rows after scaling (head() returns five rows by default).
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,0.816895,0,1,1,2,1,0.801265,0.30126,1,1
1,0,0.743652,0,0,1,3,0,0.679023,0.212996,2,1
2,1,0.975586,0,1,1,2,0,0.234512,0.254296,2,1
3,0,0.597168,0,0,1,2,1,0.536008,0.27606,3,1
4,0,0.963379,1,0,1,3,0,0.549349,0.15693,2,1


Feature and Target Separation

In [None]:
# Separate the label column from the predictors:
# y is the stroke column, X is every other column.
target_col = "stroke"
y = df[target_col]
X = df.drop(columns=[target_col])

Train-Test Split

In [61]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The target is heavily imbalanced (mean of `stroke` is about 0.049 in the
# describe() output above), so the split is stratified on y to keep the same
# positive rate in both the train and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)