In [19]:
# Capstone Project: Stroke Classification Modeling #
# Student Name: Taesun Yoo
##################
# Data Wrangling #
##################
# import libraries:
import pandas as pd
import numpy as np

# Read train data:
df = pd.read_csv('stroke_train.csv')
# Check dataframe info.:
df.info()

# Print the dataframe head:
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
id                   43400 non-null int64
gender               43400 non-null object
age                  43400 non-null float64
hypertension         43400 non-null int64
heart_disease        43400 non-null int64
ever_married         43400 non-null object
work_type            43400 non-null object
Residence_type       43400 non-null object
avg_glucose_level    43400 non-null float64
bmi                  41938 non-null float64
smoking_status       30108 non-null object
stroke               43400 non-null int64
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB
      id  gender   age  hypertension  heart_disease ever_married  \
0  30669    Male   3.0             0              0           No   
1  30468    Male  58.0             1              0          Yes   
2  16523  Female   8.0             0              0           No   
3  56543  Female  70.0             0      

In [20]:
# Check number of missing values:
df.isnull().sum()

# Compute missing value stat.: pre-data cleansing
df_missing = df.isnull().sum()
df_missing = pd.DataFrame(df_missing)
df_missing.columns = ['missing_value_counts']
df_missing['missing_PCT'] = (df_missing['missing_value_counts']/43400)*100

# sort values by missing percent(%):
df_missing = df_missing.sort_values(by='missing_PCT',ascending=False)
df_missing

# sort values by missing percent(%):
df_missing = df_missing.sort_values(by='missing_PCT',ascending=False)
df_missing

Unnamed: 0,missing_value_counts,missing_PCT
smoking_status,13292,30.626728
bmi,1462,3.368664
id,0,0.0
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0


In [21]:
# Compute summary stat.: pre-data cleansing 
df_summary_pre = df.describe().T
# Reorder columns order in summary dataframe:
df_summary_pre = df_summary_pre[['count','min','max','mean','25%','50%','75%','std']]
# Keep only numerical (scale) features:
df_summary_pre = df_summary_pre.drop(['id','heart_disease','hypertension','stroke'])
df_summary_pre

Unnamed: 0,count,min,max,mean,25%,50%,75%,std
age,43400.0,0.08,82.0,42.217894,24.0,44.0,60.0,22.519649
avg_glucose_level,43400.0,55.0,291.05,104.48275,77.54,91.58,112.07,43.111751
bmi,41938.0,10.1,97.6,28.605038,23.2,27.7,32.9,7.77002


In [22]:
# Handling outliers #
###############################################################################
# Compute IQR, LowerBound and UpperBound for age:
age_IQR = df_summary_pre['75%']['age'] - df_summary_pre['25%']['age']
age_LB =  df_summary_pre['25%']['age'] - 1.5*age_IQR
age_UB =  df_summary_pre['75%']['age'] + 1.5*age_IQR

# Compute IQR, LowerBound and UpperBound for bmi:
bmi_IQR = df_summary_pre['75%']['bmi'] - df_summary_pre['25%']['bmi']
bmi_LB =  df_summary_pre['25%']['bmi'] - 1.5*bmi_IQR
bmi_UB =  df_summary_pre['75%']['bmi'] + 1.5*bmi_IQR

# Compute IQR, LowerBound and UpperBound for avg_glucose_level:
glucose_IQR = df_summary_pre['75%']['avg_glucose_level'] - df_summary_pre['25%']['avg_glucose_level']
glucose_LB =  df_summary_pre['25%']['avg_glucose_level'] - 1.5*glucose_IQR
glucose_UB =  df_summary_pre['75%']['avg_glucose_level'] + 1.5*glucose_IQR

# Create pd series of age, bmi and glucose_level: min, max, LB and UB values
###############################################################################
out_age = pd.DataFrame([df_summary_pre['min']['age'], df_summary_pre['max']['age'], age_LB, age_UB])
out_bmi = pd.DataFrame([df_summary_pre['min']['bmi'], df_summary_pre['max']['bmi'], bmi_LB, bmi_UB])
out_glucose = pd.DataFrame([df_summary_pre['min']['avg_glucose_level'], df_summary_pre['max']['avg_glucose_level'], glucose_LB, glucose_UB])

# Create a dataframe for detecting outliers:
df_outliers = pd.concat([out_age,out_bmi,out_glucose], axis=1)
df_outliers.columns = ['age','bmi','avg_glucose_level']
df_outliers.index = ['min','max','LB','UB']
df_outliers.T

Unnamed: 0,min,max,LB,UB
age,0.08,82.0,-30.0,114.0
bmi,10.1,97.6,8.65,47.45
avg_glucose_level,55.0,291.05,25.745,163.865


In [23]:
# Based on above results, outliers exist beyond UB values of bmi & avg_glucose_level
# Test outliers removal: X > upperbound of bmi values: removes 880 rows
df.drop(df[df['bmi'] > bmi_UB].index)
# Test outliers removal: X > upperbound of avg_glucose_level values: removes 4978 rows
df.drop(df[df['avg_glucose_level'] > glucose_UB].index)

# Remove outliers using "bmi" upperbound:
df = df.drop(df[df['bmi'] > bmi_UB].index)

# Drop meaningless feature 'id'
df = df.drop(['id'], axis=1)

In [24]:
# Ordinal feature encoding #
# Define manual map: smoking_status
smoking_status_map = {'never smoked': 0,
                      'formerly smoked': 1,
                      'smokes': 2}

# Encode features with smoking_status_map:
df['smoking_status'] = df['smoking_status'].map(smoking_status_map)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,0.0,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,1.0,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [25]:
# Nominal feature encoding #
# Define manual map for hypertension and heart_disease:
hypertension_map = {0: 'No', 1: 'Yes'}
heart_disease_map = {0: 'No', 1: 'Yes'}

# Encode binary flag (0/1) as 'Yes or No' for consistency:
df['hypertension'] = df['hypertension'].map(hypertension_map)
df['heart_disease'] = df['heart_disease'].map(heart_disease_map)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,No,No,No,children,Rural,95.12,18.0,,0
1,Male,58.0,Yes,No,Yes,Private,Urban,87.96,39.2,0.0,0
2,Female,8.0,No,No,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,No,No,Yes,Private,Rural,69.04,35.9,1.0,0
4,Male,14.0,No,No,No,Never_worked,Rural,161.28,19.1,,0


In [26]:
# Split inputs and output label #
# take label as a series
Y = df['stroke']
Y.columns = ['stroke']

# take inputs
# drop 'stroke' from df1 and take an array of values:
X = df.drop(['stroke'], axis=1).values

# Feature Imputation #
# import libraries
from sklearn.preprocessing import Imputer
###############################################################################
# Imputation: bmi (scale feature) by median
# Column index = 8
imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis=0)
imputer = imputer.fit(X[:, 8:9])
X[:,8:9] = imputer.transform(X[:, 8:9])

# Imputation: smoking_status (ordinal feature) by mode
# Column index = 9
imputer = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis=0)
imputer = imputer.fit(X[:, 9:10])
X[:,9:10] = imputer.transform(X[:, 9:10])

# Prepare two dataframes: cleansed and copy 
###############################################################################
# Column indexes 
columnX = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status']

# Row indexes
idx = Y.index

# Cleansed dataframe:
X1 = pd.DataFrame(X, columns=columnX, index=idx)
# Concatenated X1 cleaned inputs and output label as new dataframe:
df_cleaned = pd.concat([X1, Y], axis=1)
# Check any missing values on a new dataframe:
df_cleaned.isnull().sum()

# Convert data types for cleaned dataframe:
df_cleaned['gender'] = df_cleaned['gender'].astype('category')
df_cleaned['age'] = df_cleaned['age'].astype(float)
df_cleaned['hypertension'] = df_cleaned['hypertension'].astype('category')
df_cleaned['heart_disease'] = df_cleaned['heart_disease'].astype('category')
df_cleaned['ever_married'] = df_cleaned['ever_married'].astype('category')
df_cleaned['work_type'] = df_cleaned['work_type'].astype('category')
df_cleaned['Residence_type'] = df_cleaned['Residence_type'].astype('category')
df_cleaned['avg_glucose_level'] = df_cleaned['avg_glucose_level'].astype(float)
df_cleaned['bmi'] = df_cleaned['bmi'].astype(float)

# Make a new copy for dummy variable encoding:
df2 = df_cleaned.copy()
# Convert smoking status of new data frame:
df2['smoking_status'] = df2['smoking_status'].astype(int)

# Convert smoking_status to original string #
inv_smoking_status_map = {v: k for k, v in smoking_status_map.items()}
df_cleaned['smoking_status'] = df_cleaned['smoking_status'].map(inv_smoking_status_map)

In [27]:
# Write cleansed dataframe as .csv file for exploratory data analysis
df_cleaned.to_csv('stroke_train_cleaned.csv')

# Print cleaned dataframe:
print(df_cleaned.head)

<bound method NDFrame.head of        gender    age hypertension heart_disease ever_married      work_type  \
0        Male   3.00           No            No           No       children   
1        Male  58.00          Yes            No          Yes        Private   
2      Female   8.00           No            No           No        Private   
3      Female  70.00           No            No          Yes        Private   
4        Male  14.00           No            No           No   Never_worked   
6      Female  52.00           No            No          Yes        Private   
7      Female  75.00           No           Yes          Yes  Self-employed   
8      Female  32.00           No            No          Yes        Private   
10     Female  79.00           No            No          Yes       Govt_job   
11       Male  79.00           No           Yes          Yes        Private   
12     Female  37.00           No            No          Yes        Private   
13     Female  37.00  

In [28]:
###############################
# Preparation for ML modeling #
###############################

# Dummy variable encoding #
# Perform dummy variable encoding on nominal features
# Remove first dummy on each converted nominal features to avoid dummy variable trap!
df2 = pd.get_dummies(df2[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
                         'work_type', 'Residence_type', 'avg_glucose_level', 'bmi','smoking_status']],
                          drop_first=True)

# Print new dataframe for ML modeling:
print(df2.head())

    age  avg_glucose_level   bmi  smoking_status  gender_Male  gender_Other  \
0   3.0              95.12  18.0               0            1             0   
1  58.0              87.96  39.2               0            1             0   
2   8.0             110.89  17.6               0            0             0   
3  70.0              69.04  35.9               1            0             0   
4  14.0             161.28  19.1               0            1             0   

   hypertension_Yes  heart_disease_Yes  ever_married_Yes  \
0                 0                  0                 0   
1                 1                  0                 1   
2                 0                  0                 0   
3                 0                  0                 1   
4                 0                  0                 0   

   work_type_Never_worked  work_type_Private  work_type_Self-employed  \
0                       0                  0                        0   
1                 