In [121]:
import pandas as pd 
import numpy as np
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from matplotlib import pyplot as plt

In [122]:
# Read the raw stroke dataset and drop its 'id' column.
df = (
    pd.read_csv('./raw_data/dataset-stroke.csv')
    .drop(columns=['id'])
)

In [123]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [124]:
# Number of rows whose stroke flag equals 1.
int((df['stroke'] == 1).sum())

249

In [125]:
# Split off the prediction target: pop() removes 'stroke' from df in place
# and hands the column back, which is stored as a NumPy array.
labels = df.pop('stroke').to_numpy()

In [126]:
# Columns treated as categorical
cat_ft = [
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
# Every remaining column is treated as numerical (original column order kept)
num_ft = [column for column in df.columns if column not in cat_ft]

In [127]:
# Encode gender numerically: 'Male' -> 1, every other value -> 2.
# The dtype guard makes the cell safe to re-run: once the column is numeric no
# value equals 'Male', so an unguarded second run would silently turn every row into 2.
if not pd.api.types.is_numeric_dtype(df['gender']):
    df['gender'] = df['gender'].apply(lambda value: 1 if value == 'Male' else 2)

# Pipeline construction

### Feature Augmentation

In [128]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
class AugmentFeatures(BaseEstimator, TransformerMixin):
    '''
    Derive BMI-deviation features from the numerical feature matrix.

    During ``fit`` the mean BMI of every age band is learned from the data that
    is passed in (not from any global frame). During ``transform`` two values
    are computed per row: the squared deviation of the row's BMI from the
    expectation of its own age band, and that squared deviation multiplied by
    the row's age.

    Background reference left by the original author:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681110/

    Parameters
    ----------
    fts : list of str
        Names of the columns of the array given to ``fit``/``transform``, in
        column order. Must contain 'age' and 'bmi'.
    append : bool, default=False
        When False, ``transform`` returns ``X`` untouched, so the number of
        output columns is unchanged. When True, the two derived columns
        (squared BMI deviation, then age * squared BMI deviation) are appended
        on the right; any list of feature names kept elsewhere must then be
        extended by two entries.
    window : int, default=5
        Half-width of an age band. The band for integer age ``a`` covers
        ``a - window <= age < a + window``.
    '''
    def __init__(self, fts, append=False, window=5):
        # Parameters are stored unmodified under their own names so that
        # get_params / set_params / clone from BaseEstimator keep working.
        self.fts = fts
        self.append = append
        self.window = window

    def fit(self, X, y=None):
        '''
        Learn the expected BMI per integer age band from ``X``.

        Parameters
        ----------
        X : array-like, columns ordered as ``fts``
        y : ignored

        Returns
        -------
        self
        '''
        X = np.asarray(X, dtype=float)
        ages = X[:, self.fts.index('age')]
        bmis = X[:, self.fts.index('bmi')]

        # Rows with a missing BMI are left out of every mean.
        known_bmi = ~np.isnan(bmis)
        # Fallback for age bands that were never seen during fit.
        self.overall_bmi_ = bmis[known_bmi].mean() if known_bmi.any() else np.nan

        # One expectation per distinct truncated age, computed once each.
        self.expected_bmi_ = {}
        for band in np.unique(ages[~np.isnan(ages)].astype(int)):
            in_band = (
                known_bmi
                & (ages >= band - self.window)
                & (ages < band + self.window)
            )
            self.expected_bmi_[int(band)] = (
                bmis[in_band].mean() if in_band.any() else self.overall_bmi_
            )
        return self

    def transform(self, X):
        '''
        Return ``X``, optionally extended by the two derived columns.

        Parameters
        ----------
        X : array-like, columns ordered as ``fts``. With ``append=True`` the
            age column must not contain missing values.

        Returns
        -------
        ``X`` itself when ``append`` is False; otherwise a new float array with
        two extra columns on the right.

        Raises
        ------
        AttributeError
            If ``append`` is True and ``fit`` has not been called.
        '''
        if not self.append:
            return X
        if not hasattr(self, 'expected_bmi_'):
            raise AttributeError(
                'AugmentFeatures must be fitted before transform when append=True'
            )

        X = np.asarray(X, dtype=float)
        ages = X[:, self.fts.index('age')]
        bmis = X[:, self.fts.index('bmi')]

        # Look up the expectation of each row's OWN age band.
        expected = np.array(
            [self.expected_bmi_.get(int(age), self.overall_bmi_) for age in ages],
            dtype=float,
        )

        # squared difference from the expected bmi
        sq_dev_bmi = (bmis - expected) ** 2
        # cross the deviation from the expected bmi with age
        dev_x_age = ages * sq_dev_bmi

        return np.column_stack([X, sq_dev_bmi, dev_x_age])

### Feature Imputation

In [129]:
# Fill in missing values from the 2 nearest neighbors
# (with its default uniform weights, KNNImputer averages their values).
N_NEIGHBORS = 2
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)

### Final Pipeline

In [130]:
# Numerical branch: impute missing values, run the augmentation step, standardise.
num_pipe = Pipeline(steps=[
    ('imputer', imputer),
    ('ft_aug', AugmentFeatures(fts=num_ft)),
    ('std_scaler', StandardScaler()),
])

# Route each column group through its own transformer.
# Output columns follow the listing order: "cat" columns first, then "num".
full_pipe = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), cat_ft),
    ('num', num_pipe, num_ft),
])

In [131]:
# Stratified 80/20 split. random_state is fixed so the split, and every saved
# artefact derived from it, is identical after Restart & Run All.
ft_train, ft_test, lbl_train, lbl_test = train_test_split(
    df,
    labels,
    test_size=0.20,
    stratify=labels,
    random_state=42,
)

In [132]:
# Fit the preprocessing pipeline on the complete dataset and keep its output.
features_full = full_pipe.fit_transform(X=df)

In [133]:
# Learn imputation neighbours, scaling statistics and one-hot categories from
# the training split only ...
features_train = full_pipe.fit_transform(ft_train)
# ... and apply those already-fitted parameters to the held-out split.
# Calling fit_transform here would refit everything on the test data: train and
# test would be scaled inconsistently and could end up with different columns.
features_test = full_pipe.transform(ft_test)

In [134]:
# full_pipe lists the "cat" transformer before "num", and ColumnTransformer
# emits its output columns in that order, so the one-hot category names must
# come first and the numerical names last to line up with the feature matrix.
cat_names = [
    category
    for categories in full_pipe.named_transformers_["cat"].categories_
    for category in categories
]
feature_names = np.array(cat_names + num_ft)

## Save results of pipeline

In [135]:
# Target path -> array, written in the listed order.
outputs = {
    "./processed_data/features_train": features_train,
    "./processed_data/features_test": features_test,
    "./processed_data/features": features_full,
    "./processed_data/feature_names": feature_names,
    "./processed_data/labels_train": lbl_train,
    "./processed_data/labels_test": lbl_test,
    "./processed_data/labels": labels,
}
for path, array in outputs.items():
    np.save(path, array)