In [106]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import f1_score
#normalizing data
from sklearn.preprocessing import StandardScaler
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score
#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets
import json

In [107]:
df = pd.read_csv("./full_data.csv")
df.head(4)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [108]:
# convert to digit categorical features
def mapping_val(column_name):
    dict_val = df.value_counts(column_name).to_dict()
    
    i = 0
    for k in dict_val:
        dict_val[k] = i
        i += 1
    
    df[column_name] = df[column_name].map(dict_val)

In [109]:
# get and save category features 
def get_category(column_name):
    dict_val = df.value_counts(column_name).to_dict()
    
    i = 0
    for k in dict_val:
        dict_val[k] = i
        i += 1
    with open(column_name, 'w') as f:
        json.dump(dict_val, f)

In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


пустых значений нет - это очень хорошо.

In [111]:
df.value_counts('stroke')

stroke
0    4733
1     248
dtype: int64

У нас дисбаланс классов, чтобы решить эту проблему будем использовать модель RandomForest, так как модели на базе деревьев показывают хорошие результаты обучения даже в таких случая, когда один класс значительно преобладает в наборе.

Обработку данных построим по следующему плану:
1. gender, ever_married, work_type, Residence_type, smoking_status - onehot encoding

В качестве модели будем использовать RandomForestClassifier


In [112]:
categorical_features = ['hypertension', 'heart_disease']
number_features = ['age', 'avg_glucose_level', 'bmi']
onehot_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
target = 'stroke'

In [113]:
for cat_col in onehot_features:
    get_category(cat_col)

In [114]:
for cat_col in onehot_features:
    mapping_val(cat_col)

In [115]:
df.head(15)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,0,0,0,228.69,36.6,2,1
1,1,80.0,0,1,0,0,1,105.92,32.5,0,1
2,0,49.0,0,0,0,0,0,171.23,34.4,3,1
3,0,79.0,1,0,0,1,1,174.12,24.0,0,1
4,1,81.0,0,0,0,0,0,186.21,29.0,2,1
5,1,74.0,1,1,0,0,1,70.09,27.4,0,1
6,0,69.0,0,0,1,0,0,94.39,22.8,0,1
7,0,78.0,0,0,0,0,0,58.57,24.2,1,1
8,0,81.0,1,0,0,0,1,80.43,29.7,0,1
9,0,61.0,0,1,0,3,1,120.46,36.8,3,1


In [116]:
scaler = StandardScaler()

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, x):
        return x[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x[self.column] = scaler.fit_transform(self.column)
        return x

class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [117]:
x_train, x_test, y_train, y_test = train_test_split(df[categorical_features + number_features + onehot_features], 
                                                    df[target], test_size=0.30, random_state=42)

In [118]:
x_train, y_train

(      hypertension  heart_disease    age  avg_glucose_level   bmi  gender  \
 4260             0              0  56.00              98.14  32.7       0   
 1658             0              0  35.00              82.81  23.9       1   
 1151             0              0  15.00              78.59  25.1       1   
 3975             0              0  55.00              80.35  28.7       1   
 647              0              0  13.00              63.26  19.5       0   
 ...            ...            ...    ...                ...   ...     ...   
 4426             0              0  54.00             231.54  29.9       0   
 466              0              0  38.00              70.92  41.6       0   
 3092             0              0  13.00              90.60  16.9       1   
 3772             0              0   0.48             118.75  17.4       0   
 860              0              0  57.00             128.28  34.2       0   
 
       ever_married  work_type  Residence_type  smoking_status

In [153]:
pipe = Pipeline([('scaler', StandardScaler()),('classifier', RandomForestClassifier(random_state=42))])

In [154]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier(random_state=42))])

In [164]:
score = pipe.score(x_test, y_test)
score

0.945819397993311