In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb


from path_setup import setup_paths
setup_paths()

In [44]:
from dataset_transformer import BaseDatasetTransform
from base_model_train import BaseModelTrain
from model_evaluator import ModelEvaluator

In [45]:
df = pd.read_csv('../datasets/healthcare-stroke-data.csv')

In [46]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [47]:
df.shape

(5110, 12)

In [48]:
bdf = BaseDatasetTransform(df,target='stroke')

In [49]:
df,categorical_features = bdf.fit_transform()


Удаление колонок с именем "id": ['id']
-------------------------------------------

В колонке stroke нет пропущенных значений
-------------------------------------------
В наборе данных есть пропущенные значения:
Колонки с пропущенными значениями:
bmi    201
dtype: int64
Все пропущенные значения заполнены.
-------------------------------------------
Информация о колонках в датасете

Категориальные колонки:
Index(['gender', 'ever_married', 'work_type', 'Residence_type',
       'smoking_status'],
      dtype='object')

Числовые колонки:
Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke'],
      dtype='object')


In [50]:
X_train, X_test, y_train, y_test  = bdf.get_train_test_split()

Количество значений целевой переменной по категориям
stroke
0    4861
1     249
Name: count, dtype: int64
Следует ли выполнить стратифицированное раздеение на обучающую и тестовую выборку? y/n


 y


Разделение датасета выполнено успешно


In [51]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
2226,Female,52.0,0,0,Yes,Govt_job,Rural,107.84,22.0,formerly smoked
3927,Female,62.0,0,0,Yes,Private,Urban,88.32,36.3,Unknown
3358,Male,81.0,0,1,No,Self-employed,Rural,95.49,29.4,Unknown
4152,Male,55.0,0,0,Yes,Self-employed,Rural,73.57,28.0,smokes
4866,Female,37.0,0,0,Yes,Private,Urban,103.66,36.1,smokes
...,...,...,...,...,...,...,...,...,...,...
1434,Female,45.0,0,0,Yes,Private,Urban,92.86,35.1,formerly smoked
461,Female,16.0,0,0,No,children,Rural,113.47,19.5,Unknown
1052,Female,61.0,0,0,Yes,Private,Rural,78.65,36.2,formerly smoked
152,Female,80.0,0,0,Yes,Self-employed,Urban,76.57,34.1,never smoked


In [52]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

def preprocess_and_resample(X_train, y_train, X_test, categorical_cols, random_state=42):

    X_train_copy = X_train.copy()
    X_test_copy = X_test.copy()
    

    label_encoders = {col: LabelEncoder() for col in categorical_cols if col in X_train_copy.columns}
    
    for col, le in label_encoders.items():
        X_train_copy[col] = le.fit_transform(X_train_copy[col])
        X_test_copy[col] = le.transform(X_test_copy[col])

    smote = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_copy, y_train)

    X_train_resampled_df = pd.DataFrame(X_train_resampled, columns=X_train_copy.columns)
    X_test_processed_df = X_test_copy.copy()

    print("Shape of resampled X_train:", X_train_resampled_df.shape)
    print("Shape of processed X_test:", X_test_processed_df.shape)

    return X_train_resampled_df, y_train_resampled, X_test_processed_df




In [70]:
from sklearn.utils.class_weight import compute_class_weight 
#is_unbalance=True

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

class_counts = y_train.value_counts()


positive_class_count = class_counts[1]  
negative_class_count = class_counts[0]  

scale_pos_weight = negative_class_count / positive_class_count

lgbm_model = lgb.LGBMClassifier(verbose=-1,class_weight=class_weights_dict,scale_pos_weight=scale_pos_weight)

#lgbm_model = lgb.LGBMClassifier(verbose=-1)


columns_to_delete = []  


model_train = BaseModelTrain(
    model=lgbm_model,
    columns_to_delete=columns_to_delete,
    categorical_features=categorical_features,
    boosting_type='gbdt',
    objective='binary',
    metric='binary_error'
)

In [72]:
metrics = ['accuracy', 'precision', 'recall', 'f1_macro']
evaluator = ModelEvaluator(model_trainer=model_train, metrics=metrics)

evaluator.fit(X_train, y_train)

In [74]:
evaluator.evaluate_to_dataframe(X_test, y_test)

Unnamed: 0,Metric,Score
0,accuracy,0.858447
1,precision,0.580296
2,recall,0.7169
3,f1,0.600294
