In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

y = data['stroke']
X = data.drop('stroke', axis=1)

data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
X = X.drop('id', axis=1, errors='ignore')

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

binary_num_cols = [
    col for col in numeric_cols 
    if set(X[col].dropna().unique()).issubset({0, 1})
]

for col in numeric_cols:
    if col in binary_num_cols:
        fill_value = X[col].mode()[0]  # Для бинарных [[1]]
    else:
        fill_value = X[col].mean()     # Для непрерывных [[4]]
    X[col] = X[col].fillna(fill_value)

In [4]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked
...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked


In [5]:
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Заполняем категориальные модами [[3]]
for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

# 2. Преобразование категориальных признаков
# Бинарные кодируем через LabelEncoder [[8]]
binary_cat_cols = ['gender', 'ever_married', 'Residence_type']
for col in binary_cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col])

In [6]:
ct = ColumnTransformer(
    [('encoder', OneHotEncoder(), ['work_type', 'smoking_status'])],
    remainder='passthrough'
)
X_processed = ct.fit_transform(X)

In [7]:
scaler = StandardScaler()
X_processed = scaler.fit_transform(X_processed)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=50)

In [9]:
processed_df = pd.DataFrame(X_processed)
processed_df['stroke'] = y.values
processed_df.to_csv('stroke_data.csv', index=False)

In [10]:
model = SGDClassifier(
    class_weight={0:1, 1:40},
    loss='log_loss',            # Логистическая регрессия через SGD [[7]][[10]]
    penalty='l1',               # L2 регуляризация [[9]]
    alpha=0.001,                # Сила регуляризации
    max_iter=100000,
    early_stopping=True,        # Ранняя остановка [[1]]
    n_iter_no_change=1000,
    random_state=42,
    learning_rate='optimal'
)
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.63      0.77       974
           1       0.10      0.85      0.18        48

    accuracy                           0.64      1022
   macro avg       0.55      0.74      0.48      1022
weighted avg       0.95      0.64      0.74      1022



In [12]:
from xgboost import XGBClassifier

In [28]:
xgb_model = XGBClassifier(
    scale_pos_weight=20,  # Auto-balance classes
    objective='binary:logistic',
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.7,
    max_depth=16
)

In [29]:
xgb_model.fit(X_train, y_train)

In [30]:
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       974
           1       0.29      0.21      0.24        48

    accuracy                           0.94      1022
   macro avg       0.63      0.59      0.61      1022
weighted avg       0.93      0.94      0.93      1022



In [None]:
import optuna
from sklearn.metrics import classification_report, f1_score

def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 5, 40),
        'n_estimators': trial.suggest_int('n_estimators', 3, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.00001, 0.003, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 200),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    return report['macro avg']['f1-score']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, show_progress_bar=True)

# Лучшие параметры
print("Best trial params:", study.best_params)

In [53]:
study.best_params

{'scale_pos_weight': 19,
 'n_estimators': 12,
 'learning_rate': 0.0023619271510423806,
 'max_depth': 173,
 'subsample': 0.8406889027799612,
 'colsample_bytree': 0.8561840387209951,
 'reg_alpha': 1.1014484338486523e-07,
 'reg_lambda': 2.6091892188256777e-05,
 'min_child_weight': 10}

In [54]:
# Финальная модель
best_xgb = XGBClassifier(**study.best_params, eval_metric='logloss')
best_xgb.fit(X_train, y_train)

In [55]:
y_pred = best_xgb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       974
           1       0.26      0.40      0.32        48

    accuracy                           0.92      1022
   macro avg       0.62      0.67      0.64      1022
weighted avg       0.94      0.92      0.93      1022

