In [2]:
import numpy as np 
import pandas as pd 
import sklearn
import matplotlib.pyplot as plt

# Проводим базовый EDA при помощи ydata-profiling

In [5]:
df = pd.read_csv('cirrhosis.csv')

In [6]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Profiling Report")

In [7]:
import ipywidgets

profile.to_file("original.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'Y'')
  annotation = ("{:" + self.fmt + "}").format(val)
(using `df.profile_report(missing_diagrams={"Heatmap": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: '--'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
df.drop('ID', axis=1, inplace=True)


In [9]:
df.drop('Drug', axis=1, inplace=True)

In [10]:
df.dropna(inplace=True)

In [11]:
df['Sex'] = df['Sex'].replace({'F': 0, 'M': 1})

  df['Sex'] = df['Sex'].replace({'F': 0, 'M': 1})


In [12]:
df1 = df[df['Status'] != 'CL']

In [13]:
df_preprocessed = pd.get_dummies(df1, columns=['Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Status'], drop_first=True)

In [14]:
df_preprocessed['Age'] = df_preprocessed['Age'] / 365.25

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_features = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
df_preprocessed[num_features] = scaler.fit_transform(df_preprocessed[num_features])


In [17]:
df_preprocessed.to_excel('df_preprocessed.xlsx', index=False)

# Обучаем логистическую регрессию

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [19]:
# Определение признаков и целевой переменной
X = df_preprocessed.drop('Status_D', axis=1)
y = df_preprocessed['Status_D']

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Обучение модели логистической регрессии
model_logistic = LogisticRegression(max_iter=1000)
model_logistic.fit(X_train, y_train)

# Предсказания на тестовой выборке
y_pred_logistic = model_logistic.predict(X_test)

# Оценка модели
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
precision_logistic = precision_score(y_test, y_pred_logistic)
recall_logistic = recall_score(y_test, y_pred_logistic)
f1_logistic = f1_score(y_test, y_pred_logistic)
roc_auc_logistic = roc_auc_score(y_test, y_pred_logistic)

In [23]:
print("Logistic regression model results:")
print(f"Accuracy: {accuracy_logistic:.3f}")
print(f"Точность (Precision): {precision_logistic:.3f}")
print(f"Полнота (Recall): {recall_logistic:.3f}")
print(f"F1: {f1_logistic:.3f}")
print(f"Площадь под ROC-кривой (AUC-ROC): {roc_auc_logistic:.3f}\n")

Logistic regression model results:
Accuracy: 0.808
Точность (Precision): 0.941
Полнота (Recall): 0.640
F1: 0.762
Площадь под ROC-кривой (AUC-ROC): 0.801



# Обучаем RandomForest

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Обучение модели случайного леса
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Предсказания на тестовой выборке
y_pred_rf = rf_model.predict(X_test)

# Оценка модели случайного леса
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

print("Random Forest baseline model results:")
print(f"Accuracy: {accuracy_rf:.3f}")
print(f"Точность (Precision): {precision_rf:.3f}")
print(f"Полнота (Recall): {recall_rf:.3f}")
print(f"F1: {f1_rf:.3f}")
print(f"Площадь под ROC-кривой (AUC-ROC): {roc_auc_rf:.3f}\n")


Random Forest baseline model results:
Accuracy: 0.769
Точность (Precision): 0.882
Полнота (Recall): 0.600
F1: 0.714
Площадь под ROC-кривой (AUC-ROC): 0.763



In [25]:
from sklearn.model_selection import GridSearchCV

# Определение параметров для GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Создание модели случайного леса
rf = RandomForestClassifier(random_state=42)

# Поиск по сетке с кросс-валидацией
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')
grid_search.fit(X_train, y_train)

# Лучшие параметры и лучший результат
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Лучшие параметры:", best_params)
print("Лучший ROC AUC:", best_score)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Лучшие параметры: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Лучший ROC AUC: 0.8842775041050902


In [26]:
# Создание и обучение модели случайного леса с лучшими параметрами
rf_best = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=50,
    random_state=42
)
rf_best.fit(X_train, y_train)

# Предсказания на тестовой выборке
y_pred_rf_best = rf_best.predict(X_test)

# Оценка модели
accuracy_rf_best = accuracy_score(y_test, y_pred_rf_best)
precision_rf_best = precision_score(y_test, y_pred_rf_best)
recall_rf_best = recall_score(y_test, y_pred_rf_best)
f1_rf_best = f1_score(y_test, y_pred_rf_best)
roc_auc_rf_best = roc_auc_score(y_test, y_pred_rf_best)

print("Random Forest optimized model results:")
print(f"Accuracy: {accuracy_rf_best:.3f}")
print(f"Точность (Precision): {precision_rf_best:.3f}")
print(f"Полнота (Recall): {recall_rf_best:.3f}")
print(f"F1: {f1_rf_best:.3f}")
print(f"Площадь под ROC-кривой (AUC-ROC): {roc_auc_rf_best:.3f}\n")


Random Forest optimized model results:
Accuracy: 0.788
Точность (Precision): 0.889
Полнота (Recall): 0.640
F1: 0.744
Площадь под ROC-кривой (AUC-ROC): 0.783



# Обучаем XGBoost

In [28]:
import xgboost as xgb

# Создание и обучение модели XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Предсказания на тестовой выборке
y_pred_xgb = xgb_model.predict(X_test)

# Оценка модели
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_pred_xgb)

In [30]:
print("XGBoost baseline model results:")
print(f"Accuracy: {accuracy_xgb:.3f}")
print(f"Точность (Precision): {precision_xgb:.3f}")
print(f"Полнота (Recall): {recall_xgb:.3f}")
print(f"F1: {f1_xgb:.3f}")
print(f"Площадь под ROC-кривой (AUC-ROC): {roc_auc_xgb:.3f}\n")

XGBoost baseline model results:
Accuracy: 0.731
Точность (Precision): 0.824
Полнота (Recall): 0.560
F1: 0.667
Площадь под ROC-кривой (AUC-ROC): 0.724



In [31]:
import optuna

# Функция для оптимизации
def objective(trial):
    # Гиперпараметры для оптимизации
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'use_label_encoder': False,
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_float('eta', 1e-3, 0.1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 10.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    
    # Создание и обучение модели
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    
    # Предсказания на тестовой выборке
    preds = model.predict(X_test)
    
    # Расчет ROC AUC
    roc_auc = roc_auc_score(y_test, preds)
    
    return roc_auc

# Создание объекта исследования и выполнение оптимизации
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

# Вывод лучших гиперпараметров
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-02-21 23:45:39,495] A new study created in memory with name: no-name-f8a89739-9b5f-44bf-afbf-13fb0324458e
[I 2024-02-21 23:45:39,592] Trial 0 finished with value: 0.74 and parameters: {'lambda': 0.02347127751955169, 'alpha': 0.14917070517637426, 'max_depth': 4, 'eta': 0.003652024992959075, 'gamma': 0.0036082607442728333, 'grow_policy': 'depthwise', 'subsample': 0.8954905940958865, 'colsample_bytree': 0.7408987980970454}. Best is trial 0 with value: 0.74.
[I 2024-02-21 23:45:39,698] Trial 1 finished with value: 0.58 and parameters: {'lambda': 0.02041487762227747, 'alpha': 0.004617441764618447, 'max_depth': 9, 'eta': 0.0019266475772732928, 'gamma': 0.0038161788363382776, 'grow_policy': 'lossguide', 'subsample': 0.6926330048326123, 'colsample_bytree': 0.5288623132220391}. Best is trial 0 with value: 0.74.
[I 2024-02-21 23:45:39,783] Trial 2 finished with value: 0.5 and parameters: {'lambda': 2.69892262959657, 'alpha': 0.12411549259090658, 'max_depth': 4, 'eta': 0.00171278773421877

Number of finished trials: 300
Best trial: {'lambda': 0.002228273294833405, 'alpha': 1.0878074115671539, 'max_depth': 9, 'eta': 0.092341735217733, 'gamma': 0.4792235660479775, 'grow_policy': 'lossguide', 'subsample': 0.5015376109722444, 'colsample_bytree': 0.8298918585470988}


In [34]:
best_params = study.best_trial.params


In [36]:
# Создание и обучение модели XGBoost с оптимизированными параметрами
optimized_params = best_params

xgb_optimized = xgb.XGBClassifier(**optimized_params)
xgb_optimized.fit(X_train, y_train)

# Предсказания на тестовой выборке
y_pred_xgb_optimized = xgb_optimized.predict(X_test)

# Оценка модели
accuracy_xgb_optimized = accuracy_score(y_test, y_pred_xgb_optimized)
precision_xgb_optimized = precision_score(y_test, y_pred_xgb_optimized)
recall_xgb_optimized = recall_score(y_test, y_pred_xgb_optimized)
f1_xgb_optimized = f1_score(y_test, y_pred_xgb_optimized)
roc_auc_xgb_optimized = roc_auc_score(y_test, y_pred_xgb_optimized)

In [37]:
print("XGBoost optimized model results:")
print(f"Accuracy: {accuracy_xgb_optimized:.3f}")
print(f"Точность (Precision): {precision_xgb_optimized:.3f}")
print(f"Полнота (Recall): {recall_xgb_optimized:.3f}")
print(f"F1: {f1_xgb_optimized:.3f}")
print(f"Площадь под ROC-кривой (AUC-ROC): {roc_auc_xgb_optimized:.3f}\n")

XGBoost optimized model results:
Accuracy: 0.846
Точность (Precision): 0.947
Полнота (Recall): 0.720
F1: 0.818
Площадь под ROC-кривой (AUC-ROC): 0.841



In [38]:
from joblib import dump

# Сохранение моделей
dump(model_logistic, 'logistic_regression_model.joblib')
dump(rf_best, 'random_forest_best_model.joblib')
dump(xgb_optimized, 'xgboost_optimized_model.joblib')


['xgboost_optimized_model.joblib']