<a href="https://colab.research.google.com/github/darsigov92/assistent/blob/main/%D0%9A%D0%BE%D0%BF%D0%B8%D1%8F_%D0%B1%D0%BB%D0%BE%D0%BA%D0%BD%D0%BE%D1%82%D0%B0_%22PerfAssist_Complete_Gradio_(1)_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the libraries this notebook depends on (notebook shell magic)
!pip install gspread oauth2client gradio scikit-learn pandas --quiet

In [None]:
# Импорты и подключение к Google Таблице
import gradio as gr
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from sklearn.linear_model import LogisticRegression
import numpy as np

# Access configuration: authorise a service account and open the worksheet.
# These statements run at import/cell-execution time and perform network I/O.
SHEET_NAME = "patient_database"  # spreadsheet title passed to client.open()
WORKSHEET_NAME = "Лист1"  # worksheet (tab) name inside that spreadsheet
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
# NOTE(review): "your-credentials.json" looks like a placeholder path — confirm
# the real key file is supplied before running.
creds = ServiceAccountCredentials.from_json_keyfile_name("your-credentials.json", scope)
client = gspread.authorize(creds)
# `sheet` is the module-level worksheet handle used by the functions below.
sheet = client.open(SHEET_NAME).worksheet(WORKSHEET_NAME)

# Получение и сохранение данных
def get_data():
    """Read the whole worksheet into a DataFrame.

    The first row returned by ``sheet.get_all_values()`` supplies the column
    names and every following row becomes a record.  When the call returns
    nothing at all, an empty DataFrame (no columns) is returned.
    """
    rows = sheet.get_all_values()
    if rows:
        header, *records = rows
        return pd.DataFrame(records, columns=header)
    return pd.DataFrame()

# Header row of the worksheet, read once when this cell runs; it is not
# refreshed if the sheet's header changes later.
sheet_columns = sheet.row_values(1)
def save_patient(data_dict):
    """Create or update one patient row in the Google Sheet.

    Args:
        data_dict: Mapping of sheet column header -> value for one patient.
            The patient ID is taken from the "ID(пациент)" key (the sheet's
            header) and, for backward compatibility, from "ID пациента" when
            the former is absent.

    Returns:
        A user-facing status string describing success or the error.

    When a row with the same ID already exists it is rewritten in place;
    columns that ``data_dict`` does not mention keep their current value.
    Otherwise a new row is appended.
    """
    id_column = "ID(пациент)"

    def _cell(value):
        # Empty form fields arrive as None; store a blank cell, not "None".
        return "" if value is None else str(value)

    raw_id = data_dict.get(id_column)
    if raw_id is None:
        raw_id = data_dict.get("ID пациента")
    patient_id = _cell(raw_id).strip()

    # Normalised record: the ID is always stored under the sheet's own header.
    record = {**data_dict, id_column: patient_id}

    df = get_data()
    match_positions = []
    if patient_id and id_column in df.columns:
        known_ids = df[id_column].astype(str).str.strip()
        match_positions = [
            pos for pos, known in enumerate(known_ids) if known == patient_id
        ]

    if match_positions:
        row_pos = match_positions[0]
        existing = [_cell(v) for v in df.iloc[row_pos].tolist()]
        values = []
        for col_pos, col in enumerate(sheet_columns):
            if col in record:
                values.append(_cell(record[col]))
            elif col_pos < len(existing):
                # Not supplied by the caller: keep what the sheet already has.
                values.append(existing[col_pos])
            else:
                values.append("")
        try:
            # Sheet rows are 1-indexed and row 1 is the header, hence +2.
            # Keyword arguments keep the call valid across gspread versions
            # that order the positional parameters differently.
            sheet.update(range_name=f"A{row_pos + 2}", values=[values])
            return f"Пациент с ID {patient_id} обновлён."
        except Exception as e:
            return f"Ошибка при обновлении пациента: {e}"

    values = [_cell(record.get(col)) for col in sheet_columns]
    try:
        sheet.append_row(values)
        return "Новый пациент сохранён."
    except Exception as e:
        return f"Ошибка при сохранении нового пациента: {e}"


# Training and prediction
# Module-level slot for the fitted estimator; None until train_model() succeeds.
model = None
# Sheet column headers used as model features (reduced feature list).
X_columns_sheet = [
    "возраст на момент операции",
    "ФВЛЖ",
    "ИМТ кг/м2",
    "креатинин ммоль/л",
    "гемоглобин г/л"
]

# Keep only those columns that actually exist in sheet_columns
X_columns_sheet = [col for col in X_columns_sheet if col in sheet_columns]

def train_model():
    """Fit the module-level logistic-regression model on the sheet data.

    Reads the worksheet through ``get_data()``, coerces the feature columns
    listed in ``X_columns_sheet`` and the "Исход" target to numeric, drops
    rows with a missing target or a missing feature, and fits
    ``LogisticRegression``.  On success the fitted estimator is stored in the
    global ``model``; if fitting raises, ``model`` is reset to ``None``.

    Returns:
        A user-facing status string (success summary or the reason training
        could not run).
    """
    global model
    df = get_data()

    y_column_sheet = "Исход"
    if y_column_sheet not in df.columns:
        return f"Ошибка: Колонка '{y_column_sheet}' не найдена в данных."

    # X_columns_sheet was filtered against the header read at start-up; the
    # sheet may have changed since, so re-check against the fresh data instead
    # of letting a missing column raise KeyError.
    feature_cols = [col for col in X_columns_sheet if col in df.columns]
    if not feature_cols:
        return "Ошибка: В данных не найдено ни одной колонки с признаками для обучения."

    # Sheet cells arrive as text; anything non-numeric becomes NaN.
    numeric = df[feature_cols + [y_column_sheet]].apply(pd.to_numeric, errors="coerce")

    labelled = numeric.dropna(subset=[y_column_sheet])
    if labelled.empty:
        return f"Недостаточно данных для обучения: целевая переменная '{y_column_sheet}' полностью отсутствует или содержит только пропущенные значения после очистки."

    # Only rows with every feature present are usable for fitting.
    complete = labelled.dropna()
    X = complete[feature_cols]
    y = complete[y_column_sheet]

    if len(X) < 10:
        return f"Недостаточно данных для обучения после очистки и обработки (осталось {len(X)} полных записей, требуется минимум 10)."

    if y.nunique() < 2:
        return f"Недостаточно данных для обучения: целевая переменная '{y_column_sheet}' содержит только одно уникальное значение после очистки."

    candidate = LogisticRegression(max_iter=1000)  # raised max_iter for convergence
    try:
        candidate.fit(X, y)
        # predict_outcome() reads this attribute to order its inputs.
        candidate.feature_names_in_ = X.columns.tolist()
    except Exception as e:
        model = None  # a failed fit must not leave a usable-looking model
        return f"Ошибка при обучении модели: {e}"

    model = candidate
    return f"Модель обучена на {len(X)} пациентах с использованием {len(model.feature_names_in_)} признаков."


# Prediction for a single patient using the reduced feature set
def predict_outcome(*args):
    """Return the predicted probability of an adverse outcome as a message.

    Args:
        *args: Five raw form values in this order: age, LV ejection fraction,
            BMI, creatinine, hemoglobin (the order of the inputs wired to the
            predict button).

    Returns:
        A user-facing string: either the probability of the positive class in
        percent, or an explanation of why no prediction could be made.
    """
    if model is None:
        return "Сначала обучите модель."

    try:
        trained_feature_names = list(model.feature_names_in_)
    except AttributeError:
        return "Ошибка: Модель не сохранила имена признаков, использованных при обучении."

    # UI label -> sheet column used for training.  The insertion order is the
    # positional order of *args.
    # NOTE(review): the UI labels creatinine in мкмоль/л while the training
    # column header says ммоль/л — confirm both use the same unit.
    ui_to_sheet_for_predict = {
        "Возраст": "возраст на момент операции",
        "ФВ ЛЖ (%)": "ФВЛЖ",
        "ИМТ (кг/м²)": "ИМТ кг/м2",
        "Креатинин (мкмоль/л)": "креатинин ммоль/л",
        "Гемоглобин (г/л)": "гемоглобин г/л"
    }

    if len(args) != len(ui_to_sheet_for_predict):
        return f"Ошибка прогнозирования: Ожидается {len(ui_to_sheet_for_predict)} входных значений, получено {len(args)}."

    # One-row frame keyed by sheet column; non-numeric or empty input -> NaN.
    input_data = pd.DataFrame([args], columns=list(ui_to_sheet_for_predict.values()))
    input_data = input_data.apply(pd.to_numeric, errors="coerce")

    # Put the columns in training order.  A trained feature that has no form
    # input becomes NaN here and is rejected by the check below.
    input_processed = input_data.reindex(columns=trained_feature_names)

    if input_processed.isnull().any().any():
        return "Ошибка: Введены некорректные или отсутствуют значения для всех необходимых признаков."

    try:
        x = input_processed.to_numpy(dtype=float)
        # Column 1 of predict_proba is the positive class.
        prob = model.predict_proba(x)[:, 1]
        return f"Вероятность неблагоприятного исхода: {round(prob[0] * 100, 2)}%"
    except ValueError:
        return "Ошибка: Введите числовые значения для прогноза."
    except Exception as e:
        return f"Произошла ошибка при прогнозировании: {e}. Убедитесь, что входные данные соответствуют признакам, на которых обучалась модель."

In [None]:
# Gradio interface with tabs and all input fields.
# `css` is the stylesheet string passed to gr.Blocks(css=...).
css = """
body {
    background: linear-gradient(to right, #e0f2f7, #bbdefb); /* Light blue gradient */
    color: #0d47a1; /* Dark blue text */
    font-family: 'Arial', sans-serif;
}
h1 {
    color: #0d47a1; /* Dark blue header */
    text-align: center;
}
.gradio-container {
    border: 1px solid #e0f0f9; /* Light border */
    box-shadow: 0 4px 8px rgba(0,0,0,0.1); /* Subtle shadow */
    border-radius: 8px;
    background-color: #ffffff; /* White background for container */
}
.gradio-tabs {
    background-color: #e3f2fd; /* Lighter blue for tabs */
}
.gradio-tab {
    color: #1565c0; /* Blue tab text */
}
button {
    background-color: #1e88e5 !important; /* Blue buttons */
    color: white !important;
    border: none !important;
    border-radius: 4px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    cursor: pointer !important;
    transition: background-color 0.3s ease !important;
}
button:hover {
    background-color: #1565c0 !important; /* Darker blue on hover */
}
label {
    font-weight: bold;
    color: #0d47a1; /* Dark blue labels */
}
input[type="text"], input[type="number"], textarea, select {
    border: 1px solid #bbdefb; /* Light blue border for inputs */
    border-radius: 4px;
    padding: 8px;
    box-sizing: border-box;
}
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Система поддержки принятия врачебных решений на основе машинного обучения")
    with gr.Tab("Общие данные"):
        # Named patient_id_input so the builtin `id` is not shadowed.
        patient_id_input = gr.Text(label="ID пациента")
        age = gr.Number(label="Возраст")
        sex = gr.Radio(["М", "Ж"], label="Пол")
        bmi = gr.Number(label="ИМТ (кг/м²)")
        emergency = gr.Radio(["Да", "Нет"], label="Экстренность", value="Нет")
        surgery_date = gr.Text(label="Дата операции")
        cpb_group = gr.Radio(["ИК", "Без ИК"], label="Группа")
        diabetes = gr.Radio(["Да", "Нет"], label="СД 2 типа", value="Нет")
        smoking = gr.Number(label="Курение (пачка/лет)")
        euroscore = gr.Number(label="EuroSCORE II (%)")

    with gr.Tab("Система кровообращения"):
        ef = gr.Number(label="ФВ ЛЖ (%)")
        af_type = gr.Radio(["ФП", "Пароксизмы", "Нет"], label="ФП / Пароксизмы")
        parox_count = gr.Number(label="Кол-во пароксизмов")
        angina_fc = gr.Radio(["I", "II", "III", "IV"], label="ФК ИБС")
        chf_stage = gr.Text(label="Стадия ХСН")
        chf_fc = gr.Radio(["1", "2", "3", "4"], label="ФК ХСН (NYHA)")
        bca_as = gr.Radio(["Да", "Нет"], label="Атеросклероз БЦА", value="Нет")
        leg_vessel_as = gr.Radio(["Да", "Нет"], label="Сосуды нижних конечностей", value="Нет")
        lv_aneurysm = gr.Radio(["Да", "Нет"], label="Аневризма ЛЖ", value="Нет")
    with gr.Tab("Лабораторные данные"):
        nt_pro_bnp = gr.Number(label="Натрий-уретический пептид (пг/мл)")
        syst_pa = gr.Text(label="СДЛА (мм рт. ст.)")
        edv = gr.Number(label="КДО (мл)")
        protein = gr.Number(label="Общий белок (г/л)")
        urea = gr.Number(label="Мочевина (ммоль/л)")
        creatinine = gr.Number(label="Креатинин (мкмоль/л)")
        ckd_stage = gr.Text(label="ХБП стадия")
        gfr = gr.Number(label="СКФ (мл/мин)")
        hemoglobin = gr.Number(label="Гемоглобин (г/л)")
    with gr.Tab("Прогноз и Сохранение"):
        output = gr.Textbox(label="Результат")
        save = gr.Button("💾 Сохранить пациента")
        train = gr.Button("🧠 Обучить модель")
        predict = gr.Button("📈 Прогноз")

    # Inputs for prediction, in the positional order predict_outcome expects:
    # age, LV ejection fraction, BMI, creatinine, hemoglobin.
    predict_inputs_order = [age, ef, bmi, creatinine, hemoglobin]

    # Explicit (sheet column header, component) pairs.  Pairing form values
    # with sheet_columns by position puts them in the wrong columns, because
    # the sheet also has columns with no form field (e.g. "дата рождения").
    # NOTE(review): headers below are the ones reported by df.info() /
    # value_counts() for this sheet; confirm spelling against the live header.
    # NOTE(review): the Да/Нет radios feed columns whose headers describe 1/0
    # coding, and the smoking label (пачка/лет) differs from the header
    # (сигарет в день) — confirm the intended encoding and units.
    save_fields = [
        ("ID(пациент)", patient_id_input),
        ("возраст на момент операции", age),
        ("ФВЛЖ", ef),
        ("Группа (с ИК , без ИК)", cpb_group),
        ("пол", sex),
        ("ИМТ кг/м2", bmi),
        ("Экстренноепплан=1, экстр.=0", emergency),
        ("дата операц", surgery_date),
        ("СД (Да=1/Нет=0)", diabetes),
        ("НРС Фп или пароксизмы", af_type),
        ("пикс сколько раз", parox_count),
        ("ИБС(ФК)", angina_fc),
        ("хсн стадия", chf_stage),
        # NOTE(review): the sheet also has "NYHA класс(1,2,3,4)"; confirm which
        # column this field belongs to.
        ("ХСН(ФК)", chf_fc),
        # NOTE(review): presumed target column for "Атеросклероз БЦА" — confirm.
        ("церебральный атеросклероз", bca_as),
        ("поражение сосоудов н/к", leg_vessel_as),
        ("аневризма ЛЖ", lv_aneurysm),
        ("НАТРИЙ-УРЕТИЧЕСКИЙ ПЕПТИД ПГ/МЛ;", nt_pro_bnp),
        ("СДЛАесли N-то 25, если больше то цифра", syst_pa),
        ("КДО", edv),
        ("общий белок", protein),
        ("мочевина", urea),
        ("креатинин ммоль/л", creatinine),
        ("ХБП(стадия по CKD-EPI)", ckd_stage),
        ("скф мл/мин", gfr),
        ("гемоглобин г/л", hemoglobin),
        ("курение(сигарет в день", smoking),
        ("EuroSCORE II(в %)", euroscore),
    ]
    save_columns = [column for column, _ in save_fields]

    save.click(
        lambda *vals: save_patient(dict(zip(save_columns, vals))),
        inputs=[component for _, component in save_fields],
        outputs=output,
    )
    train.click(fn=train_model, outputs=output)
    predict.click(fn=predict_outcome, inputs=predict_inputs_order, outputs=output)

demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f1095b6e3a145b075d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




После обучения модели вы можете использовать её для прогнозирования. Ниже приведён пример предсказания для нового пациента с заданными параметрами.

In [None]:
# Example: score one new patient with the trained model.
# Train the model first: call train_model() or press the "Обучить модель"
# button in the Gradio interface.

if model is not None:
    # Example values keyed by the sheet columns the model is trained on.
    # NOTE(review): values must use the same units as the training columns —
    # in particular confirm the creatinine unit.
    example_patient = {
        "возраст на момент операции": 65,
        "ФВЛЖ": 45,
        "ИМТ кг/м2": 28,
        "креатинин ммоль/л": 1.2,
        "гемоглобин г/л": 130,
    }

    # Build the row in exactly the feature order stored at training time, so
    # the vector length always matches the fitted model.
    feature_order = list(model.feature_names_in_)
    new_patient_data = np.array(
        [[example_patient[name] for name in feature_order]], dtype=float
    )

    # Probability of the positive class (adverse outcome)
    predicted_probability = model.predict_proba(new_patient_data)[0][1]

    print(f"Прогнозируемая вероятность неблагоприятного исхода для нового пациента: {round(predicted_probability * 100, 2)}%")
else:
    print("Модель еще не обучена. Пожалуйста, сначала обучите модель.")

Модель еще не обучена. Пожалуйста, сначала обучите модель.


# Task
Разработать интерфейс Gradio для системы поддержки принятия врачебных решений на основе машинного обучения с кардиологической тематикой. Интерфейс должен включать вкладки "Общие данные", "Система кровообращения", "Лабораторные данные" и "Анализ". Вкладка "Общие данные" должна содержать все поля ввода данных пациента, кроме "Дата рождения", "Хирург", "Примечания", "Исход" и "Мини доступ". Вкладка "Анализ" должна содержать кнопки "Сохранить пациента", "Обучить модель" и "Прогноз".

## Анализ данных

### Subtask:
Провести более глубокий анализ существующих данных, включая визуализацию распределения признаков и их взаимосвязи с целевой переменной ("Исход").


**Reasoning**:
Load the data using the provided function and then display basic information about the dataframe, including the columns, their types, and the number of non-null values to understand the data structure and identify missing values as requested in step 7.



In [None]:
# Load the worksheet and print column names, dtypes and non-null counts.
df = get_data()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 40 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID(пациент)                             15 non-null     object
 1   возраст на момент операции              15 non-null     object
 2   ФВЛЖ                                    15 non-null     object
 3   Группа (с ИК , без ИК)                  15 non-null     object
 4   дата рождения                           15 non-null     object
 5   пол                                     15 non-null     object
 6   ИМТ кг/м2                               15 non-null     object
 7   Экстренноепплан=1, экстр.=0             15 non-null     object
 8   дата операц                             15 non-null     object
 9   СД (Да=1/Нет=0)                         15 non-null     object
 10  НРС Фп или пароксизмы                   15 non-null     object
 11  пикс ско

**Reasoning**:
Based on the `df.info()` output, all columns are of type 'object', which indicates that the numerical data needs to be converted to a numerical type for statistical analysis and visualization. Also, some column names contain special characters or spaces and should be cleaned for easier access. Finally, check the unique values in the target variable 'Исход' to understand its distribution.



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Clean column names: collapse every run of non-word characters into "_".
# \W is Unicode-aware for str patterns, so Cyrillic letters are kept; an
# ASCII-only class such as [^A-Za-z0-9_] would erase the Russian headers.
df.columns = df.columns.str.strip().str.replace(r'\W+', '_', regex=True)

# Cleaned names of the columns expected to hold numbers
numeric_cols = [
    'возраст_на_момент_операции', 'ФВЛЖ', 'ИМТ_кг_м2',
    'СД_Да_1_Нет_0_', 'пикс_сколько_раз', 'НАТРИЙ_УРЕТИЧЕСКИЙ_ПЕПТИД_ПГ_МЛ_',
    'СДЛАесли_N_то_25_если_больше_то_цифра', 'КДО', 'общий_белок', 'мочевина',
    'креатинин_ммоль_л', 'скф_мл_мин', 'гемоглобин_г_л', 'курение_сигарет_в_день',
    'EuroSCORE_II_в_', 'Исход'
]

# Convert the columns that exist; report the rest instead of raising KeyError
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    else:
        print(f"Warning: Column '{col}' not found in DataFrame.")

present_numeric_cols = [col for col in numeric_cols if col in df.columns]

# Display basic statistics for numerical columns (Step 2)
if present_numeric_cols:
    display(df[present_numeric_cols].describe())
else:
    print("No numeric columns found after conversion.")

# Check the distribution of the target variable (part of step 7 and preparation for step 4)
if 'Исход' in df.columns:
    print("\nDistribution of 'Исход':")
    display(df['Исход'].value_counts(dropna=False))
else:
    print("\n'Исход' column not found.")

KeyError: 'возраст_на_момент_операции'

**Reasoning**:
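The previous command failed with a `KeyError` because none of the names in `numeric_cols` exist in the DataFrame any more: the cleaning pattern `[^A-Za-z0-9_]+` keeps only ASCII letters and digits, so every Cyrillic character in the headers was replaced with `_`. I need to re-clean the column names and then use the actual cleaned names to convert the columns to numeric types and perform the descriptive statistics.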



In [None]:
# Reload the sheet first: headers already run through an ASCII-only cleaning
# pattern cannot be recovered, so cleaning has to start from the originals.
df = get_data()

# Collapse runs of non-word characters into "_"; \W is Unicode-aware, so the
# Cyrillic letters in the headers survive.
df.columns = df.columns.str.strip().str.replace(r'\W+', '_', regex=True)

# Cleaned names of the columns expected to hold numbers
numeric_cols = [
    'возраст_на_момент_операции', 'ФВЛЖ', 'ИМТ_кг_м2',
    'СД_Да_1_Нет_0_', 'пикс_сколько_раз', 'НАТРИЙ_УРЕТИЧЕСКИЙ_ПЕПТИД_ПГ_МЛ_',
    'СДЛАесли_N_то_25_если_больше_то_цифра', 'КДО', 'общий_белок', 'мочевина',
    'креатинин_ммоль_л', 'скф_мл_мин', 'гемоглобин_г_л', 'курение_сигарет_в_день',
    'EuroSCORE_II_в_', 'Исход'
]

# Convert relevant columns to numeric, coercing errors
for col in numeric_cols:
    # Check if column exists before attempting conversion
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    else:
        print(f"Warning: Column '{col}' not found in DataFrame.")

# Only describe columns that are really there, so a missing name cannot raise
present_numeric_cols = [col for col in numeric_cols if col in df.columns]

# Display basic statistics for numerical columns (Step 2)
if present_numeric_cols:
    display(df[present_numeric_cols].describe())
else:
    print("No numeric columns found after conversion.")

# Check the distribution of the target variable (part of step 7 and preparation for step 4)
if 'Исход' in df.columns:
    print("\nDistribution of 'Исход':")
    display(df['Исход'].value_counts(dropna=False))
else:
    print("\n'Исход' column not found.")

# Identify columns with missing values (Step 7)
print("\nMissing values per column:")
display(df.isnull().sum())



KeyError: "None of [Index(['возраст_на_момент_операции', 'ФВЛЖ', 'ИМТ_кг_м2', 'СД_Да_1_Нет_0',\n       'пикс_сколько_раз', 'НАТРИЙ_УРЕТИЧЕСКИЙ_ПЕПТИД_ПГ_МЛ_',\n       'СДЛАесли_N_то_25_если_больше_то_цифра', 'КДО', 'общий_белок',\n       'мочевина', 'креатинин_ммоль_л', 'скф_мл_мин', 'гемоглобин_г_л',\n       'курение_сигарет_в_день', 'EuroSCORE_II_в_', 'Исход'],\n      dtype='object')] are in the [columns]"

**Reasoning**:
The column name cleaning was still not successful, leading to KeyErrors. I need to inspect the current column names of the DataFrame after the cleaning attempt to understand the actual column names and correct the `numeric_cols` list accordingly. Then, I will retry the numeric conversion and descriptive statistics.



In [None]:
# Show what the headers look like right now, after the cleaning attempts
print("Current column names after cleaning:")
print(df.columns.tolist())

# Names the numeric columns are expected to have after cleaning
corrected_numeric_cols = [
    'возраст_на_момент_операции', 'ФВЛЖ', 'ИМТ_кг_м2',
    'СД_Да_1_Нет_0', 'пикс_сколько_раз', 'НАТРИЙ_УРЕТИЧЕСКИЙ_ПЕПТИД_ПГ_МЛ_',
    'СДЛАесли_N_то_25_если_больше_то_цифра', 'КДО', 'общий_белок', 'мочевина',
    'креатинин_ммоль_л', 'скф_мл_мин', 'гемоглобин_г_л', 'курение_сигарет_в_день',
    'EuroSCORE_II_в_', 'Исход'
]

# Coerce each expected column to numeric; warn about the ones that are absent
for name in corrected_numeric_cols:
    if name not in df.columns:
        print(f"Warning: Column '{name}' not found in DataFrame after re-cleaning.")
        continue
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Keep only the expected columns that exist and ended up with a numeric dtype
actual_numeric_cols = []
for name in corrected_numeric_cols:
    if name in df.columns and pd.api.types.is_numeric_dtype(df[name]):
        actual_numeric_cols.append(name)

# Descriptive statistics for the numeric columns (Step 2)
if not actual_numeric_cols:
    print("No numeric columns found after conversion.")
else:
    display(df[actual_numeric_cols].describe())

# Distribution of the target variable (Step 7, preparation for Step 4)
if 'Исход' not in df.columns:
    print("\n'Исход' column not found.")
else:
    print("\nDistribution of 'Исход':")
    display(df['Исход'].value_counts(dropna=False))

# Missing values per column (Step 7)
print("\nMissing values per column:")
display(df.isnull().sum())

Current column names after cleaning:
['ID_', '_', '_', '_', '_', '_', '_2', '_1_0', '_', '_1_0_', '_', '_', '_', '_', '_', '_', '_', '_N_25_', '_', '_', '_', '_', '_', '_CKD_EPI_', '_', '_', 'NYHA_1_2_3_4_', '_', 'EuroSCORE_II_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']
No numeric columns found after conversion.

'Исход' column not found.

Missing values per column:


Unnamed: 0,0
ID_,0
_,0
_,0
_,0
_,0
_,0
_2,0
_1_0,0
_,0
_1_0_,0


**Reasoning**:
The previous attempts at cleaning column names and converting to numeric failed because the ASCII-only cleaning pattern replaced every Cyrillic character with `_`, so the original headers were lost and cannot be recovered from the cleaned DataFrame. The output shows the resulting generic column names like '_', '_2', etc. I need to reload the data, use the original column names from the `df.info()` output, and manually specify which ones should be treated as numeric or categorical based on their likely content, then perform the conversion and descriptive statistics. I will also identify categorical columns based on the provided information and prepare for visualizing their distributions.



In [None]:
# Start again from the raw sheet so the original headers are back in place
df = get_data()

# Original headers expected to hold numbers
original_numeric_cols = [
    'возраст на момент операции', 'ФВЛЖ', 'ИМТ кг/м2',
    'СД (Да=1/Нет=0)', 'пикс сколько раз', 'НАТРИЙ-УРЕТИЧЕСКИЙ ПЕПТИД ПГ/МЛ;',
    'СДЛАесли N-то 25, если больше то цифра', 'КДО', 'общий белок', 'мочевина',
    'креатинин ммоль/л', 'скф мл/мин', 'гемоглобин г/л', 'курение(сигарет в день',
    'EuroSCORE II(в %)', 'Исход'  # 'Исход' is assumed to be numeric (0/1)
]

# Original headers treated as categorical
original_categorical_cols = [
    'пол', 'Группа (с ИК , без ИК)', 'Экстренноепплан=1, экстр.=0',
    'НРС Фп или пароксизмы', 'ИБС(ФК)', 'хсн стадия', 'ХСН(ФК)',
    'церебральный атеросклероз', 'ХОБЛ(да/нет)', 'ХБП(стадия по CKD-EPI)',
    'NYHA класс(1,2,3,4)', 'поражение сосоудов н/к', 'аневризма ЛЖ',
    'мини доступ', 'Тип операции (Решение)', 'Тип кардиоплегии',
    'Искусственное кровообращение', 'Гипотермия', 'Планируется гемотрансфузия'
]

# Coerce every numeric candidate; report the ones the sheet does not have
for name in original_numeric_cols:
    if name not in df.columns:
        print(f"Warning: Column '{name}' not found in DataFrame for numeric conversion.")
        continue
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Numeric candidates that exist and now carry a numeric dtype
actual_numeric_cols = []
for name in original_numeric_cols:
    if name in df.columns and pd.api.types.is_numeric_dtype(df[name]):
        actual_numeric_cols.append(name)

# Descriptive statistics for the numeric columns (Step 2)
if not actual_numeric_cols:
    print("No numeric columns found after conversion.")
else:
    print("Descriptive statistics for numerical columns:")
    display(df[actual_numeric_cols].describe())

# Missing values per column (Step 7)
print("\nMissing values per column:")
display(df.isnull().sum())

# Distribution of the target variable (Step 7, preparation for Step 4)
if 'Исход' not in df.columns:
    print("\n'Исход' column not found.")
else:
    print("\nDistribution of 'Исход':")
    display(df['Исход'].value_counts(dropna=False))

# Categorical candidates that are present in the sheet
actual_categorical_cols = []
for name in original_categorical_cols:
    if name in df.columns:
        actual_categorical_cols.append(name)

# Value counts for each categorical column (part of Step 5)
print("\nValue counts for categorical columns:")
for name in actual_categorical_cols:
    print(f"\n{name}:")
    display(df[name].value_counts(dropna=False))

Descriptive statistics for numerical columns:


Unnamed: 0,возраст на момент операции,ФВЛЖ,ИМТ кг/м2,СД (Да=1/Нет=0),пикс сколько раз,НАТРИЙ-УРЕТИЧЕСКИЙ ПЕПТИД ПГ/МЛ;,"СДЛАесли N-то 25, если больше то цифра",КДО,общий белок,мочевина,креатинин ммоль/л,скф мл/мин,гемоглобин г/л,курение(сигарет в день,EuroSCORE II(в %),Исход
count,13.0,15.0,11.0,0.0,11.0,11.0,4.0,15.0,15.0,15.0,11.0,11.0,7.0,10.0,0.0,0.0
mean,23.384615,20.0,9.545455,,0.181818,10.909091,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,,
std,36.509219,29.277002,14.908814,,0.40452,24.271195,0.0,0.0,0.0,0.0,0.0,0.0,0.48795,0.0,,
min,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
25%,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
50%,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
75%,76.0,60.0,20.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,,
max,76.0,60.0,45.0,,1.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,



Missing values per column:


Unnamed: 0,0
ID(пациент),0
возраст на момент операции,2
ФВЛЖ,0
"Группа (с ИК , без ИК)",0
дата рождения,0
пол,0
ИМТ кг/м2,4
"Экстренноепплан=1, экстр.=0",0
дата операц,0
СД (Да=1/Нет=0),15



Distribution of 'Исход':


Unnamed: 0_level_0,count
Исход,Unnamed: 1_level_1
,15



Value counts for categorical columns:

пол:


Unnamed: 0_level_0,count
пол,Unnamed: 1_level_1
,7
М,4
0,4



Группа (с ИК , без ИК):


Unnamed: 0_level_0,count
"Группа (с ИК , без ИК)",Unnamed: 1_level_1
,11
ИК,4



Экстренноепплан=1, экстр.=0:


Unnamed: 0_level_0,count
"Экстренноепплан=1, экстр.=0",Unnamed: 1_level_1
,7
1.0,4
,4



НРС Фп или пароксизмы:


Unnamed: 0_level_0,count
НРС Фп или пароксизмы,Unnamed: 1_level_1
,9
0,4
ФП,2



ИБС(ФК):


Unnamed: 0_level_0,count
ИБС(ФК),Unnamed: 1_level_1
,9
,4
I,2



хсн стадия:


Unnamed: 0_level_0,count
хсн стадия,Unnamed: 1_level_1
,9
,4
2.0,2



ХСН(ФК):


Unnamed: 0_level_0,count
ХСН(ФК),Unnamed: 1_level_1
,13
1.0,2



церебральный атеросклероз:


Unnamed: 0_level_0,count
церебральный атеросклероз,Unnamed: 1_level_1
,9
0,4
Да,2



ХОБЛ(да/нет):


Unnamed: 0_level_0,count
ХОБЛ(да/нет),Unnamed: 1_level_1
,11
0.0,4



ХБП(стадия по CKD-EPI):


Unnamed: 0_level_0,count
ХБП(стадия по CKD-EPI),Unnamed: 1_level_1
0,15



NYHA класс(1,2,3,4):


Unnamed: 0_level_0,count
"NYHA класс(1,2,3,4)",Unnamed: 1_level_1
0,15



поражение сосоудов н/к:


Unnamed: 0_level_0,count
поражение сосоудов н/к,Unnamed: 1_level_1
,9
,4
Да,2



аневризма ЛЖ:


Unnamed: 0_level_0,count
аневризма ЛЖ,Unnamed: 1_level_1
,11
,4



мини доступ:


Unnamed: 0_level_0,count
мини доступ,Unnamed: 1_level_1
,10
,4
АКШ с ИК,1



Тип операции (Решение):


Unnamed: 0_level_0,count
Тип операции (Решение),Unnamed: 1_level_1
,15



Тип кардиоплегии:


Unnamed: 0_level_0,count
Тип кардиоплегии,Unnamed: 1_level_1
,15



Искусственное кровообращение:


Unnamed: 0_level_0,count
Искусственное кровообращение,Unnamed: 1_level_1
,15



Гипотермия:


Unnamed: 0_level_0,count
Гипотермия,Unnamed: 1_level_1
,15



Планируется гемотрансфузия:


Unnamed: 0_level_0,count
Планируется гемотрансфузия,Unnamed: 1_level_1
,15


## Подготовка данных

### Subtask:
Очистить данные от пропущенных значений и некорректных записей, преобразовать категориальные признаки (если есть) в числовой формат.


**Reasoning**:
Load the data using the provided function and define the columns to be converted to numeric types based on the previous analysis and the Gradio interface structure.



In [None]:
# Fresh copy of the sheet for the cleaning step
df = get_data()

# Gradio-side labels of the fields expected to be numeric
numeric_cols_for_cleaning = [
    "Возраст", "ФВ ЛЖ", "ИМТ", "СД", "Креатинин", "СКФ", "EuroSCORE II", "Исход",
    "Кол-во пароксизмов", "Курение (пачка/лет)", "Натрий-уретический пептид",
    "Мочевина", "Общий белок", "Гемоглобин", "СДЛА", "КДО"  # other potentially numeric Gradio fields
]

# Gradio label -> candidate sheet headers for that field.  The sheet headers
# differ from the UI labels, so each label lists the header(s) to try.
sheet_to_numeric_map = {
    "Возраст": ["возраст на момент операции"],
    "ФВ ЛЖ": ["ФВЛЖ"],
    "ИМТ": ["ИМТ кг/м2"],
    "СД": ["СД (Да=1/Нет=0)"],
    "Креатинин": ["креатинин ммоль/л"],
    "СКФ": ["скф мл/мин"],
    "EuroSCORE II": ["EuroSCORE II(в %)"],
    "Исход": ["Исход"],
    "Кол-во пароксизмов": ["пикс сколько раз"],
    "Курение (пачка/лет)": ["курение(сигарет в день"],
    "Натрий-уретический пептид": ["НАТРИЙ-УРЕТИЧЕСКИЙ ПЕПТИД ПГ/МЛ;"],
    "Мочевина": ["мочевина"],
    "Общий белок": ["общий белок"],
    "Гемоглобин": ["гемоглобин г/л"],
    "СДЛА": ["СДЛАесли N-то 25, если больше то цифра"],  # header is free-form
    "КДО": ["КДО"]
}

# Try each candidate header; convert the ones found and log the outcome
converted_numeric_cols = []
for label, candidates in sheet_to_numeric_map.items():
    for header in candidates:
        if header not in df.columns:
            print(f"Warning: Potential sheet column '{header}' (for Gradio label '{label}') not found.")
            continue
        df[header] = pd.to_numeric(df[header], errors='coerce')
        converted_numeric_cols.append(header)
        print(f"Attempted conversion for '{header}' (Gradio: '{label}')")

# Summary of dtypes and of the NaNs introduced by the conversion
df.info()
print("\nMissing values after initial numeric conversion:")
display(df[converted_numeric_cols].isnull().sum())

Attempted conversion for 'возраст на момент операции' (Gradio: 'Возраст')
Attempted conversion for 'ФВЛЖ' (Gradio: 'ФВ ЛЖ')
Attempted conversion for 'ИМТ кг/м2' (Gradio: 'ИМТ')
Attempted conversion for 'СД (Да=1/Нет=0)' (Gradio: 'СД')
Attempted conversion for 'креатинин ммоль/л' (Gradio: 'Креатинин')
Attempted conversion for 'скф мл/мин' (Gradio: 'СКФ')
Attempted conversion for 'EuroSCORE II(в %)' (Gradio: 'EuroSCORE II')
Attempted conversion for 'Исход' (Gradio: 'Исход')
Attempted conversion for 'пикс сколько раз' (Gradio: 'Кол-во пароксизмов')
Attempted conversion for 'курение(сигарет в день' (Gradio: 'Курение (пачка/лет)')
Attempted conversion for 'НАТРИЙ-УРЕТИЧЕСКИЙ ПЕПТИД ПГ/МЛ;' (Gradio: 'Натрий-уретический пептид')
Attempted conversion for 'мочевина' (Gradio: 'Мочевина')
Attempted conversion for 'общий белок' (Gradio: 'Общий белок')
Attempted conversion for 'гемоглобин г/л' (Gradio: 'Гемоглобин')
Attempted conversion for 'СДЛАесли N-то 25, если больше то цифра' (Gradio: 'СДЛА')


Unnamed: 0,0
возраст на момент операции,2
ФВЛЖ,0
ИМТ кг/м2,4
СД (Да=1/Нет=0),15
креатинин ммоль/л,4
скф мл/мин,4
EuroSCORE II(в %),15
Исход,15
пикс сколько раз,4
курение(сигарет в день,5


**Reasoning**:
Based on the previous output, the 'Исход', 'СД (Да=1/Нет=0)', and 'EuroSCORE II(в %)' columns are entirely missing after conversion. The instructions state to drop rows where the target variable 'Исход' is NaN. This will result in an empty DataFrame. I need to implement this step and handle the case of an empty DataFrame as per the instructions.



In [None]:
# Step 4: keep only the rows that carry a value in the target column "Исход".
initial_rows = len(df)
df_cleaned = df.dropna(subset=['Исход']).copy()

if not df_cleaned.empty:
    print(f"Dropped {initial_rows - len(df_cleaned)} rows with missing 'Исход'. Remaining rows: {len(df_cleaned)}")

    # Steps 5-9 (imputation, encoding, ...) would continue from here.
else:
    # Without a single labelled row there is nothing to prepare for a model.
    print("Dropping rows with missing 'Исход' resulted in an empty DataFrame.")
    print("Cannot proceed with further data preparation steps like imputation or one-hot encoding.")

# Summarise whatever is left after the filter.
df_cleaned.info()
print("\nMissing values in cleaned DataFrame (should be empty):")
display(df_cleaned.isna().sum())

Dropping rows with missing 'Исход' resulted in an empty DataFrame.
Cannot proceed with further data preparation steps like imputation or one-hot encoding.
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 40 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   ID(пациент)                             0 non-null      object 
 1   возраст на момент операции              0 non-null      float64
 2   ФВЛЖ                                    0 non-null      int64  
 3   Группа (с ИК , без ИК)                  0 non-null      object 
 4   дата рождения                           0 non-null      object 
 5   пол                                     0 non-null      object 
 6   ИМТ кг/м2                               0 non-null      float64
 7   Экстренноепплан=1, экстр.=0             0 non-null      object 
 8   дата операц                             0 non-null      object

Unnamed: 0,0
ID(пациент),0
возраст на момент операции,0
ФВЛЖ,0
"Группа (с ИК , без ИК)",0
дата рождения,0
пол,0
ИМТ кг/м2,0
"Экстренноепплан=1, экстр.=0",0
дата операц,0
СД (Да=1/Нет=0),0


## Выбор признаков

### Subtask:
Определить наиболее значимые признаки для обучения модели.


**Reasoning**:
Based on the previous analysis, the dataset has critical data quality issues, especially with the target variable 'Исход' being entirely missing. Data-driven feature selection is not possible. Therefore, I will define the list of features for the model based on the predefined list from the Gradio interface inputs as instructed.



In [None]:
# Model features, named by their Gradio interface labels.
features_for_model = [
    "Возраст",
    "ФВ ЛЖ",
    "ИМТ",
    "СД",
    "Креатинин",
    "СКФ",
    "EuroSCORE II",
]

print("Features selected for the model based on Gradio interface inputs:", features_for_model, sep="\n")

# Explain why the list is fixed by hand rather than derived from the data.
print(
    "\nNote: Feature selection is based purely on the predefined set from the Gradio interface due to data limitations,\n"
    "and not on an empirical analysis of feature importance from the provided dataset."
)

Features selected for the model based on Gradio interface inputs:
['Возраст', 'ФВ ЛЖ', 'ИМТ', 'СД', 'Креатинин', 'СКФ', 'EuroSCORE II']

Note: Feature selection is based purely on the predefined set from the Gradio interface due to data limitations,
and not on an empirical analysis of feature importance from the provided dataset.


## Выбор модели

### Subtask:
Рассмотреть другие модели машинного обучения помимо логистической регрессии, которые могут быть более подходящими для данной задачи.


**Reasoning**:
Review the problem type and consider alternative classification algorithms suitable for binary outcomes, then select one or two and justify the choice.



In [None]:
# 1. Problem type: the target "Исход" is a binary outcome.
print("Problem Type: Binary Classification (Predicting 'Исход' - Favorable/Unfavorable Outcome)")

# 2-3. Candidate classifiers with a one-line characterisation of each.
candidate_notes = [
    "- Support Vector Machines (SVM): Can work well in high-dimensional spaces and with limited data, but sensitive to feature scaling.",
    "- Random Forest: Ensemble method, generally robust, can handle non-linear relationships and feature interactions. Less prone to overfitting than decision trees.",
    "- Gradient Boosting (e.g., LightGBM, XGBoost): Powerful ensemble method, often provides high accuracy, but can be more complex and prone to overfitting if not tuned properly.",
    "- Naive Bayes: Simple, fast, works well with limited data and high dimensions, assumes independence of features (often violated in practice).",
    "- K-Nearest Neighbors (KNN): Simple, instance-based, sensitive to scaling and irrelevant features.",
]
print("\nConsidering alternative classification algorithms:")
print("\n".join(candidate_notes))

# 4. Shortlist: Random Forest first, Gradient Boosting as the second viable option.
selected_models = ["Random Forest", "Gradient Boosting (e.g., LightGBM)"]
print(f"\nSelected alternative models for consideration: {selected_models}")

# 5. One justification per shortlisted model, in the same order as selected_models.
justifications = [
    "Random Forest is an ensemble method that can capture non-linear relationships and interactions between features. It is generally robust to noise and outliers and less prone to overfitting than single decision trees. Its ensemble nature provides more stable predictions. In a medical context, while a black box compared to simpler models, it's often more interpretable than deep learning or complex boosting models.",
    "Gradient Boosting models like LightGBM are known for achieving high predictive accuracy on various tasks. They are powerful in capturing complex patterns in data. While potentially less interpretable than Random Forest, they are a strong candidate for maximizing predictive performance.",
]
print("\nJustification:")
for model_name, rationale in zip(selected_models, justifications):
    print(f"- {model_name}: {rationale}")

print("\nNote: The ultimate choice and performance comparison would require sufficient and clean data for empirical evaluation.")

Problem Type: Binary Classification (Predicting 'Исход' - Favorable/Unfavorable Outcome)

Considering alternative classification algorithms:
- Support Vector Machines (SVM): Can work well in high-dimensional spaces and with limited data, but sensitive to feature scaling.
- Random Forest: Ensemble method, generally robust, can handle non-linear relationships and feature interactions. Less prone to overfitting than decision trees.
- Gradient Boosting (e.g., LightGBM, XGBoost): Powerful ensemble method, often provides high accuracy, but can be more complex and prone to overfitting if not tuned properly.
- Naive Bayes: Simple, fast, works well with limited data and high dimensions, assumes independence of features (often violated in practice).
- K-Nearest Neighbors (KNN): Simple, instance-based, sensitive to scaling and irrelevant features.

Selected alternative models for consideration: ['Random Forest', 'Gradient Boosting (e.g., LightGBM)']

Justification:
- Random Forest: Random Forest 

## Обучение модели

### Subtask:
Обучить выбранную модель на подготовленных данных.


**Reasoning**:
The subtask is to train a RandomForestClassifier model. This requires loading the data, cleaning the necessary columns, dropping rows with missing values in the training columns and target, splitting into features and target, checking for sufficient data, importing the model, instantiating it, training it, and saving it globally. These steps can be grouped into a single code block.



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a RandomForestClassifier on the sheet data.
# Outputs (notebook globals): `model` (fitted estimator, or None when training
# is not possible) and `training_status` (user-facing status message).

# 1. Load the data
df = get_data()

# 2. Sheet headers of the model features and of the target column
X_columns_sheet = ["возраст на момент операции", "ФВЛЖ", "ИМТ кг/м2", "СД (Да=1/Нет=0)", "креатинин ммоль/л", "скф мл/мин", "EuroSCORE II(в %)"]
y_column_sheet = "Исход"
training_columns = X_columns_sheet + [y_column_sheet]

# 3. Convert the training columns to numeric; unparsable or empty cells become NaN
df_cleaned = df.copy()
for col in training_columns:
    if col in df_cleaned.columns:
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
        print(f"Attempted numeric conversion for '{col}'")
    else:
        print(f"Warning: Training column '{col}' not found in DataFrame.")

# Split the required headers into those present in the sheet and those absent.
actual_training_columns = [col for col in training_columns if col in df_cleaned.columns]
missing_training_columns = [col for col in training_columns if col not in df_cleaned.columns]

# 4. Drop rows with missing values in any of the available training columns (including target)
df_cleaned = df_cleaned.dropna(subset=actual_training_columns)

print(f"\nRows remaining after dropping NaNs in training columns: {len(df_cleaned)}")
df_cleaned.info()
print("\nMissing values in relevant columns after dropping NaNs:")
display(df_cleaned[actual_training_columns].isnull().sum())


# 5. Decide whether training is possible at all
if missing_training_columns:
    # Selecting X/y below would raise KeyError, so stop with an explicit message instead.
    training_status = f"В таблице отсутствуют столбцы, необходимые для обучения: {', '.join(missing_training_columns)}."
    print(training_status)
    model = None
elif len(df_cleaned) < 10:
    training_status = f"Недостаточно данных для обучения после очистки (осталось {len(df_cleaned)} полных записей, требуется минимум 10)."
    print(training_status)
    model = None # Ensure model is None if training fails
elif df_cleaned[y_column_sheet].nunique() < 2:
    # A model fitted on one class exposes a single predict_proba column, which
    # breaks the `[:, 1]` lookup used by the prediction step.
    training_status = "Для обучения нужны записи как минимум с двумя разными значениями исхода."
    print(training_status)
    model = None
else:
    # 6. Separate features (X) and target (y)
    X = df_cleaned[X_columns_sheet]
    y = df_cleaned[y_column_sheet]

    # 7. Create the classifier; fixed random_state keeps runs reproducible
    model = RandomForestClassifier(random_state=42)

    # 8. Train the model
    model.fit(X, y)

    # 9. Output success message
    training_status = f"Модель RandomForestClassifier обучена на {len(df_cleaned)} пациентах."
    print(training_status)


Attempted numeric conversion for 'возраст на момент операции'
Attempted numeric conversion for 'ФВЛЖ'
Attempted numeric conversion for 'ИМТ кг/м2'
Attempted numeric conversion for 'СД (Да=1/Нет=0)'
Attempted numeric conversion for 'креатинин ммоль/л'
Attempted numeric conversion for 'скф мл/мин'
Attempted numeric conversion for 'EuroSCORE II(в %)'
Attempted numeric conversion for 'Исход'

Rows remaining after dropping NaNs in training columns: 0
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 40 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   ID(пациент)                             0 non-null      object 
 1   возраст на момент операции              0 non-null      float64
 2   ФВЛЖ                                    0 non-null      int64  
 3   Группа (с ИК , без ИК)                  0 non-null      object 
 4   дата рождения                           0 n

Unnamed: 0,0
возраст на момент операции,0
ФВЛЖ,0
ИМТ кг/м2,0
СД (Да=1/Нет=0),0
креатинин ммоль/л,0
скф мл/мин,0
EuroSCORE II(в %),0
Исход,0


Недостаточно данных для обучения после очистки (осталось 0 полных записей, требуется минимум 10).


## Оценка модели

### Subtask:
Оценка модели


**Reasoning**:
Check if the model is trained and if so, perform cross-validation to evaluate its performance using relevant metrics.



In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings

# Evaluate the trained `model` with stratified cross-validation and print
# accuracy / precision / recall / F1 / ROC-AUC (mean and standard deviation).


# 1. Check if the model is trained
if model is None:
    print("Модель не была обучена из-за недостатка данных. Оценка невозможна.")
else:
    print("Модель обучена. Выполняется оценка с использованием кросс-валидации.")

    # 2. Rebuild X and y from the sheet so this cell does not depend on the
    # state left behind by the training cell.
    df = get_data()
    X_columns_sheet = ["возраст на момент операции", "ФВЛЖ", "ИМТ кг/м2", "СД (Да=1/Нет=0)", "креатинин ммоль/л", "скф мл/мин", "EuroSCORE II(в %)"]
    y_column_sheet = "Исход"
    training_columns = X_columns_sheet + [y_column_sheet]

    # dropna(subset=...) raises KeyError on an absent column, so check first.
    missing_columns = [col for col in training_columns if col not in df.columns]

    if missing_columns:
        print(f"В таблице отсутствуют столбцы, необходимые для оценки: {', '.join(missing_columns)}. Оценка невозможна.")
    else:
        df_cleaned = df.copy()
        for col in training_columns:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

        df_cleaned = df_cleaned.dropna(subset=training_columns)

        if len(df_cleaned) < 5:
            print(f"Недостаточно данных ({len(df_cleaned)} записей) для кросс-валидации после очистки. Оценка невозможна.")
        else:
            X = df_cleaned[X_columns_sheet]
            y = df_cleaned[y_column_sheet]

            # Stratified folds cannot outnumber the members of the smallest class,
            # and the binary metrics need both classes to be present.
            class_counts = y.value_counts()
            n_splits = min(5, int(class_counts.min())) if len(class_counts) >= 2 else 0

            if n_splits < 2:
                print(f"Недостаточно наблюдений каждого класса исхода для кросс-валидации (распределение классов: {class_counts.to_dict()}). Оценка невозможна.")
            else:
                # 3. Perform cross-validation, one pass per metric
                scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
                cv_results = {}

                print(f"\nВыполнение кросс-валидации ({n_splits} фолдов):")
                # Silence sklearn's small-sample UserWarnings for this evaluation only;
                # the filter is restored when the `with` block exits.
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
                    try:
                        for metric in scoring:
                            scores = cross_val_score(model, X, y, cv=n_splits, scoring=metric)
                            cv_results[metric] = scores
                            print(f"{metric.capitalize()} scores: {scores}")
                            print(f"Mean {metric.capitalize()}: {scores.mean():.4f} (+/- {scores.std():.4f})")
                        print("\nКросс-валидация завершена.")

                        # 4. Print evaluation results (mean and std)
                        print("\nИтоговые результаты кросс-валидации (среднее +/- стандартное отклонение):")
                        for metric, scores in cv_results.items():
                            print(f"{metric.capitalize()}: {scores.mean():.4f} (+/- {scores.std():.4f})")

                        # 5. Acknowledge limitations
                        print("\nОграничения оценки:")
                        print("- Размер датасета очень мал, что делает результаты кросс-валидации менее надежными.")
                        print("- Отсутствует отдельный тестовый набор данных для независимой оценки.")
                        print("- Дисбаланс классов в целевой переменной может влиять на метрики (особенно precision/recall/f1/roc_auc).")

                    except Exception as e:
                        # Notebook-level boundary: report the failure instead of aborting the cell.
                        print(f"\nПроизошла ошибка при выполнении кросс-валидации: {e}")
                        print("Проверьте данные и убедитесь, что они подходят для выбранных метрик и модели.")


Модель не была обучена из-за недостатка данных. Оценка невозможна.


## Настройка гиперпараметров

### Subtask:
Настроить гиперпараметры выбранной модели для улучшения ее производительности.


## Интерпретация результатов

### Subtask:
Проанализировать, какие признаки оказывают наибольшее влияние на прогноз модели.


**Reasoning**:
Check if the model is trained and if it's a RandomForestClassifier to get feature importances.



In [None]:
# Report how much each input feature contributes to the trained model.
if model is not None and hasattr(model, 'feature_importances_'):
    # One importance value per feature, as exposed by the fitted estimator.
    importances = model.feature_importances_

    # Feature headers in the order used when the model was trained.
    X_columns_sheet = ["возраст на момент операции", "ФВЛЖ", "ИМТ кг/м2", "СД (Да=1/Нет=0)", "креатинин ммоль/л", "скф мл/мин", "EuroSCORE II(в %)"]

    # Pair each header with its importance and rank from most to least important.
    feature_importance_dict = dict(zip(X_columns_sheet, importances))
    sorted_feature_importance = sorted(
        feature_importance_dict.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print("Важность признаков (согласно RandomForestClassifier):")
    for feature, importance in sorted_feature_importance:
        print(f"- {feature}: {importance:.4f}")

    # Short commentary on the top-ranked feature.
    print("\nКомментарий:")
    if not sorted_feature_importance:
        print("Не удалось определить важность признаков.")
    else:
        most_important_feature = sorted_feature_importance[0][0]
        print(f"Согласно текущей модели RandomForestClassifier, наиболее важным признаком для прогноза является '{most_important_feature}'.")
        print("Однако, следует учитывать, что модель была обучена на очень ограниченном наборе данных, что может сделать эти результаты нерепрезентативными или ненадежными.")
elif model is None:
    print("Модель не обучена. Невозможно интерпретировать важность признаков.")
else:
    # A model exists but does not expose importances.
    print(f"Модель типа {type(model).__name__} не поддерживает атрибут feature_importances_. Невозможно интерпретировать важность признаков.")

Модель не обучена. Невозможно интерпретировать важность признаков.


## Интеграция с интерфейсом

### Subtask:
Обновить функцию прогнозирования в интерфейсе Gradio, если была выбрана другая модель или изменены признаки.


**Reasoning**:
Update the `predict_outcome` function in the Gradio interface code to use the global `model` variable, handle potential `RandomForestClassifier` model, and adjust the probability prediction method.



In [None]:
# This cell is meant to replace the notebook's earlier Gradio interface cell
# (the comment in the original refers to it as cell_id hhU5PjBgFcQS).
# Copy the whole cell when applying it, not just the changed parts.

# --- Start of the updated Gradio interface code ---
# Gradio interface built from Blocks, with all input fields.
# `css` is the stylesheet passed to gr.Blocks(css=css) below; it sets a light-blue
# theme for the page background, headers, tabs, buttons, labels and inputs.
css = """
body {
    background: linear-gradient(to right, #e0f2f7, #bbdefb); /* Light blue gradient */
    color: #0d47a1; /* Dark blue text */
    font-family: 'Arial', sans-serif;
}
h1 {
    color: #0d47a1; /* Dark blue header */
    text-align: center;
}
.gradio-container {
    border: 1px solid #e0f0f9; /* Light border */
    box-shadow: 0 4px 8px rgba(0,0,0,0.1); /* Subtle shadow */
    border-radius: 8px;
    background-color: #ffffff; /* White background for container */
}
.gradio-tabs {
    background-color: #e3f2fd; /* Lighter blue for tabs */
}
.gradio-tab {
    color: #1565c0; /* Blue tab text */
}
button {
    background-color: #1e88e5 !important; /* Blue buttons */
    color: white !important;
    border: none !important;
    border-radius: 4px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    cursor: pointer !important;
    transition: background-color 0.3s ease !important;
}
button:hover {
    background-color: #1565c0 !important; /* Darker blue on hover */
}
label {
    font-weight: bold;
    color: #0d47a1; /* Dark blue labels */
}
input[type="text"], input[type="number"], textarea, select {
    border: 1px solid #bbdefb; /* Light blue border for inputs */
    border-radius: 4px;
    padding: 8px;
    box-sizing: border-box;
}
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Система поддержки принятия врачебных решений на основе машинного обучения")
    with gr.Tab("Общие данные"):
        id = gr.Text(label="ID пациента")
        age = gr.Number(label="Возраст")
        sex = gr.Radio(["М", "Ж"], label="Пол")
        bmi = gr.Number(label="ИМТ")
        emergency = gr.Radio([1, 0], label="Экстренность (1=план, 0=экстр)")
        surgery_date = gr.Text(label="Дата операции")
        cpb_group = gr.Radio(["ИК", "Без ИК"], label="Группа")
        diabetes = gr.Radio([1, 0], label="СД 2 типа")
        smoking = gr.Number(label="Курение (пачка/лет)")
        euroscore = gr.Number(label="EuroSCORE II")

    with gr.Tab("Система кровообращения"):
        ef = gr.Number(label="ФВ ЛЖ")
        af_type = gr.Radio(["ФП", "Пароксизмы", "Нет"], label="ФП / Пароксизмы")
        parox_count = gr.Number(label="Кол-во пароксизмов")
        angina_fc = gr.Radio(["I", "II", "III", "IV"], label="ФК ИБС")
        chf_stage = gr.Text(label="Стадия ХСН")
        chf_fc = gr.Radio(["1", "2", "3", "4"], label="ФК ХСН (NYHA)")
        cerebral_as = gr.Radio(["Да", "Нет"], label="Церебральный атеросклероз")
        bca_as = gr.Radio(["Да", "Нет"], label="Атеросклероз БЦА")
        leg_vessel_as = gr.Radio(["Да", "Нет"], label="Сосуды нижних конечностей")
        lv_aneurysm = gr.Radio(["Да", "Нет"], label="Аневризма ЛЖ")
    with gr.Tab("Лабораторные данные"):
        nt_pro_bnp = gr.Number(label="Натрий-уретический пептид")
        syst_pa = gr.Text(label="СДЛА")
        edv = gr.Number(label="КДО")
        protein = gr.Number(label="Общий белок")
        urea = gr.Number(label="Мочевина")
        creatinine = gr.Number(label="Креатинин")
        ckd_stage = gr.Text(label="ХБП стадия")
        gfr = gr.Number(label="СКФ")
        hemoglobin = gr.Number(label="Гемоглобин")
    with gr.Tab("Анализ"):
        output = gr.Textbox(label="Результат")
        save = gr.Button("💾 Сохранить пациента")
        train = gr.Button("🧠 Обучить модель")
        predict = gr.Button("📈 Прогноз")

    # Define the order of features expected by the model
    # This should match X_columns_sheet from the training step
    X_columns_sheet = ["возраст на момент операции", "ФВЛЖ", "ИМТ кг/м2", "СД (Да=1/Нет=0)", "креатинин ммоль/л", "скф мл/мин", "EuroSCORE II(в %)"]
    # Map Gradio inputs to the order expected by the model
    # Ensure this mapping is correct and includes all inputs needed for the model
    # The order here must match X_columns_sheet
    model_inputs_order = [age, ef, bmi, diabetes, creatinine, gfr, euroscore]


    # Update the predict_outcome function
    def predict_outcome(age, ef, bmi, diabetes, creatinine, gfr, euroscore):
        """Return a user-facing message with the predicted probability of class 1.

        The arguments are the raw values of the Gradio inputs, given in the same
        order as the model's training features (X_columns_sheet): age, LV ejection
        fraction, BMI, diabetes flag, creatinine, GFR, EuroSCORE II.

        Returns a string in every case: the formatted probability, a prompt to
        train the model first, or an error message. Nothing is raised to Gradio.
        """
        global model
        if model is None:
            return "Сначала обучите модель."

        raw_values = (age, ef, bmi, diabetes, creatinine, gfr, euroscore)
        try:
            # Single-row feature matrix in training-column order.
            x = np.array([[float(value) for value in raw_values]])
        except (TypeError, ValueError):
            # A field left blank arrives as None -> float(None) raises TypeError;
            # non-numeric text raises ValueError. Both mean "bad user input".
            return "Ошибка: Введите числовые значения для прогноза."

        try:
            # Column 1 holds the probability of the second class in model.classes_.
            prob = model.predict_proba(x)[:, 1]
        except Exception as e:
            # UI boundary: surface model-side failures as text instead of crashing.
            return f"Произошла ошибка при прогнозировании: {e}"

        if prob.shape[0] > 0:
            return f"Вероятность неблагоприятного исхода: {round(prob[0] * 100, 2)}%"
        return "Не удалось получить предсказание вероятности."


    inputs = model_inputs_order # Use the defined order for inputs to the predict function

    # --- Saving: map UI components to Google Sheet columns ---
    # The previous draft opened a `save.click(lambda *vals: ...` call here, never
    # closed it, and placed the assignments below inside its argument list, which
    # made the whole cell a SyntaxError. That draft also zipped the UI values onto
    # sheet_columns positionally although the two orders differ. The definitions
    # are now plain statements; save_patient_from_ui (below) does the mapping by
    # name and is the handler wired to the save button.

    # UI component -> header of the sheet column it fills.
    # The header strings must match the first row of the Google Sheet exactly.
    ui_to_sheet_mapping = {
        id: 'ID(пациент)',
        age: 'возраст на момент операции',
        ef: 'ФВЛЖ',
        sex: 'пол',
        bmi: 'ИМТ кг/м2',
        emergency: 'Экстренноепплан=1, экстр.=0',
        surgery_date: 'дата операц',
        cpb_group: 'Группа (с ИК , без ИК)',
        diabetes: 'СД (Да=1/Нет=0)',
        af_type: 'НРС Фп или пароксизмы',
        parox_count: 'пикс сколько раз',
        angina_fc: 'ИБС(ФК)',
        chf_stage: 'хсн стадия',
        chf_fc: 'ХСН(ФК)',
        cerebral_as: 'церебральный атеросклероз',
        nt_pro_bnp: 'НАТРИЙ-УРЕТИЧЕСКИЙ ПЕПТИД ПГ/МЛ;',
        syst_pa: 'СДЛАесли N-то 25, если больше то цифра',
        edv: 'КДО',
        protein: 'общий белок',
        urea: 'мочевина',
        creatinine: 'креатинин ммоль/л',
        ckd_stage: 'ХБП(стадия по CKD-EPI)',
        gfr: 'скф мл/мин',
        hemoglobin: 'гемоглобин г/л',
        smoking: 'курение(сигарет в день',
        euroscore: 'EuroSCORE II(в %)',
        leg_vessel_as: 'поражение сосоудов н/к',
        lv_aneurysm: 'аневризма ЛЖ',
        # Sheet columns with no UI field ('дата рождения', 'ХОБЛ(да/нет)',
        # 'NYHA класс(1,2,3,4)', 'Хирург', 'Примечания', 'мини доступ', 'Исход',
        # 'Тип операции (Решение)', 'Тип кардиоплегии', 'Искусственное кровообращение',
        # 'Гипотермия', 'Планируется гемотрансфузия') are left out on purpose;
        # save_patient fills missing keys with an empty string.
    }

    # Every input component declared in the tabs above.
    all_ui_inputs = [id, age, sex, bmi, emergency, surgery_date, cpb_group, diabetes, smoking, euroscore,
                     ef, af_type, parox_count, angina_fc, chf_stage, chf_fc, cerebral_as, bca_as, leg_vessel_as, lv_aneurysm,
                     nt_pro_bnp, syst_pa, edv, protein, urea, creatinine, ckd_stage, gfr, hemoglobin]

    # Components whose values are sent to the save handler, in call order.
    # bca_as is excluded because it has no entry in ui_to_sheet_mapping.
    all_ui_inputs_for_save = [id, age, sex, bmi, emergency, surgery_date, cpb_group, diabetes, smoking, euroscore,
                              ef, af_type, parox_count, angina_fc, chf_stage, chf_fc, cerebral_as, leg_vessel_as, lv_aneurysm,
                              nt_pro_bnp, syst_pa, edv, protein, urea, creatinine, ckd_stage, gfr, hemoglobin]

    def save_patient_from_ui(*ui_values):
        """Turn positional UI values into a sheet-keyed record and save it.

        ``ui_values`` are paired position-for-position with
        ``all_ui_inputs_for_save``. A sheet column is included in the record
        only when ``ui_to_sheet_mapping`` maps a component to it and that
        component's value was passed in; all other columns are left out.
        Returns whatever ``save_patient`` returns.
        """
        value_by_component = dict(zip(all_ui_inputs_for_save, ui_values))

        # Reverse lookup: sheet column -> first component mapped to it.
        component_for_column = {}
        for component, column in ui_to_sheet_mapping.items():
            component_for_column.setdefault(column, component)

        # Walk the sheet header so the record follows the sheet's column order.
        record = {}
        for column in sheet_columns:
            component = component_for_column.get(column)
            if component in value_by_component:
                record[column] = value_by_component[component]

        return save_patient(record)


    # Wire the three buttons to their handlers; every handler's return value is
    # shown in the shared `output` textbox.
    save.click(fn=save_patient_from_ui,
               inputs=all_ui_inputs_for_save, # Values arrive positionally, in this list's order
               outputs=output)

    # Training takes no UI inputs.
    train.click(fn=train_model, outputs=output)
    # The component order here must match predict_outcome's parameter order.
    predict.click(fn=predict_outcome, inputs=model_inputs_order, outputs=output) # Use the correct input order for prediction

# Start the Gradio app for the `demo` Blocks object.
demo.launch()

# --- End of the updated Gradio interface code ---

SyntaxError: '(' was never closed (ipython-input-25-538939924.py, line 136)

**Reasoning**:
The previous command failed with a `SyntaxError` in the `save.click` definition within the Gradio interface code. The error message "'(' was never closed" indicates an unbalanced parenthesis, most likely in the lambda function or its arguments. I need to fix the syntax of the `save.click` call, ensuring that the handler function is correctly defined and the inputs are passed properly. I will regenerate the entire Gradio interface code block with the corrected `save.click` call, while keeping the updates to the `predict_outcome` function planned in the previous step.



In [None]:
# Find the existing Gradio interface definition code cell (based on cell_id: hhU5PjBgFcQS)
# and replace it with the following corrected code.
# Ensure you copy the entire content of that cell and apply the changes below.

# --- Start of the corrected Gradio interface code ---
# Gradio interface built from Blocks, with all input fields.
# `css` is the custom stylesheet handed to gr.Blocks(css=css) below; the text
# inside the string is sent to the browser as-is.
css = """
body {
    background: linear-gradient(to right, #e0f2f7, #bbdefb); /* Light blue gradient */
    color: #0d47a1; /* Dark blue text */
    font-family: 'Arial', sans-serif;
}
h1 {
    color: #0d47a1; /* Dark blue header */
    text-align: center;
}
.gradio-container {
    border: 1px solid #e0f0f9; /* Light border */
    box-shadow: 0 4px 8px rgba(0,0,0,0.1); /* Subtle shadow */
    border-radius: 8px;
    background-color: #ffffff; /* White background for container */
}
.gradio-tabs {
    background-color: #e3f2fd; /* Lighter blue for tabs */
}
.gradio-tab {
    color: #1565c0; /* Blue tab text */
}
button {
    background-color: #1e88e5 !important; /* Blue buttons */
    color: white !important;
    border: none !important;
    border-radius: 4px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    cursor: pointer !important;
    transition: background-color 0.3s ease !important;
}
button:hover {
    background-color: #1565c0 !important; /* Darker blue on hover */
}
label {
    font-weight: bold;
    color: #0d47a1; /* Dark blue labels */
}
input[type="text"], input[type="number"], textarea, select {
    border: 1px solid #bbdefb; /* Light blue border for inputs */
    border-radius: 4px;
    padding: 8px;
    box-sizing: border-box;
}
"""
# Build the UI: a title plus four tabs. The component variables created here
# are referenced further down (feature order, sheet mapping, button wiring),
# so their names must stay in sync with that code.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Система поддержки принятия врачебных решений на основе машинного обучения")
    # Tab "General data".
    with gr.Tab("Общие данные"):
        # NOTE(review): `id` rebinds the built-in id() for the rest of the
        # notebook; renaming it would require updating every later reference.
        id = gr.Text(label="ID пациента")
        age = gr.Number(label="Возраст")
        sex = gr.Radio(["М", "Ж"], label="Пол")
        bmi = gr.Number(label="ИМТ")
        # Per the label: 1 = planned, 0 = emergency.
        emergency = gr.Radio([1, 0], label="Экстренность (1=план, 0=экстр)")
        surgery_date = gr.Text(label="Дата операции")
        cpb_group = gr.Radio(["ИК", "Без ИК"], label="Группа")
        diabetes = gr.Radio([1, 0], label="СД 2 типа")
        # NOTE(review): the label says pack-years, but this field is mapped to
        # the sheet column 'курение(сигарет в день' (cigarettes per day) —
        # confirm which unit is intended.
        smoking = gr.Number(label="Курение (пачка/лет)")
        euroscore = gr.Number(label="EuroSCORE II")

    # Tab "Circulatory system".
    with gr.Tab("Система кровообращения"):
        ef = gr.Number(label="ФВ ЛЖ")
        af_type = gr.Radio(["ФП", "Пароксизмы", "Нет"], label="ФП / Пароксизмы")
        parox_count = gr.Number(label="Кол-во пароксизмов")
        angina_fc = gr.Radio(["I", "II", "III", "IV"], label="ФК ИБС")
        chf_stage = gr.Text(label="Стадия ХСН")
        chf_fc = gr.Radio(["1", "2", "3", "4"], label="ФК ХСН (NYHA)")
        cerebral_as = gr.Radio(["Да", "Нет"], label="Церебральный атеросклероз")
        # NOTE(review): bca_as is shown in the UI but appears in neither
        # ui_to_sheet_mapping nor all_ui_inputs_for_save below, so its value
        # is never saved.
        bca_as = gr.Radio(["Да", "Нет"], label="Атеросклероз БЦА")
        leg_vessel_as = gr.Radio(["Да", "Нет"], label="Сосуды нижних конечностей")
        lv_aneurysm = gr.Radio(["Да", "Нет"], label="Аневризма ЛЖ")
    # Tab "Laboratory data".
    with gr.Tab("Лабораторные данные"):
        nt_pro_bnp = gr.Number(label="Натрий-уретический пептид")
        syst_pa = gr.Text(label="СДЛА")
        edv = gr.Number(label="КДО")
        protein = gr.Number(label="Общий белок")
        urea = gr.Number(label="Мочевина")
        creatinine = gr.Number(label="Креатинин")
        ckd_stage = gr.Text(label="ХБП стадия")
        gfr = gr.Number(label="СКФ")
        hemoglobin = gr.Number(label="Гемоглобин")
    # Tab "Analysis": one shared result box plus the save / train / predict buttons.
    with gr.Tab("Анализ"):
        output = gr.Textbox(label="Результат")
        save = gr.Button("💾 Сохранить пациента")
        train = gr.Button("🧠 Обучить модель")
        predict = gr.Button("📈 Прогноз")

    # Sheet column names of the model features, in the order the model expects.
    # NOTE(review): this must stay identical to the X_columns_sheet used by the
    # training step, which is not visible in this cell — confirm.
    X_columns_sheet = ["возраст на момент операции", "ФВЛЖ", "ИМТ кг/м2", "СД (Да=1/Нет=0)", "креатинин ммоль/л", "скф мл/мин", "EuroSCORE II(в %)"]
    # Gradio components listed position-for-position against X_columns_sheet.
    # This list is passed as `inputs` to predict.click, so it also fixes the
    # order of predict_outcome's positional arguments.
    model_inputs_order = [age, ef, bmi, diabetes, creatinine, gfr, euroscore]


    # Update the predict_outcome function
    def predict_outcome(age, ef, bmi, diabetes, creatinine, gfr, euroscore):
        """Return a user-facing message with the predicted adverse-outcome probability.

        The seven arguments are the model features in training order:
        age, LV ejection fraction, BMI, diabetes flag, creatinine, GFR and
        EuroSCORE II. Each one must be convertible with ``float()``.

        Returns:
            A Russian-language string: the probability of the positive class
            as a percentage, or an explanatory message when the model has not
            been trained, an input is missing or non-numeric, or prediction fails.
        """
        global model
        if model is None:
            return "Сначала обучите модель."

        # Convert the inputs separately from the prediction call. A missing
        # value (None — presumably what a blank Number field delivers) makes
        # float() raise TypeError, not ValueError, so both must be caught to
        # show the "enter numeric values" hint instead of a raw Python error.
        try:
            features = [
                float(value)
                for value in (age, ef, bmi, diabetes, creatinine, gfr, euroscore)
            ]
        except (TypeError, ValueError):
            return "Ошибка: Введите числовые значения для прогноза."

        try:
            # One sample with seven features, in the order listed above.
            x = np.array([features])

            # Column 1 of predict_proba is the positive-class probability.
            prob = model.predict_proba(x)[:, 1]

            if prob.shape[0] > 0:
                return f"Вероятность неблагоприятного исхода: {round(prob[0] * 100, 2)}%"
            return "Не удалось получить предсказание вероятности."

        except ValueError:
            # Kept from the original: a ValueError raised while predicting is
            # reported with the same input hint.
            return "Ошибка: Введите числовые значения для прогноза."
        except Exception as e:
            # UI boundary: surface the failure as text rather than crash the handler.
            return f"Произошла ошибка при прогнозировании: {e}"


    # Alias of the prediction input list.
    # NOTE(review): `inputs` appears unused in this cell — predict.click below
    # passes model_inputs_order directly.
    inputs = model_inputs_order # Use the defined order for inputs to the predict function

    # Maps each Gradio component object (the dict key is the component itself,
    # not its variable name) to the header of the sheet column it is saved in.
    # The header strings must match the spreadsheet's first row character for
    # character, including their existing spelling and punctuation, so do not
    # "tidy" them here.
    ui_to_sheet_mapping = {
        id: 'ID(пациент)',
        age: 'возраст на момент операции',
        ef: 'ФВЛЖ',
        sex: 'пол',
        bmi: 'ИМТ кг/м2',
        emergency: 'Экстренноепплан=1, экстр.=0',
        surgery_date: 'дата операц',
        cpb_group: 'Группа (с ИК , без ИК)',
        diabetes: 'СД (Да=1/Нет=0)',
        af_type: 'НРС Фп или пароксизмы',
        parox_count: 'пикс сколько раз',
        angina_fc: 'ИБС(ФК)',
        chf_stage: 'хсн стадия',
        chf_fc: 'ХСН(ФК)', # Note: UI has chf_fc, sheet has XCH(ФК)
        cerebral_as: 'церебральный атеросклероз',
        nt_pro_bnp: 'НАТРИЙ-УРЕТИЧЕСКИЙ ПЕПТИД ПГ/МЛ;',
        syst_pa: 'СДЛАесли N-то 25, если больше то цифра',
        edv: 'КДО',
        protein: 'общий белок',
        urea: 'мочевина',
        creatinine: 'креатинин ммоль/л',
        ckd_stage: 'ХБП(стадия по CKD-EPI)', # Note: UI has ckd_stage, sheet has XBP(стадия по CKD-EPI)
        gfr: 'скф мл/мин',
        hemoglobin: 'гемоглобин г/л',
        # NYHA class is chf_fc in UI, mapped above
        smoking: 'курение(сигарет в день',
        euroscore: 'EuroSCORE II(в %)',
        leg_vessel_as: 'поражение сосоудов н/к',
        lv_aneurysm: 'аневризма ЛЖ'
        # Other sheet columns ('дата рождения', 'ХОБЛ(да/нет)', 'NYHA класс(1,2,3,4)',
        # 'Хирург', 'Примечания', 'мини доступ', 'Исход', 'Тип операции (Решение)',
        # 'Тип кардиоплегии', 'Искусственное кровообращение', 'Гипотермия',
        # 'Планируется гемотрансфузия') have no UI field; save_patient has to cope
        # with them being absent or empty.
    }


    # Components whose values are sent to save_patient_from_ui, in the order
    # the values arrive as positional arguments. bca_as is deliberately not
    # listed because it has no entry in ui_to_sheet_mapping.
    all_ui_inputs_for_save = [id, age, sex, bmi, emergency, surgery_date, cpb_group, diabetes, smoking, euroscore,
                             ef, af_type, parox_count, angina_fc, chf_stage, chf_fc, cerebral_as, leg_vessel_as, lv_aneurysm,
                             nt_pro_bnp, syst_pa, edv, protein, urea, creatinine, ckd_stage, gfr, hemoglobin]


    def save_patient_from_ui(*ui_values):
        """Build a sheet-keyed record from the UI values and pass it to save_patient.

        ``ui_values`` are paired position-for-position with
        ``all_ui_inputs_for_save``. The record contains every header from
        ``sheet_columns``; columns without a UI field, and fields left blank,
        hold an empty string. Returns whatever ``save_patient`` returns.
        """
        ui_values_dict = dict(zip(all_ui_inputs_for_save, ui_values))

        # Default every column to "" rather than None. The save_patient defined
        # at the top of the notebook builds each row with
        # str(data_dict.get(col, "")), so a None placeholder would be written
        # to the sheet as the literal text "None".
        data_dict = dict.fromkeys(sheet_columns, "")

        # Fill in the columns that have a UI component. A blank field arrives
        # as None and is normalised to "" for the same reason; 0 and other
        # falsy-but-real values are kept.
        for ui_component, sheet_name in ui_to_sheet_mapping.items():
            if ui_component in ui_values_dict:
                value = ui_values_dict[ui_component]
                data_dict[sheet_name] = "" if value is None else value

        # NOTE(review): the save_patient at the top of the notebook reads the
        # patient ID from the key "ID пациента", while this record stores it
        # under the sheet header 'ID(пациент)'. If that version is still in
        # use, the update-existing-row branch is never reached — confirm and
        # align the key in save_patient.
        return save_patient(data_dict)


    # Wire the three buttons to their handlers; every handler's return value is
    # shown in the shared `output` textbox.
    save.click(
        fn=save_patient_from_ui,
        inputs=all_ui_inputs_for_save, # Values arrive positionally, in this list's order
        outputs=output
    )

    # Training takes no UI inputs.
    train.click(fn=train_model, outputs=output)
    # The component order here must match predict_outcome's parameter order.
    predict.click(fn=predict_outcome, inputs=model_inputs_order, outputs=output) # Use the correct input order for prediction

# Start the app. The cell output below shows that in Colab Gradio turns on
# share=True by itself and recommends launch(debug=True) to see handler errors.
demo.launch()

# --- End of the corrected Gradio interface code ---

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fa79de62d3b01994bf.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## Тестирование

### Subtask:
Провести всестороннее тестирование модели с новыми данными.


**Reasoning**:
Test the trained model with new data sets, which is the main part of the current subtask. Prepare a few different data points and use the `predict_outcome` function to get predictions.



In [None]:
# Ensure the model is trained (or at least check if it exists)
# In the previous step, the model training failed due to data,
# but this step is about testing the predict function given the *expectation*
# that train_model *could* succeed with better data in the future.
# For the purpose of this test, we will create dummy test data.

# Decide once, for THIS run, whether the stand-in model is needed. The flag
# replaces the old `'original_model' in locals()` check: in a notebook that
# name survives from an earlier run, so a later run with a trained model would
# have been reset to the stale None in the `finally` block.
using_dummy_model = model is None
original_model = model

if using_dummy_model:
    print("Модель не была обучена из-за недостатка данных. Тестирование прогнозирования с реальной моделью невозможно.")
    print("Выполняется тестирование функции predict_outcome с заглушкой модели (dummy model).")

    class DummyModel:
        """Stand-in exposing predict_proba so predict_outcome's flow can be exercised."""

        def predict_proba(self, X):
            """Return an (n_samples, 2) array of made-up class probabilities.

            The positive-class value grows with the EuroSCORE feature plus a
            little random noise, so results differ slightly between runs.
            """
            euroscore_index = 6  # EuroSCORE is the 7th feature (index 6)
            simulated_prob = X[:, euroscore_index] / 20.0 + np.random.rand(X.shape[0]) * 0.1
            simulated_prob = np.clip(simulated_prob, 0.01, 0.99)  # keep away from 0 and 1
            return np.column_stack([1 - simulated_prob, simulated_prob])

    dummy_model = DummyModel()
    # Temporarily install the stand-in as the global model read by predict_outcome.
    model = dummy_model

try:
    # Test patients; each row follows the training feature order:
    # ["возраст на момент операции", "ФВЛЖ", "ИМТ кг/м2", "СД (Да=1/Нет=0)", "креатинин ммоль/л", "скф мл/мин", "EuroSCORE II(в %)"]
    # NOTE(review): the creatinine values (0.8–1.5) may not be in the unit named
    # by the sheet column — confirm against real data.
    test_patients_data = [
        # Patient 1: Low risk profile (young, good EF, low EuroSCORE)
        [45, 55, 25, 0, 0.8, 90, 2.0],
        # Patient 2: High risk profile (old, low EF, high EuroSCORE, comorbidities)
        [70, 30, 35, 1, 1.5, 45, 15.0],
        # Patient 3: Moderate risk
        [60, 40, 30, 1, 1.0, 60, 8.0],
        # Patient 4: Another moderate risk, slightly different profile
        [55, 50, 28, 0, 0.9, 80, 4.0],
    ]

    # Display labels, aligned position-for-position with each patient row.
    test_data_headers = ["Возраст", "ФВ ЛЖ", "ИМТ", "СД", "Креатинин", "СКФ", "EuroSCORE II"]

    print("\nТестирование функции predict_outcome с новыми данными:")

    for i, patient_data in enumerate(test_patients_data):
        # Unpack the row into predict_outcome's seven positional arguments.
        prediction_output = predict_outcome(*patient_data)

        print(f"\nПациент {i+1} Данные:")
        for header, value in zip(test_data_headers, patient_data):
            print(f"  {header}: {value}")
        print(f"Прогноз: {prediction_output}")

finally:
    # Undo the swap only if this run made it, even when a prediction raised.
    if using_dummy_model:
        model = original_model
        print("\nВосстановлена оригинальная переменная модели.")


# Descriptive interpretation of the run above; nothing here is computed.
print(
    "\nАнализ прогнозов:",
    "Наборы тестовых данных представляют пациентов с разным предполагаемым уровнем риска на основе признаков.",
    "Ожидается, что пациенты с более высокими значениями факторов риска (старый возраст, низкая ФВЛЖ, высокий EuroSCORE, СД, высокая креатинин, низкая СКФ) будут иметь более высокую прогнозируемую вероятность неблагоприятного исхода.",
    "Пациенты с более низкими значениями этих факторов риска, напротив, должны иметь более низкую вероятность.",
    sep="\n",
)

# The real model was not trained, so the stand-in's numbers only illustrate the
# flow. With a trained model one would check:
# - whether predictions follow intuitive risk levels (e.g. high EuroSCORE -> high probability);
# - the range of predicted probabilities;
# - unexpected results (e.g. a seemingly low-risk patient with a high predicted probability).

# State the limits of this test run explicitly.
print(
    "\nОграничения тестирования:",
    "Тестирование проводилось с использованием либо необученной модели (при реальном запуске), либо заглушки модели (при выполнении данного блока отдельно).",
    "Результаты прогнозирования не отражают истинную производительность или интерпретируемость модели на реальных данных.",
    "Для проведения содержательного тестирования и анализа необходимо успешно обучить модель на достаточном количестве качественных данных.",
    sep="\n",
)


Модель не была обучена из-за недостатка данных. Тестирование прогнозирования с реальной моделью невозможно.
Выполняется тестирование функции predict_outcome с заглушкой модели (dummy model).

Тестирование функции predict_outcome с новыми данными:

Пациент 1 Данные:
  Возраст: 45
  ФВ ЛЖ: 55
  ИМТ: 25
  СД: 0
  Креатинин: 0.8
  СКФ: 90
  EuroSCORE II: 2.0
Прогноз: Вероятность неблагоприятного исхода: 10.64%

Пациент 2 Данные:
  Возраст: 70
  ФВ ЛЖ: 30
  ИМТ: 35
  СД: 1
  Креатинин: 1.5
  СКФ: 45
  EuroSCORE II: 15.0
Прогноз: Вероятность неблагоприятного исхода: 78.75%

Пациент 3 Данные:
  Возраст: 60
  ФВ ЛЖ: 40
  ИМТ: 30
  СД: 1
  Креатинин: 1.0
  СКФ: 60
  EuroSCORE II: 8.0
Прогноз: Вероятность неблагоприятного исхода: 48.79%

Пациент 4 Данные:
  Возраст: 55
  ФВ ЛЖ: 50
  ИМТ: 28
  СД: 0
  Креатинин: 0.9
  СКФ: 80
  EuroSCORE II: 4.0
Прогноз: Вероятность неблагоприятного исхода: 22.1%

Восстановлена оригинальная переменная модели.

Анализ прогнозов:
Наборы тестовых данных представляют

## Summary:

### Data Analysis Key Findings

*   The dataset contains 15 entries and 40 columns, initially loaded as 'object' type.
*   Significant data quality issues were identified, including non-numeric values in columns intended to be numeric and a large number of missing values across many columns.
*   Crucially, the target variable "Исход" has no usable values: all 15 entries are missing once the column is converted to a numeric type.
*   The small dataset size (15 entries) combined with extensive missing data, especially in the target variable, made meaningful statistical analysis and visualization of relationships with the target variable impossible.
*   Attempting to clean the data by dropping rows with missing values in the target column resulted in an empty DataFrame.

### Insights or Next Steps

*   **Data Acquisition and Cleaning are Critical:** The current dataset is severely limiting. Prioritize acquiring a larger, cleaner dataset with fewer missing values and correct data types, especially for the target variable "Исход".
*   **Re-evaluate Feature Selection:** Once a better dataset is available, perform empirical feature selection based on data analysis (e.g., correlation, feature importance from a preliminary model) rather than relying solely on predefined interface inputs.
