In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [3]:
# 📌 1. Загрузка данных
df = pd.read_csv('data/train_transform.csv')

In [4]:

# 📌 2. Кодирование категориальных признаков (если есть)
encoder = LabelEncoder()
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = encoder.fit_transform(df[col])

# 📌 3. Разделение данных
X = df.drop(columns=["Survived"])  # Замените "Survived" на свою целевую переменную
y = df["Survived"]

# 📌 4. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 📌 5. Список моделей
models = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting (Sklearn)": GradientBoostingClassifier(),
    "HistGradientBoosting (Sklearn)": HistGradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBoost": xgb.XGBClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "CatBoost": CatBoostClassifier(silent=True)
}

# 📌 6. Обучение моделей и расчет метрик
metrics = []
best_model = None
best_score = 0
best_model_name = ""

for name, model in models.items():
    # Обучение модели
    model.fit(X_train, y_train)
    
    # Прогноз
    y_pred = model.predict(X_test)
    
    # Метрики
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    # Проверяем, является ли текущая модель лучшей
    if accuracy > best_score:  # Можно заменить на f1 для баланса
        best_score = accuracy
        best_model = model
        best_model_name = name
    
    # Добавляем в список метрик
    metrics.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

# 📌 7. Вывод таблицы с метриками
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

# 📌 8. Сохранение лучшей модели
if best_model:
    joblib.dump(best_model, f"best_model_{best_model_name}.pkl")
    print(f"Лучшая модель: {best_model_name} (Accuracy: {best_score:.4f}) сохранена как 'best_model_{best_model_name}.pkl'")


[WinError 2] Не удается найти указанный файл
  File "C:\Users\14488\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
          

[LightGBM] [Info] Number of positive: 231, number of negative: 392
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 842
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370787 -> initscore=-0.528844
[LightGBM] [Info] Start training from score -0.528844
                            Model  Accuracy  Precision    Recall  F1 Score
0                   Random Forest  0.832090   0.831953  0.832090  0.830554
1     Gradient Boosting (Sklearn)  0.832090   0.832445  0.832090  0.830205
2  HistGradientBoosting (Sklearn)  0.817164   0.816695  0.817164  0.815492
3             Logistic Regression  0.694030   0.715491  0.694030  0.663778
4                         XGBoost  0.817164   0.816446  0.817164  0.815849