In [4]:
# ============================================================
# XGBoost + Tüm Feature Seti (summary + dQ/dV)
# 25 / 50 / 100 döngü için ayrı model
# ============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# ------------------------------------------------------------
# 1) Load the data and clean up cycle_life
# ------------------------------------------------------------
df = pd.read_csv("../data/intermediate/features_early_cycles.csv")
print("Toplam satır:", len(df))

# Normalise cycle_life in three steps: force everything to text, trim
# surrounding whitespace, then drop any square brackets so that the
# remaining text can be parsed as a number.
cycle_life_text = df["cycle_life"].astype(str).str.strip()
cycle_life_text = cycle_life_text.str.replace(r"[\[\]]", "", regex=True)

# Anything that still cannot be parsed becomes NaN instead of raising.
df["cycle_life"] = pd.to_numeric(cycle_life_text, errors="coerce")
print("cycle_life NaN sayısı:", df["cycle_life"].isna().sum())

# ------------------------------------------------------------
# 2) Candidate feature list
# ------------------------------------------------------------
# The dQ/dV block is the full product of four curve metrics and six
# statistics, generated in the order metric-major / statistic-minor.
candidate_features = [
    "Qd_mean",
    "IR_mean", "IR_std",
    "Tavg_mean",
    "dQd_slope",
    *(
        f"dqdv_{metric}_{stat}"
        for metric in ("peak", "area", "peakpos", "var")
        for stat in ("first", "last", "delta", "mean", "std", "slope")
    ),
    "Qd_slope", "Qd_intercept", "Qd_first", "Qd_last", "Qd_delta",
]

# Keep only the candidates that actually exist in the loaded table,
# preserving the order of candidate_features.
available_columns = set(df.columns)
feature_cols = [name for name in candidate_features if name in available_columns]
print("\nKullanılacak feature sayısı:", len(feature_cols))

# Drop every row that is missing the target or any selected feature.
required_columns = ["cycle_life", *feature_cols]
df = df.dropna(subset=required_columns)
print("Temizlik sonrası satır sayısı:", len(df))

# ------------------------------------------------------------
# 3) Training function
# ------------------------------------------------------------
def train_xgb(X, y, params, *, test_size=0.2, random_state=42):
    """Fit an XGBRegressor on a random train split and score it on the rest.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values aligned with the rows of ``X``.
        params: Keyword arguments forwarded unchanged to ``XGBRegressor``;
            the model is configured solely by this mapping.
        test_size: Hold-out size passed to ``train_test_split``. Defaults
            to 0.2, the value that used to be hard-coded.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible. Defaults to 42, the value that used to be
            hard-coded.

    Returns:
        A tuple ``(model, mae, r2, X_test, y_test, y_pred)`` holding the
        fitted model, the mean absolute error and R² on the held-out
        split, and the held-out features, targets and predictions.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = XGBRegressor(**params)
    model.fit(X_train, y_train)

    # Both metrics are computed on the held-out split only.
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return model, mae, r2, X_test, y_test, y_pred

# ------------------------------------------------------------
# 4) Hyper-parameters
# ------------------------------------------------------------
# One XGBRegressor keyword set per n_cycles value. The table at the bottom
# lists only the settings that differ between the three entries; the seed,
# thread count and tree method are identical for all of them.
params_dict = {
    n_cycles: {
        "n_estimators": n_trees,
        "learning_rate": eta,
        "max_depth": depth,
        "subsample": row_frac,
        "colsample_bytree": col_frac,
        "reg_lambda": l2,
        "reg_alpha": l1,
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }
    for n_cycles, n_trees, eta, depth, row_frac, col_frac, l2, l1 in (
        # n_cycles, n_estimators, learning_rate, max_depth,
        # subsample, colsample_bytree, reg_lambda, reg_alpha
        (25, 300, 0.01, 3, 0.9, 0.9, 4.0, 3.0),
        (50, 500, 0.02, 4, 0.85, 0.85, 3.0, 1.5),
        (100, 800, 0.03, 6, 0.80, 0.80, 2.0, 1.0),
    )
}

# Held-out metrics of each trained model, keyed by n_cycles.
results = {}

# ------------------------------------------------------------
# 5) Train one model each for 25 / 50 / 100 cycles
# ------------------------------------------------------------
for n_cycles in (25, 50, 100):

    print(f"\n{'='*60}")
    print(f"n_cycles = {n_cycles} için XGBoost modeli")
    print(f"{'='*60}")

    # Restrict the table to the rows tagged with this n_cycles value.
    row_mask = df["n_cycles"] == n_cycles
    df_sub = df[row_mask].copy()
    print("Bu n_cycles için satır sayısı:", len(df_sub))

    # With fewer than 10 rows no model is trained and no entry is added
    # to results for this n_cycles.
    if len(df_sub) < 10:
        print("⚠ Yeterli veri yok, bu n_cycles atlanıyor.")
        continue

    X = df_sub[feature_cols].values
    y = df_sub["cycle_life"].values

    model, mae, r2, X_test, y_test, y_pred = train_xgb(
        X, y, params_dict[n_cycles]
    )

    print(f"MAE: {mae:.2f}")
    print(f"R² : {r2:.4f}")

    # Label the model's importances with the feature names, highest first.
    importances = pd.Series(
        model.feature_importances_, index=feature_cols
    ).sort_values(ascending=False)

    print("\nÖzellik önemleri (ilk 10):")
    for feat, imp in importances.head(10).items():
        print(f"{feat:25s}: {imp:.4f}")

    results[n_cycles] = dict(MAE=mae, R2=r2)

# ------------------------------------------------------------
# 6) Summary of results
# ------------------------------------------------------------
print("\n\n=== XGBoost + Full Feature Set ÖZET SONUÇLAR ===")
# One line per trained model, in the order the models were trained;
# skipped n_cycles values have no entry and are therefore not listed.
for n in results:
    res = results[n]
    print(f"n_cycles = {n:3d} → MAE = {res['MAE']:.2f}, R² = {res['R2']:.4f}")


Toplam satır: 138
cycle_life NaN sayısı: 0

Kullanılacak feature sayısı: 23
Temizlik sonrası satır sayısı: 138

n_cycles = 25 için XGBoost modeli
Bu n_cycles için satır sayısı: 46
MAE: 139.99
R² : 0.2872

Özellik önemleri (ilk 10):
dqdv_peak_last           : 0.1811
Tavg_mean                : 0.1180
dqdv_area_slope          : 0.0911
dqdv_area_delta          : 0.0810
dqdv_peakpos_mean        : 0.0721
dqdv_peakpos_std         : 0.0720
dQd_slope                : 0.0658
dqdv_peak_slope          : 0.0508
IR_mean                  : 0.0421
dqdv_area_mean           : 0.0395

n_cycles = 50 için XGBoost modeli
Bu n_cycles için satır sayısı: 46
MAE: 120.69
R² : 0.4846

Özellik önemleri (ilk 10):
dqdv_area_slope          : 0.1934
dqdv_peakpos_std         : 0.1702
dqdv_peakpos_mean        : 0.1502
Tavg_mean                : 0.0988
dqdv_peakpos_slope       : 0.0730
dqdv_peak_last           : 0.0399
dqdv_area_delta          : 0.0314
IR_mean                  : 0.0307
dqdv_peak_slope          : 0.0287
d