In [1]:
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sksurv.metrics import brier_score

# 예제 데이터 로드 (임의의 데이터셋 사용)
from lifelines.datasets import load_rossi

data = load_rossi()

# 데이터 전처리
X = data.drop(columns=["week", "arrest"])
y = data[["week", "arrest"]]


In [17]:
import numpy as np
import pandas as pd

# scikit-survival 관련
from sksurv.metrics import concordance_index_censored, brier_score

# lifelines
from lifelines import CoxPHFitter

# 사이킷런
from sklearn.model_selection import KFold


def transform_to_structured(df):
    """
    y의 arrest(이벤트 여부), week(생존 시간)을
    scikit-survival에서 요구하는 structured array로 변환합니다.
    """
    # event: arrest (bool), time: week (float)
    return np.array(
        [(df["arrest"].iloc[i], df["week"].iloc[i]) for i in range(len(df))],
        dtype=[("event", bool), ("time", float)],
    )


# 10-fold 교차 검증 설정
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# 각 모델의 성능 지표를 저장할 리스트
coxph_cindices = []
brier_scores_list = []

# 교차 검증
for train_index, test_index in kf.split(X):
    # 훈련/테스트 데이터셋 분리
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # scikit-survival 포맷으로 변환
    y_train_structured = transform_to_structured(y_train)
    y_test_structured = transform_to_structured(y_test)
    
    # Cox Proportional Hazards Model (lifelines)
    coxph = CoxPHFitter()
    # pd.concat([...], axis=1) 대신 join() 사용 예시
    coxph.fit(X_train.join(y_train), duration_col="week", event_col="arrest")

    # 예측(Partial Hazard) -> C-index 계산을 위해 사용
    coxph_pred_partial_hazard = coxph.predict_partial_hazard(X_test)
    
    # scikit-survival의 C-index 계산
    # concordance_index_censored의 시그니처:
    # (event_indicator, event_time, estimate) -> (cindex, _, _)
    cindex_tuple = concordance_index_censored(
        y_test_structured["event"],
        y_test_structured["time"],
        coxph_pred_partial_hazard
    )
    cindex = cindex_tuple[0]
    coxph_cindices.append(cindex)

    # ---------------------------------------------------------
    # Brier Score 계산
    # ---------------------------------------------------------
    # 예측 생존함수를 (n_samples, n_times) 형태의 np.array로 변환
    max_time_test = int(y_test["week"].max())
    times = np.arange(1, max_time_test + 1)

    # lifelines의 predict_survival_function() 결과는
    # 각 샘플에 대해 pd.Series 형태가 리스트로 반환됩니다.
    surv_funcs = coxph.predict_survival_function(X_test, times=times)
    
    # 각 시간축별 생존확률을 2차원 배열(n_test_samples, n_times)로 변환
    if isinstance(surv_funcs, pd.Series):
        # 테스트샘플이 단 1개라면, Series를 바로 2차원 배열로 변환
        surv_probs = np.array([surv_funcs.values])
    else:
        # 여러 개의 Series가 리스트로 묶여 있으면, 기존처럼 처리
        surv_probs = np.transpose([sf.values for sf in surv_funcs])
    # Brier Score 계산
    # brier_score(y_train_structured, y_test_structured, surv_probs, times)
    # 훈련 데이터가 필요 없는 경우도 있지만,
    # times별 정확한 비교를 위해 y_train_structured를 넣어주는 경우가 많습니다.
    bs = brier_score(
        y_train_structured,   # 훈련 데이터 (scikit-survival 포맷)
        y_test_structured,    # 테스트 데이터 (scikit-survival 포맷)
        surv_probs,           # 예측 생존확률 (shape: n_test, n_times)
        times                 # 평가할 시간 리스트
    )
    # times별 Brier score가 반환되므로, 필요에 따라 평균 또는 특정 시점의 값을 취함
    mean_bs = np.mean(bs)
    print(mean_bs)
    brier_scores_list.append(mean_bs)

# # 교차검증 결과 요약
# print("CoxPH C-indices:", coxph_cindices)
# print("Brier Scores (mean over times):", brier_scores_list)

# print("\n평균 C-index:", np.mean(coxph_cindices))
# print("평균 Brier Score:", np.mean(brier_scores_list))


AttributeError: 'int' object has no attribute 'values'

In [None]:

# C-index 박스 플롯
plt.figure(figsize=(10, 5))
plt.boxplot(coxph_cindices, vert=True, patch_artist=True)
plt.title("CoxPH Model C-index (10-fold CV)")
plt.ylabel("Concordance Index")
plt.show()

# Brier Score 꺾은선 그래프
plt.figure(figsize=(10, 5))
mean_brier_scores = np.mean(brier_scores, axis=0)
plt.plot(times, mean_brier_scores, label="CoxPH Brier Score")
plt.title("Brier Score over Time (CoxPH)")
plt.xlabel("Time")
plt.ylabel("Brier Score")
plt.legend()
plt.show()

# 결과 출력
print(f"CoxPH C-indices: {coxph_cindices}")
print(f"Mean C-index: {np.mean(coxph_cindices):.4f}")

In [None]:
# Cox Proportional Hazards Model
coxph = CoxPHFitter()
coxph.fit(
    pd.concat([X_train, y_train], axis=1), duration_col="week", event_col="arrest"
)

# 예측과 성능 평가 (C-index)
coxph_pred = coxph.predict_partial_hazard(X_test)
cindex = concordance_index(y_test["week"], -coxph_pred, y_test["arrest"])
coxph_cindices.append(cindex)

# Brier Score 계산
times = np.arange(1, y_test["week"].max() + 1)
surv_probs = coxph.predict_survival_function(X_test, times=times).T
brier = brier_score(y_test_structured, surv_probs, times=times)
brier_scores.append(brier)

# C-index 박스 플롯
plt.figure(figsize=(10, 5))
plt.boxplot(coxph_cindices, vert=True, patch_artist=True)
plt.title("CoxPH Model C-index (10-fold CV)")
plt.ylabel("Concordance Index")
plt.show()

# Brier Score 꺾은선 그래프
plt.figure(figsize=(10, 5))
mean_brier_scores = np.mean(brier_scores, axis=0)
plt.plot(times, mean_brier_scores, label="CoxPH Brier Score")
plt.title("Brier Score over Time (CoxPH)")
plt.xlabel("Time")
plt.ylabel("Brier Score")
plt.legend()
plt.show()

# 결과 출력
print(f"CoxPH C-indices: {coxph_cindices}")
print(f"Mean C-index: {np.mean(coxph_cindices):.4f}")