In [2]:
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test
from itertools import permutations


# Column order of every result table; also lets an empty table keep its header.
_RESULT_COLUMNS = ['Reference', 'Group', 'MST (month)', 'Hazard ratio', '95% CI', 'P value']


def _median_survival_by_group(data, label, duration_col, event_col):
    """Return ``{group: Kaplan-Meier median survival time}`` for each value of ``label``."""
    kmf = KaplanMeierFitter()
    medians = {}
    for grp in data[label].unique():
        in_grp = data[label] == grp
        kmf.fit(data.loc[in_grp, duration_col], event_observed=data.loc[in_grp, event_col])
        medians[grp] = kmf.median_survival_time_
    return medians


def _pairwise_hazard_ratio(data, label, ref, comp, duration_col, event_col):
    """Fit a one-covariate Cox model on the ``ref``/``comp`` rows of ``data``.

    The covariate ``grp_code`` is 1 for ``comp`` rows and 0 for ``ref`` rows, so
    the returned hazard ratio is that of ``comp`` relative to ``ref``.

    Returns:
        Tuple ``(hazard_ratio, ci_lower_95, ci_upper_95)`` read from the
        ``grp_code`` row of ``CoxPHFitter.summary``.
    """
    pair = data.loc[data[label].isin([ref, comp]), [duration_col, event_col, label]].copy()
    pair['grp_code'] = (pair[label] == comp).astype(int)

    fit_df = pair[[duration_col, event_col, 'grp_code']].astype({
        duration_col: float,
        event_col: int,
        'grp_code': int,
    })

    cph = CoxPHFitter()
    cph.fit(fit_df, duration_col=duration_col, event_col=event_col)

    summary_row = cph.summary.loc['grp_code']
    return (
        summary_row['exp(coef)'],
        summary_row['exp(coef) lower 95%'],
        summary_row['exp(coef) upper 95%'],
    )


def run_survival_analysis(df, duration_col, event_col, output_excel, output_suffix):
    """Run pairwise survival comparisons for every ``*Label`` column of ``df``.

    For each column whose name ends with ``'Label'`` the function:

    * estimates the Kaplan-Meier median survival time of every group,
    * for every ordered (reference, group) pair runs a log-rank test and fits a
      Cox model with a single 0/1 group indicator,
    * writes the resulting table to ``"{label}_{output_suffix}.csv"`` (relative
      to the current working directory) and to a sheet of ``output_excel``
      named after the label, truncated to 31 characters.

    Rows with a missing duration, event flag or label are dropped once per
    label column, so the Kaplan-Meier, log-rank and Cox results in a table are
    all computed from the same observations.

    Args:
        df: DataFrame holding the duration, event and label columns.
        duration_col: Name of the column with follow-up durations.
        event_col: Name of the column with event indicators (cast to ``int``
            for the Cox fit).
        output_excel: Path of the Excel workbook to create (xlsxwriter engine).
        output_suffix: Suffix used in the per-label CSV file names.

    Returns:
        None. Results are written to disk only.
    """
    # Non-string column names cannot be label columns (and have no .endswith).
    label_cols = [c for c in df.columns if isinstance(c, str) and c.endswith('Label')]

    with pd.ExcelWriter(output_excel, engine='xlsxwriter') as writer:
        for label in label_cols:
            # One complete-case frame per label keeps all three analyses aligned.
            data = df[[duration_col, event_col, label]].dropna()

            medians = _median_survival_by_group(data, label, duration_col, event_col)
            groups = sorted(medians)
            rows = []
            # The two-sample log-rank test does not depend on group order, so
            # each unordered pair is tested once and reused for both directions.
            p_values = {}

            for ref, comp in permutations(groups, 2):
                pair_key = frozenset((ref, comp))
                if pair_key not in p_values:
                    is_ref = data[label] == ref
                    is_comp = data[label] == comp
                    p_values[pair_key] = logrank_test(
                        data.loc[is_ref, duration_col],
                        data.loc[is_comp, duration_col],
                        event_observed_A=data.loc[is_ref, event_col],
                        event_observed_B=data.loc[is_comp, event_col],
                    ).p_value

                hr, ci_low, ci_up = _pairwise_hazard_ratio(
                    data, label, ref, comp, duration_col, event_col
                )

                rows.append({
                    'Reference': ref,
                    'Group': comp,
                    'MST (month)': f"{medians[comp]:.1f}",
                    'Hazard ratio': f"{hr:.3f}",
                    # \u2013 is an en dash; spelled as an escape so the source
                    # file's encoding cannot garble it.
                    '95% CI': f"{ci_low:.3f}\u2013{ci_up:.3f}",
                    'P value': f"{p_values[pair_key]:.3f}",
                })

            # Each group compared with itself: placeholder row with HR fixed at 1.
            rows.extend(
                {
                    'Reference': grp,
                    'Group': grp,
                    'MST (month)': '-',
                    'Hazard ratio': '1',
                    '95% CI': '-',
                    'P value': '-',
                }
                for grp in groups
            )

            # Explicit columns so a label with no usable groups yields an empty
            # table instead of a KeyError in sort_values.
            result_df = pd.DataFrame(rows, columns=_RESULT_COLUMNS).sort_values(
                ['Reference', 'Group']
            )
            result_df.to_csv(f"{label}_{output_suffix}.csv", index=False)
            result_df.to_excel(writer, sheet_name=label[:31], index=False)


# Overall Survival (OS): analyse the 'Survival Months' / 'E_Survival' columns
# and write the workbook 'Overall_Survival.xlsx'.
os_df = pd.read_csv('survival_data.csv')
run_survival_analysis(
    os_df,
    'Survival Months',
    'E_Survival',
    output_excel='Overall_Survival.xlsx',
    output_suffix='Overall_Survival',
)

# Disease-Free Survival (DFS): same input file, re-read into its own frame,
# analysed on the 'Relapse Months' / 'E_Relapse' columns.
dfs_df = pd.read_csv('survival_data.csv')
run_survival_analysis(
    dfs_df,
    'Relapse Months',
    'E_Relapse',
    output_excel='Disease_Free_Survival.xlsx',
    output_suffix='Disease_Free_Survival',
)



>>> events = df['E_Relapse'].astype(bool)
>>> print(df.loc[events, 'grp_code'].var())
>>> print(df.loc[~events, 'grp_code'].var())

A very low variance means that the column grp_code completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



>>> events = df['E_Relapse'].astype(bool)
>>> print(df.loc[events, 'grp_code'].var())
>>> print(df.loc[~events, 'grp_code'].var())

A very low variance means that the column grp_code completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



>>> events = df['E_Relapse'].astype(bool)
>>> print(df.loc[events, 'grp_code'].var())
>>> print(df.loc[~events, 'grp_code'].var())

A very low variance means that the column grp_code completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-de