# Survival *lifelines*
[https://lifelines.readthedocs.io/](https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html)

In [None]:
%load_ext watermark
%watermark -v -m  -u -n -p pandas,numpy,matplotlib,lifelines -a Filippo_Valle -g -r -b -w

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
os.chdir("/home/jovyan/work/phd/datasets/cancers/breast")

In [None]:
#df_files = pd.read_csv("files.txt", sep="\t").set_index("file_name").dropna(how="all", thresh=5, axis=1)

# Load the per-file metadata table (first CSV column becomes the index) and keep
# only the columns that have at least 100 non-missing values.
# `how` and `thresh` are mutually exclusive in DataFrame.dropna and recent pandas
# versions reject passing both; `thresh` alone expresses the filter that was
# actually applied.
df_files = pd.read_csv("files.dat", sep=",", index_col=0).dropna(thresh=100, axis=1)
#df_files = df_files[df_files["dataset"]=="tcga"]
df_files.info()

In [None]:
# Inspect which disease-status labels occur in the metadata.
status_column = df_files["cases.0.diagnoses.0.last_known_disease_status"]
status_column.unique()

In [None]:
# Bin the age at diagnosis (days / 365) into 19 equal-width bins over 0-100,
# labelling every bin with its midpoint. Missing ages are replaced by the mean
# age before binning.
age_days = df_files["cases.0.diagnoses.0.age_at_diagnosis"]
bins = np.linspace(0, 100, 20)
avg = age_days.mean(skipna=True)
bin_midpoints = (bins[:-1] + bins[1:]) / 2
age_years = age_days.fillna(avg) / 365.
df_files["age_at_diagnosis"] = pd.cut(age_years, bins=bins, labels=bin_midpoints).__array__()

In [None]:
# Encode gender as a 0/1 indicator (1 when the raw label is "male").
is_male = df_files["cases.0.demographic.gender"].eq("male")
df_files["gender"] = is_male.astype(int)

In [None]:
# Collapse sub-stages ("stage ia", "stage iib", ...) onto their parent stage
# everywhere in the table, then show the remaining stage labels.
base_stages = ["stage %s" % numeral for numeral in ["i", "ii", "iii", "iv"]]
for letter in ["a", "b", "c"]:
    for base_stage in base_stages:
        df_files.replace(base_stage + letter, base_stage, inplace=True)
df_files["cases.0.diagnoses.0.tumor_stage"].unique()

In [None]:
# Encode the tumor stage ordinally (stage i -> 1, ..., stage x -> 6); any other
# label is left untouched.
# The result is assigned back to the column instead of calling
# `df_files["tumor_stage"].replace(..., inplace=True)`: an in-place call on a
# column selection is chained assignment, which is deprecated and does not update
# the DataFrame when pandas Copy-on-Write is active.
stage_codes = {
    stage: code
    for code, stage in enumerate(
        ["stage i", "stage ii", "stage iii", "stage iv", "stage v", "stage x"],
        start=1,
    )
}
df_files["tumor_stage"] = df_files["cases.0.diagnoses.0.tumor_stage"].replace(stage_codes)

In [None]:
# Event indicator: 1 when the raw label is "Dead", 0 otherwise (0 = Alive).
died = df_files["cases.0.demographic.vital_status"].eq("Dead")
df_files["vital_status"] = died.astype(int)

In [None]:
# Cross-check the 0/1 encoding against the raw vital-status labels.
status_keys = ["vital_status", "cases.0.demographic.vital_status"]
df_files.groupby(status_keys).count()

In [None]:
# Disease-status labels present, followed by the age-at-diagnosis histogram
# (raw values divided by 365).
print(df_files["cases.0.diagnoses.0.last_known_disease_status"].unique())
age_in_years = df_files["cases.0.diagnoses.0.age_at_diagnosis"] / 365
age_in_years.hist(bins=5)
plt.show()

In [None]:
# Histogram of the time to last follow-up (days / 365).
follow_up_years = df_files["cases.0.diagnoses.0.days_to_last_follow_up"] / 365
follow_up_years.hist(bins=5)
plt.show()

In [None]:
# Histogram of the time to death (days / 365).
death_years = df_files["cases.0.demographic.days_to_death"] / 365
death_years.hist(bins=5)

In [None]:
# Show every row whose submitter id contains "B6-A0WT".
# The lookup runs on the full metadata table: `subset` is only created in a later
# cell, so referencing it here fails when the notebook is executed top to bottom.
# regex=False matches the id as a literal substring; na=False skips missing ids
# instead of raising on them.
df_files[df_files["cases.0.submitter_id"].str.contains("B6-A0WT", regex=False, na=False)]

In [None]:
def get_survival(case):
    """Return the follow-up duration, in days, for one metadata row.

    Parameters
    ----------
    case : mapping (e.g. a DataFrame row)
        Must provide "cases.0.demographic.vital_status",
        "cases.0.demographic.days_to_death" and
        "cases.0.diagnoses.0.days_to_last_follow_up".

    Returns
    -------
    The days to death when the patient is recorded as dead, otherwise the
    days to last follow-up.
    """
    # The raw status column holds text labels ("Dead" is what the 0/1
    # `vital_status` indicator is derived from), so comparing it with 1 alone
    # never matched and every patient fell through to the follow-up branch.
    # 1 is still accepted for callers that pass an already-encoded status.
    if case["cases.0.demographic.vital_status"] in ("Dead", 1):
        return case["cases.0.demographic.days_to_death"]
    return case["cases.0.diagnoses.0.days_to_last_follow_up"]

# Row-wise: one survival/follow-up duration per metadata row.
df_files["days_survival"] = df_files.apply(get_survival, axis=1)

In [None]:
# Bin the years smoked into 9 equal-width bins over 0-100, labelled by their
# midpoint; rows that fall in no bin (including missing values) are set to 0.
bins = np.linspace(0, 100, 10)
smoke_bins = pd.cut(
    df_files["cases.0.exposures.0.years_smoked"],
    bins=bins,
    labels=(bins[1:] + bins[:-1]) / 2,
)
df_files["smoke"] = smoke_bins.to_numpy()
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: that is chained assignment, which is deprecated and has no
# effect on the DataFrame when pandas Copy-on-Write is active.
df_files["smoke"] = df_files["smoke"].fillna(0)

In [None]:
# Histogram of the survival/follow-up duration (days / 365).
survival_years = df_files["days_survival"] / 365
survival_years.hist()

In [None]:
# Random sample of 80 rows among those with a known survival duration.
has_duration = df_files["days_survival"].notna()
subset = df_files[has_duration].sample(80)

In [None]:
from lifelines.plotting import plot_lifetimes

# Observation horizon (same unit as the lifetimes below: days / 365).
CURRENT_TIME = 5

actual_lifetimes = subset["days_survival"].to_numpy()/365
observed_lifetimes = np.minimum(actual_lifetimes, CURRENT_TIME)
# `days_survival` is a time to death only for patients recorded as dead; for all
# others it is the time to last follow-up. A death is therefore observed only
# when the patient died AND the death happened before CURRENT_TIME; a follow-up
# that merely ended before CURRENT_TIME is censoring, not an event.
died = subset["vital_status"].to_numpy() == 1
death_observed = died & (actual_lifetimes < CURRENT_TIME)

ax = plot_lifetimes(observed_lifetimes, event_observed=death_observed, figsize=(18,15))

ax.set_xlim(0, CURRENT_TIME*1.1)
# Span the horizon marker over every plotted patient instead of a fixed 30 rows.
ax.vlines(CURRENT_TIME, 0, len(subset), lw=2, linestyles='--')
ax.set_xlabel("time (years)", fontsize=35)
ax.tick_params(labelsize=35)
ax.set_title(f"Births and deaths of our population, at $t={CURRENT_TIME}$", fontsize=35)
plt.tight_layout()
#print("Observed lifetimes at time %d:\n" % (CURRENT_TIME), observed_lifetimes)

In [None]:
from scipy.stats import entropy, pearsonr

# Per-sample topic distribution, centred on the mean of every topic column.
# `axis`/`columns` are passed by keyword: pandas >= 2.0 no longer accepts a
# positional axis in DataFrame.drop.
df_topics = pd.read_csv("topsbm/topsbm_level_2_topic-dist.csv", index_col=1).drop(columns="i_doc")
df_topics = df_topics.subtract(df_topics.mean(0), axis=1)

# Samples listed under "Cluster 1"; the CSV is read once and reused below.
cluster_1_members = pd.read_csv("topsbm/topsbm_level_3_clusters.csv")["Cluster 1"].dropna()
in_cluster_1 = df_topics.index.isin(cluster_1_members)
print(df_topics[in_cluster_1].idxmax(1).hist())
# Reference profile: mean topic vector of the Cluster 1 samples.
signature = df_topics[in_cluster_1].mean(0)

# Pairs [divergence from the signature, survival duration / 365] per sample.
kl_survival = []
for sample in df_topics.index:
    h = entropy(df_topics.loc[sample,:].values, signature)
    #h = np.mean(df_topics.loc[sample,:].values-signature)
    ds = df_files.loc[sample,"days_survival"]
    # np.isfinite rejects both inf and NaN; the former `h != np.nan` test was
    # always true because NaN never compares equal to anything.
    if np.isfinite(h) and pd.notna(ds):
        kl_survival.append([h, ds/365])
kl_survival[:1]

In [None]:
# Print the first 15 characters of every non-missing entry under "Topic 3".
topic_3_entries = pd.read_csv("topsbm/topsbm_level_3_topics.csv", index_col=0)["Topic 3"].dropna()
for g in topic_3_entries:
    print(g[:15])

In [None]:
# Scatter of divergence vs. survival; the title is their Pearson correlation.
kl_values, survival_values = zip(*kl_survival)
plt.plot(kl_values, survival_values, lw=0, marker="o")
plt.title(pearsonr(kl_values, survival_values)[0])
plt.xlabel("Kullbach libler da signature cluster 1")
plt.ylabel("years survival")
plt.show()

In [None]:
# Algorithm name and hierarchy level selecting the clusters file used below.
algo = "topsbm"
l = 3
clusters_path = f"{algo}/{algo}_level_{l}_clusters.csv"
pd.read_csv(clusters_path).columns

In [None]:
from lifelines.plotting import plot_lifetimes

# Observation horizon (same unit as the lifetimes below: days / 365).
CURRENT_TIME = 3

# Loop invariants: the cluster membership table and the rows with a known
# duration are identical for every cluster and every resampling round, so they
# are computed once instead of re-reading the CSV on each of the 1000 rounds.
df_cluster_members = pd.read_csv(f"{algo}/{algo}_level_{l}_clusters.csv")
with_survival = df_files[~df_files["days_survival"].isna()]


def _lifetimes_and_deaths(frame, horizon):
    """Return (lifetimes, deaths-observed-before-`horizon` mask) for `frame`."""
    lifetimes = frame["days_survival"].to_numpy()/365
    # days_survival is a follow-up (censoring) time for patients who are not
    # recorded as dead, so a death counts only when vital_status == 1 and the
    # duration is below the horizon.
    deaths = (frame["vital_status"].to_numpy() == 1) & (lifetimes < horizon)
    return lifetimes, deaths


for cluster in ["Cluster %d"%(c+1) for c in [0,4]]:
#for subtype in df_files["Subtype_Selected"].unique():
    subset = with_survival[with_survival.index.isin(df_cluster_members[cluster].dropna())]
    #subset = df_files[(~df_files["days_survival"].isna()) & (df_files["Subtype_Selected"]==subtype)]

    if len(subset) > 500:
        subset = subset.sample(100)

    actual_lifetimes, death_observed = _lifetimes_and_deaths(subset, CURRENT_TIME)
    observed_lifetimes = np.minimum(actual_lifetimes, CURRENT_TIME)
    survival = 1 - death_observed.sum()/len(actual_lifetimes)

    ax = plot_lifetimes(observed_lifetimes, event_observed=death_observed, figsize=(18,15))

    # Null distribution: the same statistic on random groups with as many rows
    # as the cluster subset it is compared with. Separate names are used so the
    # plotted `subset` is not overwritten by the last random group.
    survivals = []
    for stat in range(1000):
        random_group = with_survival.sample(len(subset))
        _, random_deaths = _lifetimes_and_deaths(random_group, CURRENT_TIME)
        survivals.append(1 - random_deaths.sum()/len(random_group))

    Z = np.abs(survival-np.average(survivals))/np.std(survivals)
    print(survival, np.average(survivals), np.std(survivals))

    ax.get_figure().set_size_inches(12,10)
    ax.set_xlim(0, CURRENT_TIME*1.1)
    ax.set_ylabel("patients", fontsize=35)
    ax.vlines(CURRENT_TIME, 0, len(subset), lw=10, linestyles='--')
    ax.set_xlabel("time (years)", fontsize=35)
    ax.tick_params(labelsize=25)
    ax.set_title(f"{cluster} population at $t={CURRENT_TIME}$,\n survival {survival.round(2)} (Z={round(Z,2)})", fontsize=15)
    plt.tight_layout()
    ax.get_figure().savefig(f"survival_{algo}_{cluster}_t{CURRENT_TIME}.pdf")
    plt.show()
#print("Observed lifetimes at time %d:\n" % (CURRENT_TIME), observed_lifetimes)

In [None]:
# Rows with a known duration, and the three series derived from them
# (durations and entry are the raw day counts divided by 365).
subset = df_files[df_files["days_survival"].notna()]
data = {
    "duration": subset["days_survival"]/365,
    "observed": subset["vital_status"],
    "entry": subset["cases.0.demographic.days_to_birth"]/365,
}

In [None]:
from lifelines import KaplanMeierFitter
# Single fitter instance; the cells below call fit()/plot() on it repeatedly.
kmf = KaplanMeierFitter()

In [None]:
# Sum of the 0/1 indicator, i.e. the number of rows flagged as dead.
n_deaths = df_files["vital_status"].sum()
n_deaths

In [None]:
# Short aliases: T = durations, E = event indicator, entry = entry times.
T, E, entry = data["duration"], data["observed"], data["entry"]

In [None]:
# Fit the estimator on all rows and plot it with censoring marks.
kmf.fit(T, event_observed=E)
ax = kmf.plot(show_censors=True, figsize=(18,15))

big_font = 35
ax.set_title('Survival function of Breast', fontsize=big_font)
ax.set_ylabel("Survival(t)", fontsize=big_font)
ax.set_xlabel("time (years)", fontsize=big_font)
ax.tick_params(labelsize=30)
ax.set_xlim(0,25)
ax.set_ylim(0,1)

In [None]:
from lifelines.utils import median_survival_times
# Median of the last fit, with the value derived from its confidence interval.
median_estimate = kmf.median_survival_time_
median_ci = median_survival_times(kmf.confidence_interval_)
print(median_estimate, "+-", median_ci)

In [None]:
# One survival curve per gender on shared axes.
ax = plt.subplot(111)

mask = (subset["cases.0.demographic.gender"]=="male")

kmf.fit(T[mask], event_observed=E[mask], label="male")
kmf.plot(ax=ax)

kmf.fit(T[~mask], event_observed=E[~mask], label="female")
kmf.plot(ax=ax)

plt.ylim(0, 1)
plt.xlim(0,15)
# The two curves are split by gender, not by cancer type, so the title says so.
plt.title("Survival by gender")

In [None]:
# Survival durations (days / 365) of the BRCA.Basal rows flagged as dead.
# Each comparison is parenthesised: `&` binds tighter than `==`, so without the
# parentheses the expression was evaluated as `(is_basal & vital_status) == 1`.
is_basal = df_files["Subtype_Selected"] == "BRCA.Basal"
is_dead = df_files["vital_status"] == 1
(df_files[is_basal & is_dead]["days_survival"]/365).hist()

In [None]:
# Print the submitter id of every BRCA.LumB row, each followed by a comma.
is_lumb = df_files["Subtype_Selected"]=="BRCA.LumB"
lumb_cases = df_files.loc[is_lumb, "cases.0.submitter_id"]
for c in lumb_cases.values:
    print(c,",")

In [None]:
# Integer code per subtype (missing subtypes are grouped as "unknown").
key = 'Subtype_Selected_num'
titles, subset[key]=np.unique(subset["Subtype_Selected"].fillna("unknown"), return_inverse=True)
labels = subset[key].unique()

# One panel per cluster on a 3x3 grid (the ninth panel stays empty).
fig, axs = plt.subplots(3, 3, figsize=(18,15))
titles = ["Cluster %d"%(c+1) for c in range(8)]
# Cluster membership table, read once instead of once per panel.
df_cluster_members = pd.read_csv("topsbm/topsbm_level_3_clusters.csv")
#for i, label in enumerate(np.sort(labels)):
for i,cluster in enumerate(titles):

    ax = axs.ravel()[i]

    #ix = subset[key] == label
    label = cluster
    ix =  subset.index.isin(df_cluster_members[cluster].dropna())

    kmf.fit(T[ix], E[ix], label=label)
    kmf.plot(ax=ax, legend=False)

    # Panel title: cluster name followed by the number of rows it contains.
    ax.set_title(titles[i]+f"({ix.sum()})")
    ax.set_xlim(0, 25)
    ax.set_ylim(0.8,1)

    if i==0:
        # Label this panel explicitly: plt.ylabel acts on the figure's current
        # axes, which is not necessarily `ax`.
        ax.set_ylabel('Frac. alive after $n$ days')

plt.tight_layout()

In [None]:
import importlib, survival
importlib.reload(survival)
from survival import fit_cox, add_group_to_subset, save_plot

In [None]:
# Per-sample topic distribution, indexed by the file's second column, without the
# "i_doc" column. The column is dropped by keyword: pandas >= 2.0 no longer
# accepts a positional axis in DataFrame.drop.
df_clusters = pd.read_csv("topsbm/topsbm_level_1_topic-dist.csv",index_col=1).drop(columns="i_doc")
#df_clusters = pd.read_csv("lda/lda_level_2_topic-dist.csv",index_col=1).drop("i_doc",1)

#df_clusters = pd.read_csv("topsbm/topsbm_level_1_topic-dist.csv",index_col=1).drop("i_doc",1)
#df_clusters = df_clusters[df_clusters.index.isin(filter(lambda doc: "GTEX" not in doc,df_clusters.index))]

In [None]:
# Preview the first two rows of the metadata table.
df_files.iloc[:2]

In [None]:
# For each project and each topic column: build the grouped table, run
# fit_cox, report the last summary row when its -log2(p) exceeds 3, and save
# the plot when one is returned.
model_columns = ["days_survival","vital_status","gender", "tumor_stage", "age_at_diagnosis"]
for dataset in ["TCGA-LUAD", "TCGA-LUSC"]:
    has_duration = ~df_files["days_survival"].isna()
    stage_reported = df_files["cases.0.diagnoses.0.tumor_stage"]!="not reported"
    in_project = df_files["cases.0.project.project_id"]==dataset
    mask = has_duration & stage_reported & in_project
    subset = df_files[mask]
    subset = subset[model_columns]
    for topic in df_clusters.columns:
        top_set = add_group_to_subset(topic, subset, df_clusters, 0.5)
        print(top_set["group"].sum())
        summary, _, ax = fit_cox(top_set, topic)
        if summary is not None and summary.at[summary.index[-1],"-log2(p)"] > 3:
            last_row = summary.loc[summary.index[-1],["coef", "p"]]
            print(dataset,": ",topic,"\n",last_row,"\n")
        if ax is None:
            continue
        ax.set_title(dataset+" "+ax.title.get_text(), fontsize=35)
        save_plot(ax, dataset, topic)

In [None]:
# Same per-topic procedure on every row with a known duration and a reported
# tumor stage, regardless of project; plots are saved under the "all" tag.
has_duration = ~df_files["days_survival"].isna()
stage_reported = df_files["cases.0.diagnoses.0.tumor_stage"]!="not reported"
mask = has_duration & stage_reported
subset = df_files[mask]
subset = subset[["days_survival","vital_status","gender", "tumor_stage", "age_at_diagnosis"]]
for topic in df_clusters.columns:
    top_set = add_group_to_subset(topic, subset, df_clusters, 0.5)
    print(topic, top_set["group"].sum())
    summary, _, ax = fit_cox(top_set, topic)
    if summary is not None and summary.at[summary.index[-1],"-log2(p)"] > 3:
        last_row = summary.loc[summary.index[-1],["coef", "p"]]
        print(topic,"\n",last_row,"\n")
    if ax is None:
        continue
    ax.set_title(ax.title.get_text(), fontsize=35)
    save_plot(ax, "all", topic)

In [None]:
# Rebuild the grouped table for "Topic 3" before fitting, with the same call
# the loops use: the `top_set` left over from the previous cell was built for
# the last topic iterated there, not necessarily for "Topic 3".
top_set = add_group_to_subset("Topic 3", subset, df_clusters, 0.5)
summary, _, ax = fit_cox(top_set, "Topic 3")
print(summary[["coef", "exp(coef)", "p", "-log2(p)", "corrected_p", "-log2(corrected_p)"]])
save_plot(ax, "all", "Topic 3")

In [None]:
# Print the first 15 characters of every entry under "Topic 18".
topic_18_entries = pd.read_csv("datasets/cancers/lung/lda/lda_level_2_topics.csv",index_col=1)["Topic 18"]
for g in topic_18_entries.values:
    print(g[:15])

In [None]:
# Per column: does any cell equal "ENSG00000121552"?
df_level_1_topics = pd.read_csv("datasets/cancers/lung/topsbm/topsbm_level_1_topics.csv",index_col=1)
df_level_1_topics.eq("ENSG00000121552").any()