In [None]:
!pip install lifelines

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter

from sklearn import preprocessing
import itertools
import random
from scipy.spatial import distance as ssd
import scipy
from tqdm import tqdm
import scipy.spatial.distance
from lifelines import KaplanMeierFitter
#logrank_test
from lifelines.statistics import logrank_test
plt.ion()
plt.show()
import pickle
import os
import sys
sys.path.append("..")

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Single seed shared by Python's `random` module and NumPy's global RNG so
# that stochastic steps are repeatable across runs.
random_state = 0
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(random_state)


In [None]:
# Load the Waltons demo dataset that ships with lifelines and peek at it.
from lifelines.datasets import load_waltons

df = load_waltons()  # pandas DataFrame
print(df.shape)
print(df.head())
# Expected head of the frame:
#     T  E    group
# 0   6  1  miR-137
# 1  13  1  miR-137
# 2  13  1  miR-137
# 3  13  1  miR-137
# 4  19  1  miR-137

# Keep the T and E columns under short names.
T, E = df['T'], df['E']

In [None]:
# Number of rows per value of the "group" column.
df["group"].value_counts()

In [None]:
# Rows whose E flag is 0 (presumably the censored observations — confirm
# against the lifelines dataset description).
df.loc[df["E"].eq(0)]

In [None]:
# Dataset prefix used for the input pickles and the output figure names.
# "KIRP" is the alternative value left by the author.
filename = "BRCA"  # "KIRP"

# NOTE(review): read_pickle can execute arbitrary code while loading; only
# point these paths at files from a trusted source.
solutions = pd.read_pickle(f"data/rna_data/{filename}_solutions.pkl")
df = pd.read_pickle(f"data/rna_data/{filename}_survival.pkl")
print(df.shape)
df.head()

In [None]:
# Distribution of the vital_status labels.
df.vital_status.value_counts()

In [None]:
def _followup_time(row):
    """Return the row's follow-up time as a float.

    Reads ``days_to_last_followup`` when ``vital_status`` is "alive" and
    ``days_to_death`` for every other status.
    """
    if row["vital_status"] == "alive":
        return float(row["days_to_last_followup"])
    return float(row["days_to_death"])


# Event indicator: 1 when vital_status is 'dead', 0 otherwise.
df["event"] = df["vital_status"].apply(lambda status: int(status == 'dead'))
df["t"] = df.apply(_followup_time, axis=1)

# Keep only the trailing columns: one per row of `solutions` plus the two
# columns added above, then drop every row that still has a missing value.
# NOTE(review): this relies on column position — presumably the trailing
# columns are the subspace assignments; verify against the pickle's layout.
n_kept = solutions.shape[0] + 2
df = df[df.columns[-n_kept:]].dropna()

df

In [None]:
# Kaplan-Meier curves for every subspace column, one panel per subspace.
# When a subspace splits the samples into exactly two groups, a logrank test
# compares them; panels with p < 0.05 get a bold title and their index is
# collected in `significant` for the next cell.
# NOTE(review): the p-values are not corrected for multiple testing.
n_cols = 5

# Plot the subspace columns that actually exist instead of a hard-coded 20,
# so the loop and the grid size cannot disagree.
subspace_ids = [i for i in range(solutions.shape[0])
                if f"subspace_{i}" in df.columns]

# Ceiling division: a partially filled last row still needs its own grid row.
n_rows = max(1, (len(subspace_ids) + n_cols - 1) // n_cols)
plt.figure(figsize=(16, n_rows * 3))
significant = []
for panel, i in enumerate(subspace_ids):
    feature = f"subspace_{i}"
    ax = plt.subplot(n_rows, n_cols, panel + 1)
    kmf = KaplanMeierFitter()
    values = df[feature].unique()
    p_value = ""
    # Reset per panel so a feature without exactly two groups neither raises
    # a NameError nor inherits the previous panel's weight.
    fontweight = "normal"
    if len(values) == 2:
        group_a = df[df[feature] == values[0]]
        group_b = df[df[feature] == values[1]]
        results = logrank_test(group_a["t"].values,
                               group_b["t"].values,
                               event_observed_A=group_a["event"].values,
                               event_observed_B=group_b["event"].values)
        p_value = f"p = {round(results.p_value, 3)}"
        if results.p_value < 0.05:
            fontweight = "bold"
            significant.append(i)

    # One survival curve per group value, drawn on the same axes.
    for name, grouped_df in df.groupby(feature):
        kmf.fit(grouped_df["t"], grouped_df["event"], label=name)
        kmf.plot(ax=ax)
    plt.title(f"Subspace {i+1}, {p_value}", fontweight=fontweight)
# plt.suptitle(f"Survival curves and logrank test p-values for {filename}")
plt.tight_layout()
# Make sure the output directory exists on a fresh checkout.
os.makedirs("images", exist_ok=True)
plt.savefig(f"images/{filename}_survival.pdf", bbox_inches='tight')

In [None]:
# Re-plot, in a single row, only the subspaces collected in `significant`
# (those whose logrank test gave p < 0.05).
n_cols = len(significant)
n_rows = 1

if not significant:
    # A zero-width figure / zero-column grid cannot be drawn or saved.
    print("No subspace reached p < 0.05; nothing to plot.")
else:
    plt.figure(figsize=(5 * n_cols, n_rows * 3))

    for ii, i in enumerate(significant):
        feature = f"subspace_{i}"
        ax = plt.subplot(n_rows, n_cols, ii + 1)
        kmf = KaplanMeierFitter()
        values = df[feature].unique()
        p_value = ""
        # Decide the title weight here rather than reusing whatever value the
        # previous cell's last loop iteration happened to leave behind.
        fontweight = "normal"
        if len(values) == 2:
            group_a = df[df[feature] == values[0]]
            group_b = df[df[feature] == values[1]]
            results = logrank_test(group_a["t"].values,
                                   group_b["t"].values,
                                   event_observed_A=group_a["event"].values,
                                   event_observed_B=group_b["event"].values)
            p_value = f"p = {round(results.p_value, 3)}"
            if results.p_value < 0.05:
                fontweight = "bold"

        # One survival curve per group value, drawn on the same axes.
        for name, grouped_df in df.groupby(feature):
            kmf.fit(grouped_df["t"], grouped_df["event"], label=name)
            kmf.plot(ax=ax)
        plt.title(f"{filename} Subspace {i+1}, {p_value}", fontweight=fontweight)
    # plt.suptitle(f"Survival curves and logrank test p-values for {filename}")
    plt.tight_layout()
    # Make sure the output directory exists on a fresh checkout.
    os.makedirs("images", exist_ok=True)
    plt.savefig(f"images/imp_{filename}_survival.pdf", bbox_inches='tight')

In [None]:
# Kaplan-Meier curves split by the "gender" column, with a logrank test when
# the column has exactly two distinct values.
# NOTE(review): an earlier cell restricts df to its trailing columns —
# confirm that "gender" is still present at this point, otherwise the lookups
# below raise KeyError on a fresh Restart & Run All.
feature = "gender"
ax = plt.subplot(111)

kmf = KaplanMeierFitter()

values = df[feature].unique()
p_value = ""
if len(values) == 2:
    group_a = df[df[feature] == values[0]]
    group_b = df[df[feature] == values[1]]
    results = logrank_test(group_a["t"].values,
                           group_b["t"].values,
                           event_observed_A=group_a["event"].values,
                           event_observed_B=group_b["event"].values)
    p_value = f"\nLogrank test p-value {round(results.p_value, 3)}"
    if results.p_value < 0.05:
        # Leading space so the annotation does not run into the number.
        p_value += " => Significantly different"

# One survival curve per group value, drawn on the same axes.
for name, grouped_df in df.groupby(feature):
    kmf.fit(grouped_df["t"], grouped_df["event"], label=name)
    kmf.plot(ax=ax)
plt.title(f"KaplanMeier curve for {feature}, {p_value}");

In [None]:
# Number of rows per value of the "gender" column.
df["gender"].value_counts()