In [1]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

In [2]:
## CALCULATE STROMA SCORE
# df = pd.read_excel("../data/TCGA_MEASUREMENTS.xlsx")
# Should we compute score before or after split, or in general have same cuts for all (as technically we don't know 
# labels for the test set)
SEED = 42

def specificity(y_true, y_pred):
    TN = np.sum(np.logical_and(y_pred == 0, y_true == 0))

    N = len(y_true) -np.sum(y_true)
    
    return TN/N

def calculate_stroma_score(df, allCuts=None, allWeights=None):
#     df = pd.read_excel(data_dir)
    no_rows = len(df)
    classes_col = ["ADI", "BACK", "DEB", "LYM", "MUC", "MUS", "NORM", "STR", "TUM"]

    x = df.to_numpy()
    x[:, -1] = x[:, -1] / 365.35  # convert 'days to event' to 'years to event'
    x[:, 11] = x[:, 11] / 10  # convert 'years to birth' to 'decades to birth'
    x = np.append(x, np.zeros((no_rows, 1)), axis=1)  # add column for HD score

    if allWeights is None:
        # Calculate allWeights
        df["years_to_event"] = (
            df["days_to_event"] / 365.25
        )  # convert 'days to event' to 'years to event'
        df["decades_to_birth"] = (
            df["years_to_birth"] / 10
        )  # convert 'years to birth' to 'decades to birth'
        y = df[["years_to_event", "vital_status"]]

        cph_models = [
            CoxPHFitter().fit(
                pd.concat([df[col], y], axis=1), "years_to_event", "vital_status"
            )
            for col in classes_col
        ]
        allWeights = np.array([float(cph.summary["exp(coef)"]) for cph in cph_models])

        
        
    # Calculate allCuts?
    # allCuts = np.array(
    #     [
    #         0.00056,
    #         0.00227,
    #         0.03151,
    #         0.00121,
    #         0.01123,
    #         0.02359,
    #         0.06405,
    #         0.00122,
    #         0.99961,
    #     ]
    # )  # Youden cuts
    
    if allCuts==None:

        allCuts = []

        for i, col_name in enumerate(classes_col):
            class_score = df[col_name]
            max_index = 0
            median = np.median(class_score)
            for j, score in enumerate(class_score):
                if j == 0:
                    allCuts.append(score)

                preds = np.greater_equal(class_score, score).astype(int)
                sens = recall_score(df["vital_status"], preds)
                spec = specificity(df["vital_status"], preds)
                if (sens + spec) > max_index:
                    max_index = sens + spec
                    allCuts[i] = score
                elif sens + spec == max_index:
                    if abs(score - median) < abs(allCuts[i] - median):
                        allCuts[i] = score
                        max_index = score

    # Calculate stroma score
    scoreIndices = (np.argwhere(allWeights >= 1)).flatten()
    for i in scoreIndices:
        x[:, -1] = (
            x[:, -1] + (x[:, i + 1] >= allCuts[i]) * allWeights[i]
        )  # +1 retrieve column number in x
    medianTrainingSet = np.median(x[:, -1])
    x[:, -1] = (x[:, -1] >= medianTrainingSet) * 1
    stroma_score = x[:, -1]

    return stroma_score, allCuts, allWeights


## Original excel

In [3]:
df = pd.read_excel("../data/TCGA_MEASUREMENTS.xlsx")
df["years_to_event"] = df["days_to_event"]/365.25
df["decades_to_birth"] = df["years_to_birth"]/10

In [4]:
classes_col = ["ADI","BACK", "DEB", "LYM", "MUC","MUS", "NORM", "STR","TUM"]

In [5]:
y = df[["years_to_event","vital_status"]]
y_days = df[["days_to_event","vital_status"]]

In [6]:
df.head()

Unnamed: 0,ID,ADI,BACK,DEB,LYM,MUC,MUS,NORM,STR,TUM,...,histological_type,hypermutated,methylation_subtype,CAF_SCORE,percent_stromal_cells,RF_predictedCMS,cleanstage,days_to_event,years_to_event,decades_to_birth
0,TCGA-CM-6675,0.000284,0.000204,0.090979,0.000544,0.013228,0.007923,0.101498,0.086859,0.798861,...,colon adenocarcinoma,0.0,CIMP-H,2.080628,12,CMS1,4.0,397,1.086927,3.5
1,TCGA-AY-A8YK,0.000324,0.00029,0.004827,0.013253,0.004651,0.002694,0.066702,0.140175,0.767085,...,colon adenocarcinoma,,,1.635184,7,,4.0,573,1.568789,4.4
2,TCGA-CM-4747,0.001219,0.004085,0.197126,0.337597,0.002646,0.003099,0.158479,0.447476,0.485696,...,colon adenocarcinoma,0.0,CIMP-L,2.024608,15,,4.0,761,2.083504,4.7
3,TCGA-DY-A1DG,0.003772,0.001362,0.188463,0.002173,0.012698,0.063342,0.096374,0.012094,0.619722,...,rectal adenocarcinoma,,Cluster3,0.99005,0,,4.0,1566,4.287474,7.5
4,TCGA-CM-5862,0.007687,0.006287,0.386051,0.08692,0.132158,0.130372,0.076991,0.102107,0.481279,...,colon adenocarcinoma,0.0,Cluster3,1.944954,0,CMS2,4.0,153,0.418891,8.0


In [7]:
selected_columns = ["obtained_scores" ,"cleanstage", "gender", "decades_to_birth" ]
df_mv = df.copy()
df_mv = df_mv.dropna(subset=["cleanstage", "decades_to_birth"])
y_mv = df_mv[["years_to_event","vital_status"]]

## Our values

In [8]:
df_avg = pd.read_csv("../data/TCGA_SA_data_average.csv",)

df_avg["years_to_event"] = df_avg["days_to_event"]/365.25 # convert 'days to event' to 'years to event'
df_avg["decades_to_birth"] = df_avg["years_to_birth"]/10 # 
df_avg = df_avg[df_mv.columns]
df_avg = df_avg.dropna(subset=["cleanstage", "decades_to_birth"])

df_avg_train = df_avg.sample(frac=0.8, random_state=SEED)
df_avg_test = df_avg.drop(df_avg_train.index)

In [9]:
df_org_train = df_mv.loc[df_mv['ID'].isin(df_avg_train["ID"])]
df_org_test = df_mv.loc[df_mv['ID'].isin(df_avg_test["ID"])]

y_org_train = df_org_train[["years_to_event","vital_status"]]
y_org_test = df_org_test[["years_to_event","vital_status"]]

y_avg_train = df_avg_train[["years_to_event","vital_status"]]
y_avg_test = df_avg_test[["years_to_event","vital_status"]]

In [10]:
df_avg_train["obtained_scores"],allCuts,allWeights = calculate_stroma_score(df_avg_train)
df_avg_test["obtained_scores"],_,_ = calculate_stroma_score(df_avg_test, allCuts, allWeights)

In [11]:
df_org_train["obtained_scores"], allCuts, allWeights = calculate_stroma_score(df_org_train)
df_org_test["obtained_scores"],_,_ = calculate_stroma_score(df_org_test, allCuts, allWeights)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["years_to_event"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decades_to_birth"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_org_train["obtained_scores"], allCuts, allWeights = calculate_stroma_score(df_org_train)
A value is trying to be set on a copy of a slice from a DataF

In [12]:
# print(mv_regression_df)
mv_cox_orginal_data = CoxPHFitter().fit(pd.concat([df_org_train[selected_columns], y_org_train], axis=1), "years_to_event", "vital_status", 
                           formula = "obtained_scores + cleanstage + C(gender) +decades_to_birth" )


In [13]:
mv_cox_orginal_data.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'years_to_event'
event col,'vital_status'
baseline estimation,breslow
number of observations,270
number of events observed,57
partial log-likelihood,-251.41
time fit was run,2022-12-08 15:03:52 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
C(gender)[T.male],-0.31,0.74,0.27,-0.83,0.22,0.44,1.24,0.0,-1.15,0.25,1.99
cleanstage,0.98,2.67,0.16,0.66,1.3,1.93,3.69,0.0,5.97,<0.005,28.65
decades_to_birth,0.47,1.6,0.12,0.24,0.7,1.27,2.02,0.0,3.96,<0.005,13.69
obtained_scores[T.1],1.02,2.78,0.35,0.33,1.71,1.4,5.55,0.0,2.91,<0.005,8.12

0,1
Concordance,0.79
Partial AIC,510.82
log-likelihood ratio test,60.27 on 4 df
-log2(p) of ll-ratio test,38.51


In [14]:
concordance_results = {}
org_c = concordance_index(df_org_test['years_to_event'], 
                  -mv_cox_orginal_data.predict_partial_hazard(df_org_test), df_org_test['vital_status'])

concordance_results["original"] = org_c

In [15]:
mv_cox_avg = CoxPHFitter().fit(pd.concat([df_avg_train[selected_columns], y_avg_train], axis=1), "years_to_event", "vital_status", 
                           formula = "obtained_scores + cleanstage + C(gender) +decades_to_birth" )
concordance_results["avg"] = concordance_index(df_avg_test['years_to_event'], 
                  -mv_cox_avg.predict_partial_hazard(df_avg_test), df_avg_test['vital_status'])

In [16]:
mv_cox_avg.print_summary()
print(concordance_results)

0,1
model,lifelines.CoxPHFitter
duration col,'years_to_event'
event col,'vital_status'
baseline estimation,breslow
number of observations,270
number of events observed,57
partial log-likelihood,-256.48
time fit was run,2022-12-08 15:03:53 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
C(gender)[T.male],-0.25,0.78,0.27,-0.77,0.28,0.46,1.32,0.0,-0.92,0.36,1.49
cleanstage,0.98,2.66,0.16,0.66,1.29,1.94,3.64,0.0,6.11,<0.005,29.92
decades_to_birth,0.46,1.58,0.12,0.22,0.69,1.24,2.0,0.0,3.75,<0.005,12.47
obtained_scores[T.1],-0.1,0.91,0.27,-0.63,0.43,0.53,1.54,0.0,-0.36,0.72,0.47

0,1
Concordance,0.78
Partial AIC,520.96
log-likelihood ratio test,50.12 on 4 df
-log2(p) of ll-ratio test,31.45


{'original': 0.5877862595419847, 'avg': 0.6049618320610687}


## Our data - highest probability

In [17]:
df_highest = pd.read_csv("../data/TCGA_SA_data_highest_tum.csv",)

df_highest["years_to_event"] = df_highest["days_to_event"]/365.25 # convert 'days to event' to 'years to event'
df_highest["decades_to_birth"] = df_highest["years_to_birth"]/10 # 
df_highest = df_highest[df_mv.columns]
df_highest = df_highest.dropna(subset=["cleanstage", "decades_to_birth"])

df_highest_train = df_highest.loc[df_highest['ID'].isin(df_avg_train["ID"])]
df_highest_test = df_highest.loc[df_highest['ID'].isin(df_avg_test["ID"])]

df_highest_train["obtained_scores"], allCuts, allWeights = calculate_stroma_score(df_highest_train)
df_highest_test["obtained_scores"],_,_ = calculate_stroma_score(df_avg_test,allCuts, allWeights)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["years_to_event"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decades_to_birth"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_highest_train["obtained_scores"], allCuts, allWeights = calculate_stroma_score(df_highest_train)
A value is trying to be set on a copy of a slice from

In [18]:
y_highest  = df_highest[["years_to_event","vital_status"]]

In [19]:
mv_cox_highest = CoxPHFitter().fit(df_avg_train, "years_to_event", "vital_status", 
                           formula = "obtained_scores + cleanstage + C(gender) +decades_to_birth" )


In [20]:
mv_cox_highest.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'years_to_event'
event col,'vital_status'
baseline estimation,breslow
number of observations,270
number of events observed,57
partial log-likelihood,-256.48
time fit was run,2022-12-08 15:03:58 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
C(gender)[T.male],-0.25,0.78,0.27,-0.77,0.28,0.46,1.32,0.0,-0.92,0.36,1.49
cleanstage,0.98,2.66,0.16,0.66,1.29,1.94,3.64,0.0,6.11,<0.005,29.92
decades_to_birth,0.46,1.58,0.12,0.22,0.69,1.24,2.0,0.0,3.75,<0.005,12.47
obtained_scores[T.1],-0.1,0.91,0.27,-0.63,0.43,0.53,1.54,0.0,-0.36,0.72,0.47

0,1
Concordance,0.78
Partial AIC,520.96
log-likelihood ratio test,50.12 on 4 df
-log2(p) of ll-ratio test,31.45


In [21]:
concordance_results["highest"] = concordance_index(df_highest_test['years_to_event'], 
                  -mv_cox_highest.predict_partial_hazard(df_highest_test), df_highest_test['vital_status'])

In [22]:
print(concordance_results)

{'original': 0.5877862595419847, 'avg': 0.6049618320610687, 'highest': 0.6164122137404581}


## Only raw probabilities from model

In [25]:
cols_with_classes = ["cleanstage", "gender", "decades_to_birth", "ADI", "BACK", "DEB", "LYM", "MUC", "MUS", "NORM", "STR", "TUM"]
mv_cox_raw = CoxPHFitter().fit(df_org_train, "years_to_event", "vital_status", 
                           formula = "ADI + BACK + DEB + LYM + MUC+ MUS + NORM + STR + TUM + cleanstage + C(gender) +decades_to_birth" )

In [26]:
mv_cox_raw.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'years_to_event'
event col,'vital_status'
baseline estimation,breslow
number of observations,270
number of events observed,57
partial log-likelihood,-248.22
time fit was run,2022-12-08 15:07:41 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
ADI,-0.47,0.63,4.18,-8.65,7.72,0.0,2256.01,0.0,-0.11,0.91,0.13
BACK,-47.99,0.0,32.82,-112.32,16.35,0.0,12600000.0,0.0,-1.46,0.14,2.8
C(gender)[T.male],-0.33,0.72,0.29,-0.9,0.24,0.41,1.27,0.0,-1.15,0.25,1.99
DEB,2.49,12.06,1.18,0.19,4.79,1.2,120.79,0.0,2.12,0.03,4.87
LYM,0.01,1.01,0.92,-1.79,1.8,0.17,6.06,0.0,0.01,1.00,0.01
MUC,-0.43,0.65,1.55,-3.47,2.61,0.03,13.57,0.0,-0.28,0.78,0.36
MUS,0.26,1.3,1.0,-1.7,2.23,0.18,9.3,0.0,0.26,0.79,0.34
NORM,0.26,1.3,1.0,-1.71,2.23,0.18,9.32,0.0,0.26,0.79,0.33
STR,-0.79,0.45,1.22,-3.18,1.6,0.04,4.97,0.0,-0.65,0.52,0.95
TUM,-1.38,0.25,0.76,-2.87,0.1,0.06,1.11,0.0,-1.82,0.07,3.88

0,1
Concordance,0.80
Partial AIC,520.45
log-likelihood ratio test,66.64 on 12 df
-log2(p) of ll-ratio test,29.46


In [27]:
concordance_results["raw_probs"] = concordance_index(df_org_test['years_to_event'], 
                  -mv_cox_raw.predict_partial_hazard(df_org_test), df_org_test['vital_status'])
print(concordance_results)

{'original': 0.5877862595419847, 'avg': 0.6049618320610687, 'highest': 0.6164122137404581, 'raw_probs': 0.6164122137404581}
