# Import

In [2]:
dname = "tcga_cesc_os"

target_data = "mut"
cli_df = pd.read_csv(f"../datasets_csv/{dname}.csv")

cli_df = cli_df.drop_duplicates("case_id").drop(["slide_id", "group"], axis=1).reset_index(drop=True)
tests = pd.read_csv(f"../splits/{dname}/splits_0.csv")["test"].dropna().values
cli_df["split"] = [pd.NA] * len(cli_df)
cli_df.loc[cli_df["case_id"].isin(tests), "split"] = "test"
cli_df.loc[~cli_df["case_id"].isin(tests), "split"] = "train"

if target_data != "cli":
    target_df = pd.read_csv(f"../datasets_csv/{dname}_{target_data}.csv.zip", compression="zip")
    target_df = pd.merge(target_df, cli_df[["case_id", "split", "survival_months", "event"]], on="case_id")
else:
    target_df = cli_df

target_df.reset_index(drop=True, inplace=True)
target_df.shape, target_df.isna().any().any(), target_df["case_id"].duplicated().any(), target_df.columns.duplicated().any(), target_df.columns.isna().any()

((192, 14901), False, False, False, False)

# Preprocessing

In [3]:
# Train test split
train_df = target_df[target_df["split"] != "test"].drop(["split"], axis=1)
test_df = target_df[target_df["split"] == "test"].drop(["split"], axis=1)
train_df = train_df.drop_duplicates("case_id").reset_index(drop=True)
test_df = test_df.drop_duplicates("case_id").reset_index(drop=True)

train_ids = train_df[["case_id"]]
X_train = train_df.drop(["case_id", "event", "survival_months"], axis=1)
y_train = train_df["survival_months"]
y_train_event = train_df["event"]

test_ids = test_df[["case_id"]]
X_test = test_df.drop(["case_id", "event", "survival_months"], axis=1)
y_test = test_df["survival_months"]
y_test_event = test_df["event"]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((158, 14897), (158,), (34, 14897), (34,))

In [4]:
y_test.describe(), y_train.describe()

(count     34.000000
 mean      39.022647
 std       46.301964
 min        0.430000
 25%       12.695000
 50%       22.535000
 75%       43.642500
 max      209.430000
 Name: survival_months, dtype: float64,
 count    158.000000
 mean      36.527658
 std       40.318260
 min        0.070000
 25%       13.372500
 50%       23.900000
 75%       40.555000
 max      210.510000
 Name: survival_months, dtype: float64)

In [5]:
# Fill Missing Values with Median
indep_vars = X_train.columns
if X_train.isna().any().any():
    train_medians = X_train.median()
    for i, col in enumerate(indep_vars):
        if i % 1000 == 0:
            print(i, "/", len(indep_vars))
        if X_train[col].isna().any():
            X_train[col] = X_train[col].fillna(train_medians[col])
train_df = pd.concat([X_train, y_train_event, y_train], axis=1)
print(train_df.isna().any().any(), train_df.shape)

False (158, 14899)


In [6]:
X_train_all = X_train.copy() # for comparison

# Variance Filtering

In [7]:
def var_filter(X)
print("\tBefore: ", X_train.shape)
var_sel = VarianceThreshold(0.01)
X_train = var_sel.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=indep_vars[var_sel.get_support()])
print("\tAfter variance threshold: ", X_train.shape)
removed_cols = [col for col in indep_vars if col not in X_train.columns]
print("Removed:", len(removed_cols))

	Before:  (158, 14897)
	After variance threshold:  (158, 6694)
Removed: 8203


# Normalization

In [8]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_norm = sc.fit_transform(X_train)
X_train_norm = pd.DataFrame(X_train_norm, columns=X_train.columns)
print(X_train_norm.min().min(), X_train_norm.max().max())
train_df = pd.concat([X_train_norm, y_train, y_train_event], axis=1)
indep_vars = X_train_norm.columns
train_df

-0.7307397502013093 8.83176086632785


Unnamed: 0,A1CF_mut,A2M_mut,AACS_mut,AADACL2_mut,AADACP1_mut,AAK1_mut,AAMP_mut,AARS_mut,AASDH_mut,AASS_mut,...,ZWILCH_mut,ZWINT_mut,ZXDB_mut,ZYG11A_mut,ZYG11B_mut,ZZEF1_mut,ZZZ3_mut,snoU13_mut,survival_months,event
0,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,49.44,0
1,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,11.43,1
2,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,68.79,1
3,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,18.73,1
4,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,36.53,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,22.60,0
154,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,19.58,0
155,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,-0.113228,-0.139122,-0.113228,-0.180775,14.59,0
156,-0.113228,-0.113228,-0.113228,-0.113228,-0.113228,-0.139122,-0.113228,-0.113228,-0.139122,-0.113228,...,-0.139122,-0.161165,-0.113228,-0.161165,8.831761,-0.139122,-0.113228,-0.180775,93.92,1


# Correlation Filtering

In [9]:
class MultiCollinearityEliminator:
    def __init__(self, df, target, threshold):
        self.df = df
        if isinstance(target, pd.DataFrame):
            self.target = target
        else:
            self.target = pd.DataFrame(target)
        self.threshold = threshold

    def createCorrMatrix(self, include_target = False):
        if (include_target == True):
            df_target = pd.concat([self.df, self.target], axis=1)
            corrMatrix = df_target.corr(method='pearson', min_periods=30).abs()
        elif (include_target == False):
            corrMatrix = self.df.corr(method='pearson', min_periods=30).abs()
        return corrMatrix

    def createCorrMatrixWithTarget(self):
        corrMatrix = self.createCorrMatrix(include_target = True)
        corrWithTarget = pd.DataFrame(corrMatrix.loc[:,self.target.columns[0]]).drop([self.target.columns[0]], axis = 0).sort_values(by = self.target.columns[0])                    
        # print(corrWithTarget, '\n')
        return corrWithTarget

    
    def createCorrelatedFeaturesList(self):
        corrMatrix = self.createCorrMatrix(include_target = False)                          
        colCorr = []
        for column in corrMatrix.columns:
            for idx, row in corrMatrix.iterrows(): 
                if (row[column]>self.threshold) and (row[column]<1):
                    
                    if (idx not in colCorr):
                        colCorr.append(idx)
                        print(idx, column, row[column], '\n')
                    if (column not in colCorr):
                        colCorr.append(column)
        # print(colCorr, '\n')
        return colCorr

    def deleteFeatures(self, colCorr):
        corrWithTarget = self.createCorrMatrixWithTarget()                                  
        for idx, row in corrWithTarget.iterrows():
            # print(idx, '\n')
            if (idx in colCorr):
                self.df = self.df.drop(idx, axis =1)
                break
        return self.df

    def autoEliminateMulticollinearity(self):
        colCorr = self.createCorrelatedFeaturesList()                                       
        while colCorr != []:
            self.df = self.deleteFeatures(colCorr)
            colCorr = self.createCorrelatedFeaturesList()                                     
        return self.df

In [10]:
# corr > 0.7 indicates multicollinearity
# https://blog.clairvoyantsoft.com/correlation-and-collinearity-how-they-can-make-or-break-a-model-9135fbe6936a#:~:text=Multicollinearity%20is%20a%20situation%20where,indicates%20the%20presence%20of%20multicollinearity.
import matplotlib.pyplot as plt

X_train = train_df[indep_vars]
corr = train_df.corr().abs()

if len(indep_vars) < 1000:
#     if target_data == "cli":
#         ticks = ['Hispanic', 'Age', 'Stage', 'Asian',
#         'Black', 'White', 'Ex Biopsy', 'Cytology',
#         'Inc Biopsy', 'Resection', 'G1', 'G2',
#         'G3', 'G4', 'Stage I', 'Stage II',
#         'Stage III', 'Stage IV', 'time', 'event']
#         corr.columns = ticks
#         corr.index = ticks
#         plt.figure(figsize=(15, 13))
#         sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")

        # plt.show()
        # plt.savefig(f"./tcga_ov/figs/cli_hm{surv_type}.png")

    cor_sel = MultiCollinearityEliminator(df=X_train, target=y_train, threshold=.7)
    X_train = cor_sel.autoEliminateMulticollinearity()
    removed_cols2 = [i for i in indep_vars if i not in X_train.columns]
else:
    print((corr>=.7).sum().sum())
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    removed_cols2 = [column for column in upper.columns if any(upper[column] > .7)]
    print("Removed:", len(removed_cols2))
    X_train = X_train.drop(columns=removed_cols2, axis=1)
print("\tAfter collinearity elimination: ", X_train.shape)
train_df = pd.concat([X_train, y_train, y_train_event], axis=1)
indep_vars = X_train.columns

106564
Removed: 4334
	After collinearity elimination:  (158, 2360)


# Univariate Feature Selection

In [11]:
train_data = train_df.copy()

logrank_results = {}
sign_features = []
# Perform univariate analysis for each feature
for feature in X_train.columns:
    # Binarize or categorize the feature (e.g., by median)
    if len(X_train[feature].unique()) > 2:
        train_data["group"] = train_data[feature].apply(lambda x: x > train_data[feature].median())
    else:
        train_data["group"] = train_data[feature].apply(lambda x: x > 0)
    # Survival times and event indicators
    T = train_data['survival_months']
    E = train_data['event']

    # Compare survival distributions
    ix = train_data["group"]
    results = logrank_test(T[ix], T[~ix], event_observed_A=E[ix], event_observed_B=E[~ix])
    
    # logrank_results[feature] = str(round(results.p_value, 4))+"*" if results.p_value < 0.05 else results.p_value
    logrank_results[feature] = results.p_value
    if results.p_value < 0.05:
        sign_features.append(feature)
        # kmf = KaplanMeierFitter()
        # kmf.fit(T[ix], E[ix], label=f'{feature} high')
        # ax = kmf.plot()
        # kmf.fit(T[~ix], E[~ix], label=f'{feature} low')
        # kmf.plot(ax=ax)
        # handles, labels = ax.get_legend_handles_labels()

        # # Update labels to desired format
        # formatted_labels = [f'{feature.replace("_", " ").capitalize().replace("1.0", "II").replace("3.0", "IV")} low', 
        #                     '', 
        #                     f'{feature.replace("_", " ").capitalize().replace("1.0", "II").replace("3.0", "IV")} high', 
        #                     '']

        # # Reassign handles and labels to the legend, skipping the second and fourth items
        # plt.legend([handles[0], handles[1]], [formatted_labels[0], formatted_labels[2]])
        # plt.ylabel("S(t)", rotation=0,  fontsize=12, labelpad=15)
        # plt.xlabel("t", fontsize=12)
        # ax.text(0.75, 0.8, f'p-value = {results.p_value:.3f}', transform=ax.transAxes)
        # plt.savefig(f'./tcga_ov/figs/km_plot_{feature}_{dname}.png')

        # plt.close()
        
    # Store results
    

# Convert results to DataFrame for easier viewing
logrank_results_df = pd.DataFrame.from_dict(logrank_results, orient='index', columns=['p-value'])

print(logrank_results_df.sort_values("p-value"))
removed_cols3 = [i for i in indep_vars if i not in sign_features]
print("Removed: ", len(removed_cols3))
indep_vars = sign_features
X_train = X_train[indep_vars]
print(X_train.shape)
len(sign_features)

                  p-value
NUP107_mut   2.541644e-33
GABRG1_mut   7.811703e-18
FBXO33_mut   6.453147e-17
RBM26_mut    6.453147e-17
MAGEB10_mut  2.559625e-12
...                   ...
COL16A1_mut  9.974985e-01
BAI3_mut     9.997840e-01
ATP7A_mut    1.000000e+00
KLHL18_mut   1.000000e+00
SSH1_mut     1.000000e+00

[2360 rows x 1 columns]
Removed:  2154
(158, 206)


206

In [12]:
print(len(removed_cols), len(removed_cols2), len(removed_cols3), len(indep_vars))
max_length = max(len(removed_cols), len(removed_cols2), len(removed_cols3))

removed_cols_padded = np.pad(removed_cols, (0, max_length - len(removed_cols)), constant_values=np.nan)
removed_cols2_padded = np.pad(removed_cols2, (0, max_length - len(removed_cols2)), constant_values=np.nan)
removed_cols3_padded = np.pad(removed_cols3, (0, max_length - len(removed_cols3)), constant_values=np.nan)

removed_df = pd.DataFrame({
    "VarThresh": removed_cols_padded,
    "CollinReduced": removed_cols2_padded,
    "Univariate": removed_cols3_padded
})
removed_df

8203 4334 2154 206


Unnamed: 0,VarThresh,CollinReduced,Univariate
0,A1BG_mut,ABCC2_mut,A2M_mut
1,A1BG-AS1_mut,ABCG1_mut,AACS_mut
2,A2ML1_mut,ABR_mut,AADACL2_mut
3,A4GNT_mut,ACADS_mut,AADACP1_mut
4,AAAS_mut,ACAT1_mut,AAK1_mut
...,...,...,...
8198,ZXDA_mut,,
8199,ZYX_mut,,
8200,hsa-mir-1199_mut,,
8201,hsa-mir-7162_mut,,


In [13]:
removed_df.to_csv("./tmp_removed_os.csv", index=False)

# Feature Importance

In [14]:
y = pd.concat([y_train, y_train_event], axis=1)

cph = CoxPHFitter(penalizer=.1)
cph.fit(pd.concat([X_train, y], axis=1), 'survival_months', 'event')

# Calculate SHAP values
explainer = shap.Explainer(cph.predict_partial_hazard, X_train, max_evals=2 * X_train.shape[1] + 1)
shap_values = explainer(X_train)

# Calculate mean absolute SHAP values for feature importance
feature_importance = np.abs(shap_values.values).mean(axis=0)

# Create a DataFrame for feature importance
feature_importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': feature_importance
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False).reset_index(drop=True)
feature_importance_df



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'ACTN4_mut'].var())
>>> print(df.loc[~events, 'ACTN4_mut'].var())

A very low variance means that the column ACTN4_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.

PermutationExplainer explainer: 159it [00:12,  3.13it/s]                         


Unnamed: 0,feature,importance
0,PDE11A_mut,1.995404e+17
1,MAGEB10_mut,1.495060e+17
2,CALML3-AS1_mut,8.620886e+15
3,C11orf65_mut,8.111681e+15
4,ARL15_mut,6.955534e+15
...,...,...
201,CEP41_mut,3.724481e+00
202,TYSND1_mut,2.360922e+00
203,FBXO33_mut,1.236677e+00
204,TBC1D24_mut,3.436872e-01


In [15]:
feature_importance_df.to_csv("./tmp_feat_os.csv", index=False)

In [16]:
# # Plot the SHAP summary plot (optional)
# shap.summary_plot(shap_values, X_train, feature_names=X_train.columns)

# Cross Validation

In [17]:
def cross_validate_survival_model(X, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    c_index_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Combine features and target for CoxPHFitter
        train_data = pd.concat([X_train, y_train], axis=1)
        test_data = pd.concat([X_test, y_test], axis=1)
        
        # Fit the Cox Proportional Hazards model
        cph = CoxPHFitter(penalizer=0.1)
        cph.fit(train_data, 'survival_months', 'event')
        
        # Predict partial hazard for test set
        partial_hazard = cph.predict_partial_hazard(test_data)
        
        # Calculate concordance index
        c_index = concordance_index(test_data['survival_months'], -partial_hazard, test_data['event'])
        c_index_scores.append(c_index)
    
    return np.mean(c_index_scores), np.std(c_index_scores)


In [18]:
results = []
print_every = 50 if X_train.shape[1] > 100 else 10
for i in range(1, X_train.shape[1]+1):
    selected_features = feature_importance_df.loc[:i, "feature"].values
    X = X_train[selected_features]
    y = pd.concat([y_train, y_train_event], axis=1)

    # Perform cross-validation
    mean_c_index, std_c_index = cross_validate_survival_model(X, y)
    results.append((i, mean_c_index, std_c_index))

    if i % print_every == 0:
        print(f'Number of features: {i} | Cross-validated C-index: {mean_c_index:.4f} ± {std_c_index:.4f}')


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'PDE11A_mut'].var())
>>> print(df.loc[~events, 'PDE11A_mut'].var())

A very low variance means that the column PDE11A_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'PDE11A_mut'].var())
>>> print(df.loc[~events, 'PDE11A_mut'].var())

A very low variance means that the column PDE11A_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'ARL15_mut'].var())
>>> print(df.loc[~events, 'ARL15_mut'].var())

A very low variance means that the column ARL15_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-d

Number of features: 50 | Cross-validated C-index: 0.6260 ± 0.1060






>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'AIMP2_mut'].var())
>>> print(df.loc[~events, 'AIMP2_mut'].var())

A very low variance means that the column AIMP2_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.













>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'AIMP2_mut'].var())
>>> print(df.loc[~events, 'AIMP2_mut'].var())

A very low variance means that the column AIMP2_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.





















>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'AIMP2_mut'].var())
>>> print(df.loc[~events, 'AIMP2_mut'].var())

A very low variance means that the column AIMP2_mut completely determines whether a subject dies or not. See https://stats.stackexchange.c

Number of features: 100 | Cross-validated C-index: 0.7148 ± 0.1660




>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'ABHD13_mut'].var())
>>> print(df.loc[~events, 'ABHD13_mut'].var())

A very low variance means that the column ABHD13_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.





>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'ABHD13_mut'].var())
>>> print(df.loc[~events, 'ABHD13_mut'].var())

A very low variance means that the column ABHD13_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.




>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'ABHD13_mut'].var())
>>> print(df.loc[~events, 'ABHD13_mut'].var())

A very low variance means that the column ABHD13_mut completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109

KeyboardInterrupt: 

In [None]:
# pd.DataFrame(results, columns=["Number of features", "Mean C-index", "Std C-index"]).to_csv("./tmp_results.csv", index=False)

In [None]:
# # Define your feature set and target
# X = X_train_all.drop(removed_cols, axis=1)  # Use selected features
# y = pd.concat([y_train, y_train_event], axis=1)

# # Perform cross-validation
# mean_c_index_all, std_c_index_all = cross_validate_survival_model(X, y)
# print(f'Cross-validated C-index: {mean_c_index_all:.4f} ± {std_c_index_all:.4f}')
# results.append([X.shape[1], mean_c_index_all, std_c_index_all])

In [None]:
results_df = pd.DataFrame(results, columns=["Number of features", "Mean C-index", "Std C-index"])
results_df

Unnamed: 0,Number of features,Mean C-index,Std C-index
0,1,0.526784,0.035176
1,2,0.526784,0.035176


In [None]:
results_df[results_df["Mean C-index"] == results_df["Mean C-index"].max()]

Unnamed: 0,Number of features,Mean C-index,Std C-index
0,1,0.526784,0.035176
1,2,0.526784,0.035176


# Save

In [None]:
with pd.ExcelWriter(f'./tcga_cesc/{dname}_fs_{target_data}.xlsx') as writer:
        removed_df.to_excel(writer, sheet_name="Filtering")
        logrank_results_df.to_excel(writer, sheet_name="LogRank")
        feature_importance_df.to_excel(writer, sheet_name="SHAP")
        results_df.to_excel(writer, sheet_name="CV")

# Combine Results

In [1]:
import pandas as pd

selected_features = {}
dname = "tcga_ov_dfs"
for gen in ["rna", "mut", "dna", "cnv", "pro"]:
    with pd.ExcelFile(f'./tcga_ov/{dname}_fs_{gen}.xlsx') as ef:
        results_df = pd.read_excel(ef, sheet_name="CV")
        feature_importance_df = pd.read_excel(ef, sheet_name="SHAP")
    max_cv = results_df[results_df["Mean C-index"] == results_df["Mean C-index"].max()]
    print(f"For {gen} max CV: ", max_cv)
    nb_of_features = max_cv["Number of features"].item()
    selected_features[gen] = feature_importance_df.iloc[:nb_of_features]["feature"].values
for k, v in selected_features.items():
    print(k, len(v))

For rna max CV:       Unnamed: 0  Number of features  Mean C-index  Std C-index
202         202                 203      0.778369     0.071906
For mut max CV:      Unnamed: 0  Number of features  Mean C-index  Std C-index
35          35                  36      0.676401     0.045565
For dna max CV:      Unnamed: 0  Number of features  Mean C-index  Std C-index
57          57                  58      0.695239      0.01377
For cnv max CV:     Unnamed: 0  Number of features  Mean C-index  Std C-index
6           6                   7      0.562316     0.042915
For pro max CV:     Unnamed: 0  Number of features  Mean C-index  Std C-index
8           8                   9      0.630536     0.030632
rna 203
mut 36
dna 58
cnv 7
pro 9


In [2]:
combined = pd.read_csv(f"../datasets_csv/{dname}.csv")
for gen in ["rna", "mut", "dna", "cnv", "pro"]:
    df = pd.read_csv(f"../datasets_csv/{dname}_{gen}.csv.zip", compression="zip")
    combined = pd.merge(combined, df[["case_id"]+list(selected_features[gen])], how="outer", on="case_id")
combined

Unnamed: 0,slide_id,case_id,age,event,survival_months,group,stage_binary,race_asian,race_black_or_aa,race_white,...,ABCC8_cnv,SERPINE1|PAI-1_pro,PARP1|PARP1_pro,GAPDH|GAPDH_pro,SMAD4|Smad4_pro,HSPA1A|HSP70_pro,PGR|PR_pro,BAD|Bad_pS112_pro,BIRC2 |cIAP_pro,JUN|c-Jun_pS73_pro
0,TCGA-23-1120-01Z-00-DX1.59367B12-17F1-41AA-A6F...,TCGA-23-1120,60,0,4.27,0.0,1.0,0,0,1,...,-1.0,,,,,,,,,
1,TCGA-23-1120-01A-02-BS2.4cbf84a4-9b8f-4448-a06...,TCGA-23-1120,60,0,4.27,0.0,1.0,0,0,1,...,-1.0,,,,,,,,,
2,TCGA-23-1120-01A-01-BS1.7a9f0eb0-59d5-46ee-ae3...,TCGA-23-1120,60,0,4.27,0.0,1.0,0,0,1,...,-1.0,,,,,,,,,
3,TCGA-23-2647-01Z-00-DX1.21E5D0D8-6BA8-4D49-BA7...,TCGA-23-2647,49,0,4.43,0.0,1.0,1,0,0,...,,1.191639,-0.315083,0.205542,-0.289535,0.495657,-0.515445,0.537468,0.324704,0.082208
4,TCGA-23-2647-01A-01-BS1.6b554f23-4ac8-4798-944...,TCGA-23-2647,49,0,4.43,0.0,1.0,1,0,0,...,,1.191639,-0.315083,0.205542,-0.289535,0.495657,-0.515445,0.537468,0.324704,0.082208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,TCGA-04-1654-01A-01-BS1.f232c15b-e101-4cce-b78...,TCGA-04-1654,69,1,26.54,1.0,1.0,0,0,1,...,-1.0,-0.480076,-0.070193,1.009002,-0.086350,-1.143953,-0.462810,0.514392,0.104011,-0.153630
1032,TCGA-30-1860-01A-01-BS1.3f2f52d2-e19d-4b63-9bf...,TCGA-30-1860,58,1,13.04,1.0,1.0,0,0,1,...,-1.0,2.650477,-0.676543,-0.210584,-0.110996,-0.936343,-0.718805,0.344610,0.139261,0.123134
1033,TCGA-13-1511-01A-01-TS1.4fb08454-15fa-40f8-9d2...,TCGA-13-1511,52,1,15.08,1.0,1.0,1,0,0,...,0.0,0.061888,-0.595634,2.560458,-0.082128,-0.398623,-0.614897,-0.580904,0.347436,-0.171562
1034,TCGA-13-1511-01A-01-BS1.1db24a7e-5116-4fad-a83...,TCGA-13-1511,52,1,15.08,1.0,1.0,1,0,0,...,0.0,0.061888,-0.595634,2.560458,-0.082128,-0.398623,-0.614897,-0.580904,0.347436,-0.171562


In [3]:
case_df = combined.drop_duplicates("case_id")
case_df.isna().any().sum()

314

In [17]:
combined.to_csv(f"../datasets_csv/{dname}.csv", index=False)

# RENAME

In [40]:
import pandas as pd
dname = "tcga_ov_os"
df = pd.read_csv(f"../datasets_csv/{dname}.csv")
case_df = df.drop_duplicates("case_id")
print(df.shape, case_df.shape)
df.insert(3, "censorship", 1-df["event"])
cli_cols = [col for col in df.columns if col not in ["case_id", "slide_id", "censorship", "event", "survival_months", "group"]+[c for c in df.columns if c[-3:] in ["rna", "pro", "dna", "cnv", "mut"]]]
print(cli_cols)
df.rename(columns={col: col+"_cli" for col in cli_cols}, inplace=True)
print(df.columns)
df[["event", "censorship"]]

(1304, 30) (545, 30)
['menopause', 'height', 'weight', 'history_other_malignancy', 'age', 'open_sx', 'peritoneal_washing', 'tumor_invasion_percent', 'stage_binary', 'race_asian', 'race_black_or_aa', 'race_white', 'biopsy_excision', 'biopsy_other', 'biopsy_resection', 'residual_tumor_0', 'residual_tumor_1', 'residual_tumor_2', 'clinical_stage_0', 'clinical_stage_1', 'clinical_stage_2', 'clinical_stage_3', 'grade_0', 'grade_1', 'grade_2']
Index(['slide_id', 'case_id', 'menopause_cli', 'censorship', 'height_cli',
       'weight_cli', 'history_other_malignancy_cli', 'age_cli', 'open_sx_cli',
       'peritoneal_washing_cli', 'tumor_invasion_percent_cli', 'event',
       'survival_months', 'group', 'stage_binary_cli', 'race_asian_cli',
       'race_black_or_aa_cli', 'race_white_cli', 'biopsy_excision_cli',
       'biopsy_other_cli', 'biopsy_resection_cli', 'residual_tumor_0_cli',
       'residual_tumor_1_cli', 'residual_tumor_2_cli', 'clinical_stage_0_cli',
       'clinical_stage_1_cli', 'cl

Unnamed: 0,event,censorship
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
1299,0,1
1300,0,1
1301,0,1
1302,0,1


In [47]:
label_cols = ["case_id", "slide_id", "censorship", "event", "survival_months", "group"]
df = df[label_cols+[col for col in df.columns if col not in label_cols]]
df.head()

Unnamed: 0,case_id,slide_id,censorship,event,survival_months,group,age_cli,stage_binary_cli,race_asian_cli,race_black_or_aa_cli,...,ABCC8_cnv,SERPINE1|PAI-1_pro,PARP1|PARP1_pro,GAPDH|GAPDH_pro,SMAD4|Smad4_pro,HSPA1A|HSP70_pro,PGR|PR_pro,BAD|Bad_pS112_pro,BIRC2 |cIAP_pro,JUN|c-Jun_pS73_pro
0,TCGA-23-1120,TCGA-23-1120-01Z-00-DX1,1,0,4.27,0.0,60,1.0,0,0,...,-1.0,,,,,,,,,
1,TCGA-23-1120,TCGA-23-1120-01A-02-BS2,1,0,4.27,0.0,60,1.0,0,0,...,-1.0,,,,,,,,,
2,TCGA-23-1120,TCGA-23-1120-01A-01-BS1,1,0,4.27,0.0,60,1.0,0,0,...,-1.0,,,,,,,,,
3,TCGA-23-2647,TCGA-23-2647-01Z-00-DX1,1,0,4.43,0.0,49,1.0,1,0,...,,1.191639,-0.315083,0.205542,-0.289535,0.495657,-0.515445,0.537468,0.324704,0.082208
4,TCGA-23-2647,TCGA-23-2647-01A-01-BS1,1,0,4.43,0.0,49,1.0,1,0,...,,1.191639,-0.315083,0.205542,-0.289535,0.495657,-0.515445,0.537468,0.324704,0.082208


In [48]:
df.to_csv(f"../datasets_csv/{dname}.csv", index=False)