In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from pathlib import Path
from IPython.display import display
pd.options.display.max_columns = None
from sklearn.preprocessing import StandardScaler

### This notebook creates a full dataframe in which each row is a slide, together with all metadata for the TCGA dataset. From here all tasks can be defined.

In [2]:
# Patient status and genomic features: the TCGA table provided by Zsofi.
# The first column of the file is used as the index.
status = pd.read_csv(
    '../../data/TCGA_metadata/TCGA_OV_HRDstatus.txt',
    sep='\t',
    index_col=0,
)
# Number of rows in the status table.
status.shape[0]

425

In [3]:
# Drug table. Its "Drug" column is duplicated under the column name that the
# treatment table uses, so the two tables can later be merged on that name.
df_drugs = pd.read_csv("/mnt/ncshare/ozkilim/BRCA/data/TCGA_metadata/hnsc_lusc_luad_ov_breast_drugs.txt", sep='\t')
df_drugs = df_drugs.assign(pharmaceutical_therapy_drug_name=df_drugs["Drug"])

# Drug-treatment responses for all of TCGA.
df_responces = pd.read_csv("/mnt/ncshare/ozkilim/BRCA/data/TCGA_metadata/panTCGA_drug_treatment.txt", sep='\t')

# TCGA-CDR supplemental table S1, restricted to rows whose "type" is "OV".
df = pd.read_excel("/mnt/ncshare/ozkilim/BRCA/data/TCGA_metadata/TCGA-CDR-SupplementalTableS1.xlsx")
df_OV = df.loc[df["type"] == "OV"]
print(df_OV.shape[0])

# Rows per patient barcode; a count of 1 everywhere means one row per patient.
df_OV["bcr_patient_barcode"].value_counts()

587


TCGA-04-1331    1
TCGA-29-1702    1
TCGA-25-2401    1
TCGA-25-2404    1
TCGA-25-2408    1
               ..
TCGA-13-A5FU    1
TCGA-20-0987    1
TCGA-20-0990    1
TCGA-20-0991    1
TCGA-WR-A838    1
Name: bcr_patient_barcode, Length: 587, dtype: int64

In [4]:
# Keep only OV patients that also appear in the treatment-response table
# (inner join on the patient barcode; one row per treatment record).
merged_ov = df_OV.merge(df_responces, on="bcr_patient_barcode", how="inner")

# Number of distinct patients with response data, then total number of rows.
print(merged_ov["bcr_patient_barcode"].nunique(dropna=False))
print(merged_ov.shape[0])

530
2515


In [5]:
# Attach the drug-table columns to every treatment record via the drug name.
# Author's note: 7 patients are lost in this inner join - possibly their drugs
# have no entry in the drug table.
all_merged = merged_ov.merge(df_drugs, on="pharmaceutical_therapy_drug_name", how="inner")

### Aggregation per patient: mark `platina` as positive if the patient received at least one platinum-based drug

In [6]:
# Aggregation rule for every column except the grouping key:
# 'platina' takes the maximum over a patient's rows (so a single non-zero
# record is enough), every other column takes the group's 'first' value.
aggregation_functions = {
    name: ('max' if name == 'platina' else 'first')
    for name in all_merged.columns
    if name != 'bcr_patient_barcode'
}

# One row per patient, then:
#   1. drop patients whose aggregated 'platina' is 0 (never got a platinum drug),
#   2. rename the barcode column to "PatientID" for the later merge,
#   3. drop patients without a DSS value.
aggregated_patients_df = (
    all_merged
    .groupby('bcr_patient_barcode', as_index=False)
    .agg(aggregation_functions)
    .loc[lambda patients: patients['platina'] != 0]
    .rename(columns={"bcr_patient_barcode": "PatientID"})
    .loc[lambda patients: patients['DSS'].notna()]
)

print(aggregated_patients_df.shape[0])

503


### Number of patients with platinum treatment response data (DSS): 503
### Number of patients with HRD data : 425
### Number of intersecting patients: 366

In [7]:
# Intersection of patients that have both platinum-treatment data and genomic
# (HRD status) data, joined on "PatientID". The two leftover "Unnamed: 0"
# columns produced by the merge are removed.
all_labels = (
    aggregated_patients_df
    .merge(status, on="PatientID", how="inner")
    .drop(columns=["Unnamed: 0_x", "Unnamed: 0_y"])
)
print(all_labels.shape[0])

366


In [8]:
# All .svs slide files in the TCGA-OV slide directory. Sorted so that the slide
# order (and therefore the row order of the table built from it) does not
# depend on the order in which the filesystem lists the files.
paths = sorted(Path('/tank/WSI_data/Ovarian_WSIs/TCGA-OV').glob('*.svs'))

# Per slide: [first three '-'-separated fields of the file name,
#             last three characters of the file name before its first '.'].
# reshape(-1, 2) keeps the array 2-D even when no slide is found, so the column
# indexing below does not raise IndexError on an empty directory.
slide_info = np.array(
    [['-'.join(path.name.split('-')[:3]), path.name.split('.')[0][-3:]] for path in paths],
    dtype=str,
).reshape(-1, 2)
slide_types = slide_info[:, 1]

# Slide name: the first ten '-'-separated fields of the file name re-joined,
# with the last four characters (the ".svs" suffix) removed.
# (Previously slide_names was first set from slide_info[:, 0] and then
# immediately overwritten by this value; only this definition is kept.)
slide_names = np.array(['-'.join(path.name.split('-')[:10])[:-4] for path in paths], dtype=str)

### Make each row a slide ID, with the remaining columns holding the labels for that patient.

In [9]:
# Add, for every patient in the label table, the slides that belong to them.
df = all_labels.copy()

# Full path strings (used for matching) and file names without ".svs".
path_strings = [str(path) for path in paths]
file_stems = [full.split('/')[-1].replace('.svs', '') for full in path_strings]

# 'slide_paths': comma-separated file stems of all slides whose path contains
# the patient ID. 'slide_types': comma-separated type codes of all slides whose
# slide name contains the patient ID. Patients without slides get ''.
df['slide_paths'] = [
    ','.join(stem for full, stem in zip(path_strings, file_stems) if patient_id in full)
    for patient_id in df['PatientID']
]
df['slide_types'] = [
    ','.join(kind for name, kind in zip(slide_names, slide_types) if patient_id in name)
    for patient_id in df['PatientID']
]

In [None]:
# Turn the comma-separated per-patient strings into per-patient lists.
# The lists are kept in local Series instead of being written back into `df`,
# so re-running this cell on the same `df` gives the same result (overwriting
# the columns made a second run call `.str.split` on lists and fail).
slide_path_lists = df['slide_paths'].str.split(',')
slide_type_lists = df['slide_types'].str.split(',')

# Every slide needs exactly one slide type, otherwise the pairing below is wrong.
assert (slide_path_lists.str.len() == slide_type_lists.str.len()).all(), \
    "Lists in 'slide_paths' and 'slide_types' have different lengths."

# Pair each slide with its type so both are exploded together, one pair per row.
slide_pairs = pd.Series(
    [list(zip(stems, kinds)) for stems, kinds in zip(slide_path_lists, slide_type_lists)],
    index=df.index,
    dtype=object,
)
df_exploded = df.assign(combined=slide_pairs).explode('combined')

# Split the (slide, type) tuples back into the two columns, then drop the helper.
df_exploded['slide_paths'] = df_exploded['combined'].str[0]
df_exploded['slide_types'] = df_exploded['combined'].str[1]
df_exploded = df_exploded.drop(columns='combined')

# A patient without any matching slide has '' in 'slide_paths', which splits to
# [''] and would otherwise produce a row with an empty slide ID.
df_exploded = df_exploded[df_exploded['slide_paths'] != '']

# Rename columns for CLAM compatibility.
df_exploded = df_exploded.rename(columns={"PatientID": "case_id", "slide_paths": "slide_id"})

In [None]:
# Log-transform and then standard-scale all genomic features.
scaler = StandardScaler()
genomic_features = [
    "Signature.1", "Signature.2", "Signature.3", "Signature.5", "Signature.8", "Signature.13",
    "Microhomology2", "Microhomology2ratio", "Del/ins-ratio", "Del10-ratio",
    "HRD-LOH", "Telomeric.AI", "LST",
    "DBS2", "DBS4", "DBS5", "DBS6", "DBS9",
    "SBS1", "SBS2", "SBS3", "SBS5", "SBS8", "SBS13", "SBS18", "SBS26",
    "SBS35", "SBS38", "SBS39", "SBS40", "SBS41",
    "ID1", "ID2", "ID4", "ID8",
    "HRDetect",
]

# log(1 + x) transform, so that x == 0 maps to 0 instead of -inf.
# Done as one vectorised call instead of a per-element `applymap` lambda
# (`DataFrame.applymap` is deprecated in recent pandas).
log_features = np.log1p(df_exploded[genomic_features].astype(float))

# Standard scaling (zero mean, unit variance per feature).
# NOTE(review): the scaler is fitted on the slide-level table, so a patient with
# several slides contributes several identical rows to the mean/std, and it is
# fitted on all rows before any train/test split - confirm this is intended.
genomic_features_scaled = scaler.fit_transform(log_features)
df_exploded[genomic_features] = genomic_features_scaled

In [None]:
# Write the slide-level table to disk without the index column.
df_exploded.to_csv(
    "/mnt/ncshare/ozkilim/BRCA/data/tasks/combined_genomic_plat_responce.csv",
    index=False,
)