In [2]:
### Define all omics feature sets and options used in the main experiment
# Each set is built below and can be collected into a single dict of named options.
import pandas as pd
import json
from collections import defaultdict

In [3]:
def get_all_prots(df):
    """
    Return the column labels of a DataFrame that are written entirely in upper case.

    A label qualifies when it is a string and ``str.isupper()`` is true for it,
    i.e. it contains at least one cased character and none of its cased
    characters is lower case. Digits and punctuation do not affect the check, so
    a label such as ``'A1BG'`` qualifies while ``'123'`` does not.

    Args:
    df (pd.DataFrame): The DataFrame from which to extract column names.

    Returns:
    List: The qualifying column names, in their original column order.
    """
    # Non-string labels (e.g. integer or NaN column names) have no .isupper();
    # skip them instead of raising AttributeError.
    return [col for col in df.columns if isinstance(col, str) and col.isupper()]


In [4]:
# "Chowdry 60" signature as one flat list of 60 protein names (no grouping).

chowdery_60 = [
    'RAB25', 'BCL2L1', 'HADH', 'NFKB2', 'COX7A2', 'COX7C',
    'TPMT', 'GOLPH3L', 'LTA4H', 'COX6C', 'IDH1', 'YWHAG',
    'S100A10', 'COX6A1', 'NDUFB3', 'TGM2', 'CDKN1B', 'NFKB1',
    'CAMK2D', 'IL4I1', 'FDX1', 'VCAM1', 'ATM', 'NCAPH2',
    'ABCB8', 'IDI1', 'PLIN2', 'ATP6V1D', 'GPX4', 'CA2',
    'RELA', 'GLUD1', 'TOP3B', 'RPS6KB2', 'KEAP1', 'LGALS1',
    'MTDH', 'AIFM1', 'RHOA', 'CASP7', 'PTGES2', 'TFRC',
    'CHUK', 'GPX1', 'PDK1', 'STAT3', 'PECR', 'TALDO1',
    'XIAP', 'ACADSB', 'CPOX', 'ARNT', 'BIRC2', 'ACOT7',
    'HACL1', 'MYD88', 'EGFR', 'RIPK1', 'NBN', 'LDHA',
]

In [5]:
# "Chowdry 60" signature split into four named groups.
# Keys are group names; values are the protein names assigned to that group.

chowdery_60_grouped = {
    "Drug Metabolism & Biological Oxidation": [
        "TPMT",
    ],
    "Metabolic": [
        "TALDO1", "CA2", "COX7A2", "LGALS1", "S100A10", "ACADSB",
        "COX6C", "COX7C", "GPX1", "GPX4", "LDHA", "NDUFB3",
        "ATP6V1D", "ACOT7", "HACL1", "CPOX", "PTGES2", "GLUD1",
        "COX6A1", "LTA4H", "CASP7", "IL4I1", "PECR", "YWHAG",
        "IDI1", "AIFM1", "NBN", "HADH", "PLIN2", "FDX1",
        "NCAPH2", "IDH1", "ABCB8",
    ],
    "Hypoxia": [
        "TGM2", "RAB25", "CDKN1B", "EGFR", "RHOA", "NFKB1",
        "PDK1", "RPS6KB2", "TFRC", "STAT3", "ARNT", "CAMK2D",
    ],
    "NF-kB": [
        "RELA", "ATM", "BCL2L1", "BIRC2", "VCAM1", "NFKB2", "KEAP1",
        "RIPK1", "MTDH", "CHUK", "MYD88", "GOLPH3L", "TOP3B", "XIAP",
    ],
}

### TCGA clusters 

In [6]:
# Assign every protein column of the main table to the workbook sheet (group) in
# which it has the highest value, then collect the proteins per group.

# Path to the Excel workbook; each sheet after the first becomes one group.
excel_file = '../data/HGSOC_Zhang_TCGA_CPTAC_OV/1-s2.0-S0092867416306730-mmc5.xlsx'

# Open the workbook once and parse every sheet from the same handle; the context
# manager closes the file afterwards.
merged_df = None
with pd.ExcelFile(excel_file) as xls:
    # Skip the first sheet
    sheet_names = xls.sheet_names[1:]
    for sheet_name in sheet_names:
        df = xls.parse(sheet_name)
        # Assume the first column holds the protein name and the second its value
        # for this sheet; the value column is renamed after the sheet.
        df = df.rename(columns={df.columns[0]: 'Proteins', df.columns[1]: sheet_name})
        if merged_df is None:
            # First sheet: it seeds the merged table
            merged_df = df
        else:
            # Outer merge keeps proteins that appear in only some sheets
            merged_df = pd.merge(merged_df, df, on='Proteins', how='outer')

if merged_df is None:
    raise ValueError(f"No sheets to merge after skipping the first one in {excel_file!r}")

# Replace NaN with 0 (assuming missing values can be treated as 0).
# NOTE(review): if sheet values can be negative, a filled-in 0 could win the
# idxmax below for a protein that is absent from that sheet -- confirm.
merged_df = merged_df.fillna(0)
# Find the column with the highest value for each protein
merged_df['Highest_Group'] = merged_df.drop('Proteins', axis=1).idxmax(axis=1)
print(len(merged_df))
print(merged_df["Highest_Group"].value_counts())

# Convert the DataFrame to a dictionary mapping proteins to groups
protein_to_group = dict(zip(merged_df['Proteins'], merged_df['Highest_Group']))

# NOTE(review): this path is relative to the working directory, while other cells
# read a file of the same name through an absolute path -- confirm they match.
df = pd.read_csv("HGSOC_TCGA_main.csv",header=0,low_memory=False)

# Protein columns are the upper-case column names of the main table
prots = get_all_prots(df)

# Assign each protein to its group; proteins missing from the workbook get 'Unknown'
grouped_proteins = {protein: protein_to_group.get(protein, 'Unknown') for protein in prots}

# Invert the mapping: group name -> list of proteins
TCGA_grouped = defaultdict(list)
for protein, group in grouped_proteins.items():
    TCGA_grouped[group].append(protein)

# TCGA_grouped is now a dictionary with groups as keys and lists of proteins as values
print(dict(TCGA_grouped))
# Remove the 'Unknown' bucket; the default avoids a KeyError when every protein
# was found in the workbook.
TCGA_grouped.pop("Unknown", None)
TCGA_grouped.keys()

1664
WGCNA_metabolism                  375
WGCNA_ECM interaction             325
WGCNA_cytokine signaling          316
WGCNA_DNA replication             249
WGCNA_complement cascade          171
WGCNA_cell-cell communications    142
WGCNA_erythrocyte and platelet     86
Name: Highest_Group, dtype: int64
{'WGCNA_complement cascade': ['A1BG', 'A2M', 'ADH5', 'AFM', 'AGT', 'AKR1C1', 'AKR1C3', 'ALB', 'ALDH1A1', 'AMBP', 'APCS', 'APOA1', 'APOA2', 'APOA4', 'APOB', 'APOC1', 'APOC3', 'APOD', 'APOE', 'APOH', 'APOL1', 'AZGP1', 'BANF1', 'C2', 'C3', 'C4A', 'C4B', 'C5', 'C6', 'C7', 'C8A', 'C8B', 'C8G', 'C9', 'CAMK1D', 'CCDC6', 'CDC37', 'CFB', 'CFH', 'CFHR1', 'CFHR2', 'CFHR3', 'CFHR5', 'CFI', 'CHL1', 'CLEC3B', 'CLPP', 'COL14A1', 'COL15A1', 'COL18A1', 'COL2A1', 'COL4A1', 'COL4A2', 'COL6A1', 'COL6A2', 'COL6A3', 'CP', 'CPXM1', 'CRP', 'CST3', 'DEF6', 'DNASE2', 'DPT', 'EMILIN1', 'ENY2', 'EPB41L2', 'F12', 'F2', 'F9', 'FBLN5', 'FMOD', 'GALK2', 'GATM', 'GC', 'HINT1', 'HMGCS1', 'HP', 'HPR', 'HPX', 'HRG', 'HYOU

dict_keys(['WGCNA_complement cascade', 'WGCNA_cell-cell communications', 'WGCNA_metabolism', 'WGCNA_erythrocyte and platelet', 'WGCNA_cytokine signaling', 'WGCNA_DNA replication', 'WGCNA_ECM interaction'])

In [7]:

# NOTE: this cell repeats the grouping step at the end of the previous cell and
# relies on `protein_to_group` built there.
df = pd.read_csv("HGSOC_TCGA_main.csv",header=0,low_memory=False)

# Protein columns are the upper-case column names of the main table
prots = get_all_prots(df)

# Assign each protein to its group; proteins without a mapping get 'Unknown'
grouped_proteins = {protein: protein_to_group.get(protein, 'Unknown') for protein in prots}

# Invert the mapping: group name -> list of proteins
# (defaultdict is already imported at the top of the notebook)
TCGA_grouped = defaultdict(list)
for protein, group in grouped_proteins.items():
    TCGA_grouped[group].append(protein)

# TCGA_grouped is now a dictionary with groups as keys and lists of proteins as values
print(dict(TCGA_grouped))
# Remove the 'Unknown' bucket; the default avoids a KeyError when every protein
# has a known group.
TCGA_grouped.pop("Unknown", None)
TCGA_grouped.keys()

{'WGCNA_complement cascade': ['A1BG', 'A2M', 'ADH5', 'AFM', 'AGT', 'AKR1C1', 'AKR1C3', 'ALB', 'ALDH1A1', 'AMBP', 'APCS', 'APOA1', 'APOA2', 'APOA4', 'APOB', 'APOC1', 'APOC3', 'APOD', 'APOE', 'APOH', 'APOL1', 'AZGP1', 'BANF1', 'C2', 'C3', 'C4A', 'C4B', 'C5', 'C6', 'C7', 'C8A', 'C8B', 'C8G', 'C9', 'CAMK1D', 'CCDC6', 'CDC37', 'CFB', 'CFH', 'CFHR1', 'CFHR2', 'CFHR3', 'CFHR5', 'CFI', 'CHL1', 'CLEC3B', 'CLPP', 'COL14A1', 'COL15A1', 'COL18A1', 'COL2A1', 'COL4A1', 'COL4A2', 'COL6A1', 'COL6A2', 'COL6A3', 'CP', 'CPXM1', 'CRP', 'CST3', 'DEF6', 'DNASE2', 'DPT', 'EMILIN1', 'ENY2', 'EPB41L2', 'F12', 'F2', 'F9', 'FBLN5', 'FMOD', 'GALK2', 'GATM', 'GC', 'HINT1', 'HMGCS1', 'HP', 'HPR', 'HPX', 'HRG', 'HYOU1', 'IDUA', 'IGLL5', 'ITIH1', 'ITIH2', 'ITIH3', 'ITIH4', 'KCTD21', 'KLKB1', 'KNG1', 'LAMB1', 'LAMB2', 'LAMC1', 'LBP', 'LRG1', 'LTF', 'MAOA', 'MAOB', 'MAP2', 'MARCKSL1', 'MATN2', 'METTL7A', 'MFGE8', 'MINPP1', 'NCAM1', 'NCDN', 'NF1', 'NID1', 'NID2', 'NSDHL', 'NTMT1', 'NUDT5', 'OAT', 'OGN', 'OLFML1', 'ORM1'

dict_keys(['WGCNA_complement cascade', 'WGCNA_cell-cell communications', 'WGCNA_metabolism', 'WGCNA_erythrocyte and platelet', 'WGCNA_cytokine signaling', 'WGCNA_DNA replication', 'WGCNA_ECM interaction'])

In [8]:
# Flatten the grouped mapping into one list of proteins, keeping the group order
# and the order of proteins inside each group.
TCGA_flat_1k = [
    protein
    for group_members in TCGA_grouped.values()
    for protein in group_members
]

## Platinum-response grouping (~500 proteins):

In [9]:
# Build {mechanism -> [gene symbols]} from the platinum-resistance gene table,
# keeping only genes that exist as columns in the main data table.
# NOTE(review): both paths are absolute and machine-specific.
plt_genes = pd.read_excel("/mnt/ncshare/ozkilim/BRCA/data/HGSOC_processed_data/plat_res_prots.xlsx",header=2)
main_data_df = pd.read_csv("/mnt/ncshare/ozkilim/BRCA/HGSOC_platinum_responce/HGSOC_TCGA_main.csv")

main_data_columns = set(main_data_df.columns)
# NOTE(review): the trailing "a" in the second column name looks like a footnote
# marker carried over from the spreadsheet header -- it must match the file exactly.
df = plt_genes[["HUGO Gene symbol","Putative mechanism associated with Pt-resistancea"]]

print(len(df))
# Keep only genes present in the main table. .copy() gives an independent frame so
# the column assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['HUGO Gene symbol'].isin(main_data_columns)].copy()
print(len(df))

# Split the comma-separated mechanism string into a list per gene
df['Putative mechanism associated with Pt-resistancea'] = df['Putative mechanism associated with Pt-resistancea'].str.split(', ')

# Create a dictionary to hold the mechanisms and their associated proteins
mechanism_dict = {}

# Populate the dictionary
for gene_symbol, mechanisms in zip(
    df['HUGO Gene symbol'],
    df['Putative mechanism associated with Pt-resistancea'],
):
    # .str.split leaves missing cells as NaN (not a list); skip those genes
    if not isinstance(mechanisms, list):
        continue
    for mechanism in mechanisms:
        mechanism = mechanism.strip()  # Remove any leading/trailing whitespace
        mechanism_dict.setdefault(mechanism, []).append(gene_symbol)

# One column per mechanism; shorter columns are padded with NaN.
# mechanism_dict already holds the unpadded lists, so it is not rebuilt from this frame.
mechanism_df = pd.DataFrame({k: pd.Series(v) for k, v in mechanism_dict.items()})


937
587


  exec(code_obj, self.user_global_ns, self.user_ns)


### integrated selected pathways and protein signature grouping (IPS)

In [10]:

# Build the gene-set dictionary: selected C2 pathways plus the hallmark DNA-repair
# set, restricted to proteins present in the main table, plus the 60-protein signature.
# NOTE(review): all paths are absolute and machine-specific.
hallmarks = pd.read_csv("/mnt/ncshare/ozkilim/BRCA/data/HGSOC_processed_data/Hallmark_pathways.txt",sep='\t', on_bad_lines='skip')
c2 = pd.read_csv("/mnt/ncshare/ozkilim/BRCA/data/HGSOC_processed_data/selectedC2_pathways.txt",sep='\t', on_bad_lines='skip')

# Select C2 gene sets by substring of their name. na=False treats a missing
# gs_name as "no match" instead of producing a NaN mask that cannot index the frame.
substring = 'REPAIR'
filtered_c2 = c2[c2['gs_name'].str.contains(substring, na=False)]
substring = "BRCA"
filtered_c_BRCA = c2[c2['gs_name'].str.contains(substring, na=False)]
substring = "HOMOLO"
filtered_c_recom = c2[c2['gs_name'].str.contains(substring, na=False)]

# Keep only the DNA-repair hallmark set
substring = "DNA_REPAIR"
hallmarks = hallmarks[hallmarks['gs_name'].str.contains(substring, na=False)]

selected = pd.concat([filtered_c2, filtered_c_BRCA, hallmarks, filtered_c_recom], axis=0)
# A gene-set name can match more than one of the substrings above, in which case
# its rows appear several times after the concat. Keep each (gene set, gene) pair
# once so no gene is listed twice inside a gene set.
selected = selected.drop_duplicates(subset=["gs_name", "gene_symbol"])

df = pd.read_csv("/mnt/ncshare/ozkilim/BRCA/HGSOC_platinum_responce/HGSOC_TCGA_main.csv",header=0,low_memory=False)
prots = get_all_prots(df)
print(len(prots))
print(len(selected))
# Restrict the gene sets to proteins that are columns of the main table
selected = selected[selected['gene_symbol'].isin(prots)]
print(len(selected))

# Same 60 proteins as `chowdery_60` defined earlier in the notebook; copy the list
# rather than repeating the literal.
# NOTE(review): unlike the pathways, the signature is not filtered against `prots`.
chowduhry_signature = list(chowdery_60)

# gene-set name -> list of gene symbols
gs_dict = {gs_name: group["gene_symbol"].tolist() for gs_name, group in selected.groupby("gs_name")}
gs_dict["chowduhry_signature"] = chowduhry_signature

7567
2504
1604


In [11]:
# Print the number of proteins in each gene set of gs_dict.
# The bare expression below is not the last statement of the cell, so it displays nothing.
gs_dict
import torch 
# Not filled anywhere in this cell -- presumably populated in a later cell; verify.
genomic_feats = []
# Create list of vectors for MCAT. 
# One line of output per gene set, in gs_dict insertion order
for selected_prots in gs_dict.values():
    print(len(selected_prots))

# TODO: understand why the shape is 16 rather than the expected 14.

14
133
25
14
21
8
50
35
9
1
14
5
92
208
21
82
35
24
138
14
31
105
20
7
6
79
74
80
25
21
90
8
7
50
58
60


In [12]:
# proteomics_combinations = {'chowdery_60_flat':chowdery_60,'chowdery_60_grouped':chowdery_60_grouped,'TCGA_flat_1k':TCGA_flat_1k,'TCGA_grouped_1k':TCGA_grouped,'plat_response_pathways':mechanism_dict,'IPS_pathways':gs_dict}
# # Save to a JSON file
# with open('proteomics_combinations.json', 'w') as file:
#     json.dump(proteomics_combinations, file)