# 1- Hypergraph Incidence Matrix

- Here, we will load the incidence matrix, features, and labels to construct the data needed for training.

In [None]:
import pandas as pd
import numpy as np
import pickle
import random
from scipy.sparse import coo_matrix


from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

import torch

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Cancer types handled throughout the notebook: the individual cancer codes
# first, then the combined 'pancancer' entry last.
cancers = [
    'BLCA', 'BRCA', 'LUAD', 'HNSC', 'ESCA', 'CESC',
    'PRAD', 'STAD', 'LIHC', 'THCA', 'LUSC',
] + ['pancancer']

In [None]:
# Load the incidence matrix (stored as COO components in an .npz archive)
# into a sparse DataFrame with genes as rows and pathways as columns.
# The archive is opened in a `with` block so its file handle is released as
# soon as the arrays have been read into memory.
with np.load("/content/incidence_matrix__FI_coo13560.npz") as loaded_coo:
    incidence_matrix_coo = coo_matrix(
        (loaded_coo["data"], (loaded_coo["row"], loaded_coo["col"])),
        # pass the shape as a plain tuple of ints rather than an ndarray
        shape=tuple(int(dim) for dim in loaded_coo["shape"]),
    )
    gene_names = loaded_coo["genes"]
    pathway_names = loaded_coo["pathways"]

incidence_df = pd.DataFrame.sparse.from_spmatrix(incidence_matrix_coo, index=gene_names, columns=pathway_names)
genelist = incidence_df.index.tolist()
incidence_df

Unnamed: 0,BIOCARTA_41BB_PATHWAY,BIOCARTA_ACE2_PATHWAY,BIOCARTA_ACETAMINOPHEN_PATHWAY,BIOCARTA_ACH_PATHWAY,BIOCARTA_ACTINY_PATHWAY,BIOCARTA_AGPCR_PATHWAY,BIOCARTA_AGR_PATHWAY,BIOCARTA_AHSP_PATHWAY,BIOCARTA_AKAP13_PATHWAY,BIOCARTA_AKAP95_PATHWAY,...,WP_VITAMIN_D_RECEPTOR_PATHWAY,WP_WARBURG_EFFECT_MODULATED_BY_DEUBIQUITINATING_ENZYMES_AND_THEIR_SUBSTRATES,WP_WHITE_FAT_CELL_DIFFERENTIATION,WP_WNTBETACATENIN_SIGNALING_INHIBITORS_IN_CURRENT_AND_PAST_CLINICAL_TRIALS,WP_WNTBETACATENIN_SIGNALING_IN_LEUKEMIA,WP_WNT_SIGNALING_AND_PLURIPOTENCY,WP_WNT_SIGNALING_IN_KIDNEY_DISEASE,WP_WNT_SIGNALING_WP363,WP_WNT_SIGNALING_WP428,WP_ZINC_HOMEOSTASIS
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A3GALT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A4GALT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWILCH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZWINT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYG11B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 2- Core omics features

In [None]:
def process_features(file_paths, incidence_df):
    """
    Load the preprocessed omics matrices, align them to the network genes and
    combine them into a single normalized feature matrix.

    Each file is read as a tab-separated table whose first column holds the
    row names; the remaining column names are upper-cased. Only the columns
    shared by every table are kept. Rows are re-indexed to the index of
    ``incidence_df`` (rows missing from a table are filled with 0), absolute
    values are taken, and every column is scaled with ``MinMaxScaler``.

    Parameters:
        file_paths (dict): Dictionary with keys as dataset names ('expression', 'meth', 'mutation')
                           and values as file paths to the datasets.
        incidence_df (pd.DataFrame): Frame whose index lists the genes (nodes)
                           of the network; only its index is used.

    Returns:
        pd.DataFrame: A combined, normalized features matrix indexed like
        ``incidence_df``. Columns are named "<DATASET>_<COLUMN>", grouped by
        dataset in ``file_paths`` order, with the shared columns in sorted
        order inside each group.

    Raises:
        ValueError: If the datasets do not share any column.
    """
    # Read one preprocessed omics feature matrix, indexed by its first column
    def read_and_process(file_path):
        df = pd.read_csv(file_path, sep='\t')
        df.columns = ['Name'] + [col.upper() for col in df.columns[1:]]
        return df.set_index('Name')

    datasets = {name: read_and_process(path) for name, path in file_paths.items()}

    # Nodes of the network, in the row order the features must follow
    ppi_index = incidence_df.index.tolist()

    # Find common cancer types across all datasets.
    # sorted(): iterating a set of strings can give a different order in each
    # interpreter session (string hash randomization), which would shuffle
    # the feature columns between kernel restarts.
    common_ctypes = sorted(
        set.intersection(*(set(df.columns) for df in datasets.values()))
    )
    if not common_ctypes:
        raise ValueError("The datasets in file_paths do not share any column.")

    # Filter datasets by common cancer types
    datasets = {name: df[common_ctypes] for name, df in datasets.items()}

    # Report how many rows of each matrix are nodes of the network
    for key, label in (('mutation', 'mutation'), ('expression', 'expression'), ('meth', 'methylation')):
        if key in datasets:
            n_nodes = int(datasets[key].index.isin(ppi_index).sum())
            print(f'Number of genes in {label} matrix: {n_nodes}')

    # Align every dataset to the network nodes; nodes without data get 0
    reindexed_datasets = {name: df.reindex(ppi_index, fill_value=0) for name, df in datasets.items()}

    # Normalize with MinMax (column-wise, on absolute values)
    normalized_datasets = {
        name: pd.DataFrame(
            MinMaxScaler().fit_transform(np.abs(df)),
            index=df.index,
            columns=[f"{name.upper()}_{col}" for col in df.columns]
        )
        for name, df in reindexed_datasets.items()
    }

    # Combine datasets into a single feature matrix
    core_features = pd.concat(normalized_datasets.values(), axis=1)

    return core_features

 
# Locations of the preprocessed TCGA omics tables, keyed by dataset name.
file_paths = dict(
    expression="/content/drive/MyDrive/OncoPlex/cancer_data/TCGA/processed/expression_mean_logfold.tsv",
    meth="/content/drive/MyDrive/OncoPlex/cancer_data/TCGA/processed/methylation_genecancer.tsv",
    mutation="/content/drive/MyDrive/OncoPlex/cancer_data/TCGA/processed/mutation_genecancer.tsv",
)

# Build the combined feature matrix and save it as a tab-separated file.
core_features = process_features(file_paths=file_paths, incidence_df=incidence_df)
core_features.to_csv(
    '/content/drive/MyDrive/OncoPlex/cancer_data/TCGA/processed/core_features.tsv',
    sep='\t',
)

core_features.head()

Number of genes in mutation matrix: 12648
Number of genes in expression matrix: 13314
Number of genes in methylation matrix: 11671


Unnamed: 0_level_0,EXPRESSION_LIHC,EXPRESSION_PRAD,EXPRESSION_KIRC,EXPRESSION_COAD,EXPRESSION_KIRP,EXPRESSION_STAD,EXPRESSION_READ,EXPRESSION_UCEC,EXPRESSION_BRCA,EXPRESSION_HNSC,...,MUTATION_READ,MUTATION_UCEC,MUTATION_BRCA,MUTATION_HNSC,MUTATION_CESC,MUTATION_ESCA,MUTATION_THCA,MUTATION_LUSC,MUTATION_BLCA,MUTATION_LUAD
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.057044,0.069536,0.093236,0.025391,0.047868,0.05059,0.022431,0.026581,0.042643,0.085544,...,0.011118,0.046182,0.008502,0.019984,0.030481,0.0,0.0,0.00933,0.036646,0.025328
A1CF,0.094731,0.282134,0.003123,0.121657,0.204754,0.075999,0.076384,0.094288,0.013237,0.038985,...,0.007343,0.027396,0.024429,0.021146,0.044913,0.0,0.0,0.036181,0.017955,0.042958
A2M,0.097015,0.037951,0.023759,0.131797,0.110909,0.002152,0.100596,0.178535,0.129277,0.073737,...,0.052003,0.092203,0.027949,0.014702,0.047025,0.023726,0.003467,0.037269,0.064517,0.073051
A3GALT2,0.068231,0.029517,0.251819,0.006947,0.007957,0.008796,0.042488,0.012469,0.037956,0.067577,...,0.0,0.00201,0.0,0.0,0.020544,0.005985,0.0,0.001884,0.004654,0.002859
A4GALT,0.088318,0.087904,0.07128,0.046637,0.044104,0.086685,0.055393,0.044437,0.022587,0.00885,...,0.0,0.028588,0.005652,0.002511,0.020172,0.005925,0.0,0.007239,0.004634,0.0


In [None]:
# For every entry of `cancers`, store its feature tensor and its gene list.
dataset_OncoPlex = {}
for cancer in cancers:

    if cancer != 'pancancer':
        # Cancer-specific: keep the columns whose name contains the cancer code.
        matching_columns = [col for col in core_features.columns if cancer in col]
        cancer_features = core_features[matching_columns]
    else:
        # Pancancer: use every column of the core feature matrix.
        cancer_features = core_features

    nodes = cancer_features.index.tolist()

    print(f"Features for {cancer}:\n", cancer_features.head())
    print(f"Number of genes in {cancer}: {len(nodes)}")

    cancer_entry = dataset_OncoPlex.setdefault(cancer, {})
    cancer_entry['core_features'] = torch.tensor(cancer_features.values, dtype=torch.float32)
    cancer_entry['nodes'] = nodes

# 3- Labels

In [None]:
def get_labels(genes, cancer_type):
    """
    Build the driver / non-driver labels of `genes` for one cancer type.

    Positive genes are read from '/content/positive_pancancer.txt' when
    `cancer_type` is 'pancancer', or from
    '/content/Positive_<cancer_type>_driver.txt' for the supported cancer
    codes; any other `cancer_type` yields no positive genes. Negative genes
    are read from '/content/negative_nondriver.csv'. Only genes that occur in
    `genes` are kept.

    A gene listed in both files is kept as a driver and removed from the
    non-drivers. Otherwise it would occur twice in `sample_indices` with the
    contradictory labels 1 and 0 while `labels` held 0.
    NOTE(review): confirm that "driver wins" is the intended precedence.

    Parameters:
        genes: sequence of gene names; positions in this sequence are the
            indices that are returned.
        cancer_type (str): 'pancancer' or a cancer code such as 'BRCA'.

    Returns:
        tuple: (sample_indices, sample_labels, labels, driver_genes, nondriver_genes)
            sample_indices (np.ndarray): positions in `genes` of the drivers
                followed by the non-drivers.
            sample_labels (np.ndarray): 1 for each driver, 0 for each non-driver,
                in the same order as `sample_indices`.
            labels (pd.DataFrame): one row per gene, single column 0 holding
                1 (driver), 0 (non-driver) or -1 (unlabelled).
            driver_genes (list): sorted driver gene names.
            nondriver_genes (list): sorted non-driver gene names.
    """
    gene_set = set(genes)
    specific_cancers = ['BLCA', 'BRCA', 'LUAD', 'HNSC', 'ESCA', 'CESC', 'PRAD', 'STAD', 'LIHC', 'THCA', 'LUSC']

    if cancer_type == 'pancancer':
        driver_path = '/content/positive_pancancer.txt'
        driver_df = pd.read_csv(driver_path, sep='\t', header=None, names=['gene'])
        driver_genes = sorted(gene_set & set(driver_df['gene']))
        print(f"pancancer positive genes count: {len(driver_genes)}")

    elif cancer_type in specific_cancers:
        driver_path = f'/content/Positive_{cancer_type}_driver.txt'
        driver_df = pd.read_csv(driver_path, sep='\t', header=None, names=['gene'])
        driver_genes = sorted(gene_set & set(driver_df['gene']))
        print(f"[{cancer_type}] driver genes count: {len(driver_genes)}")

    else:
        driver_genes = []
        print(f"[{cancer_type}] no driver genes found")

    nondriver_df = pd.read_csv('/content/negative_nondriver.csv', index_col=None, names=['gene'])
    # Drop genes already labelled as drivers so every gene is sampled once
    nondriver_genes = sorted((gene_set & set(nondriver_df['gene'])) - set(driver_genes))
    print(f"[{cancer_type}] nondriver genes count: {len(nondriver_genes)}")

    # -1 marks genes that are neither known drivers nor known non-drivers
    labels = pd.DataFrame(data=[-1] * len(genes), index=genes)
    labels.loc[driver_genes, 0] = 1
    labels.loc[nondriver_genes, 0] = 0

    # Positions of the labelled genes inside `genes`
    driver_idx = labels.index.get_indexer(driver_genes)
    nondriver_idx = labels.index.get_indexer(nondriver_genes)

    sample_indices = np.concatenate([driver_idx, nondriver_idx])
    sample_labels = np.array([1] * len(driver_idx) + [0] * len(nondriver_idx))

    return sample_indices, sample_labels, labels, driver_genes, nondriver_genes


# Attach the full label vector (one entry per gene in `gene_names`) to the
# dataset entry of every cancer type.
for cancer in cancers:
    print(f"\nProcessing cancer type: {cancer}")
    (sample_indices, sample_labels, full_labels,
     driver_genes, nondriver_genes) = get_labels(gene_names, cancer)
    # flatten the single-column label frame into a 1-D array
    dataset_OncoPlex[cancer]['label'] = torch.from_numpy(full_labels.to_numpy().ravel())

### - Weight for Hyperedges & Train/Test split

- Now, I have some pathways (hyperedges) related to cancer biology, such as Cell_Cycle, DNA Repair, and other biological processes.

I have two ways of weighting them: either by the count of cancer genes in the pathway, so that a pathway containing more of these genes is more important; or by directly assigning a weight to a selected pathway because of its biological importance in cancer development.

## 4- Construct G from the weighted H >> N*N

In [None]:
def generate_G_from_H_weight(H, W):
    """
    Generate the propagation matrix G for HGNN from the incidence matrix H:

        G = Dv^(-1/2) * H * W * De^(-1) * H^T * Dv^(-1/2)

    where Dv holds the weighted node degrees and De the hyperedge degrees.
    Adapted from HGNN github repo: https://github.com/iMoonLab/HGNN

    Nodes with a non-positive weighted degree (isolated nodes) and hyperedges
    with zero degree (empty hyperedges) contribute nothing to G; their
    inverse degree is taken as 0 instead of inf, so no NaN can enter G.

    :param H: hypergraph incidence matrix, shape (n_nodes, n_hyperedges)
    :param W: one weight per hyperedge, shape (n_hyperedges,)
    :return: G as an np.matrix of shape (n_nodes, n_nodes)
    """
    H = np.asarray(H, dtype=float)
    W = np.asarray(W, dtype=float).ravel()

    # the degree of the node: weighted sum over the columns (hyperedges)
    DV = np.sum(H * W, axis=1)
    # the degree of the hyperedge: sum over the rows (vertices)
    DE = np.sum(H, axis=0)

    # De^-1, with 0 for empty hyperedges (avoids 1/0 = inf and 0*inf = NaN)
    inv_DE = np.zeros_like(DE)
    np.divide(1.0, DE, out=inv_DE, where=DE != 0)

    # Dv^-1/2, with 0 for isolated nodes
    inv_sqrt_DV = np.zeros_like(DV)
    has_degree = DV > 0
    inv_sqrt_DV[has_degree] = np.power(DV[has_degree], -0.5)

    # Multiplying by a diagonal matrix is a row/column scaling, so the
    # dense (n_nodes x n_nodes) and (n_edges x n_edges) diagonals are
    # never materialised.
    scaled_H = H * inv_sqrt_DV[:, None]          # Dv^-1/2 * H
    G = (scaled_H * (W * inv_DE)) @ scaled_H.T   # ... * W * De^-1 * H^T * Dv^-1/2

    # callers received an np.matrix before; keep the same return type
    return np.asmatrix(G)


For each specific cancer type:

In [None]:
# Convert the sparse incidence frame to a dense one.
# Guarded so that re-running this cell is a no-op: once the frame is dense,
# accessing `.sparse` on it raises AttributeError.
if any(isinstance(dtype, pd.SparseDtype) for dtype in incidence_df.dtypes):
    incidence_df = incidence_df.sparse.to_dense()

In [None]:
# adopted from https://github.com/genemine/DISHyper/blob/main/utils.py
def weighted_H(genes, incidence_df, cancer_type, min_positive=2, seed=42):
    """
    Split the labelled genes into train/validation/test sets and build the
    propagation matrix G from hyperedges weighted by the training positives.

    Steps:
      1. Get the labels of `genes` for `cancer_type` via `get_labels`.
      2. Stratified split: 20% test, then 25% of the remainder as validation.
      3. Count the positive training genes in every hyperedge (column of
         `incidence_df`) and keep hyperedges with at least `min_positive`.
      4. Weight each kept hyperedge by
         (positive training genes in it) / (column sum of the hyperedge).
      5. Give every node whose weighted degree is 0 a tiny membership
         (0.0001) in one randomly chosen kept hyperedge.
      6. Build G with `generate_G_from_H_weight`.

    Parameters:
        genes: gene names; assumed to be in the same order as the rows of
            `incidence_df`, because the returned indices are positions in
            `genes` while the rows of G follow `incidence_df`.
        incidence_df (pd.DataFrame): genes x hyperedges incidence matrix.
        cancer_type (str): passed on to `get_labels`.
        min_positive (int): minimum number of positive training genes a
            hyperedge needs in order to be kept (default 2).
        seed (int): seed for both splits and for the random hyperedge given
            to isolated nodes, so repeated runs produce the same G (default 42).

    Returns:
        tuple: (train_idx, val_idx, test_idx, train_label, val_label, test_label, G)

    Raises:
        ValueError: if no hyperedge reaches `min_positive` positive training genes.
    """
    # Get labels for pancancer or cancer-specific
    sample_indices, sample_labels, labels, _, _ = get_labels(genes, cancer_type)

    # Split dataset
    train_idx, test_idx, train_label, test_label = train_test_split(
        sample_indices, sample_labels, test_size=0.2, random_state=seed, stratify=sample_labels, shuffle=True
    )
    train_idx, val_idx, train_label, val_label = train_test_split(
        train_idx, train_label, test_size=0.25, random_state=seed, stratify=train_label
    )

    print(f"\n[{cancer_type}] Number of training samples:", len(train_idx))
    print(f"[{cancer_type}] Number of test samples:", len(test_idx))
    print(f"[{cancer_type}] Number of validation samples:", len(val_idx))

    # Only training positives may influence the weights (no test leakage)
    trainframe = labels.iloc[train_idx]
    positive_train = trainframe[trainframe[0] == 1].index.tolist()

    print(f"[{cancer_type}] Number of positive genes in training:", len(positive_train))

    # Sum positive genes in each hyperedge
    positiveMatrix = incidence_df.loc[positive_train].sum()

    selHyperedgeIndex = np.where(positiveMatrix >= min_positive)[0]
    print(f"[{cancer_type}] Number of selected hyperedges:", len(selHyperedgeIndex))
    if len(selHyperedgeIndex) == 0:
        raise ValueError(
            f"[{cancer_type}] no hyperedge contains at least {min_positive} positive training genes"
        )

    selHyperedge = incidence_df.iloc[:, selHyperedgeIndex]
    # .iloc: selHyperedgeIndex holds positions, while the Series is indexed by
    # hyperedge name (plain [] with integers on such a Series is deprecated)
    hyperedgeWeight = positiveMatrix.iloc[selHyperedgeIndex].values

    # Normalise the positive count by the column sum of each hyperedge
    selHyperedgeWeightSum = selHyperedge.values.sum(axis=0)
    hyperedgeWeight = hyperedgeWeight / selHyperedgeWeightSum

    # Create weighted H
    H = np.array(selHyperedge).astype(float)

    # for isolated nodes: attach each one very weakly to a random hyperedge
    rng = random.Random(seed)
    DV = np.sum(H * hyperedgeWeight, axis=1)
    for i in np.flatnonzero(DV == 0):
        H[i, rng.randrange(H.shape[1])] = 0.0001

    G = generate_G_from_H_weight(H, hyperedgeWeight)

    return train_idx, val_idx, test_idx, train_label, val_label, test_label, G

# Compute the train/val/test indices and the propagation matrix G for every
# entry of `cancers`. 'pancancer' is an entry of that list, so it is handled
# here as well; no separate assignment from leftover loop variables is
# needed (that only worked while 'pancancer' was the last entry).
for cancer in cancers:
    print(f"\nProcessing cancer type: {cancer}")
    train_idx, val_idx, test_idx, train_label, val_label, test_label, G = weighted_H(gene_names, incidence_df, cancer)
    dataset_OncoPlex[cancer]['train_idx'] = train_idx
    dataset_OncoPlex[cancer]['val_idx'] = val_idx
    dataset_OncoPlex[cancer]['test_idx'] = test_idx
    # the key is named 'edge_index' but it stores the matrix G
    dataset_OncoPlex[cancer]['edge_index'] = G



# 5- Comprehensive features

In [None]:
# Load the complete feature table (before any filtering) from the comma-separated file.
all_features = pd.read_csv('/content/All_features.csv', sep=',')
all_features

Unnamed: 0,Gene,Silent_mutations_kb,log_Total_N_missense_mutations,log_Total_N_LoF_mutations,log_Total_N_of_splicing_mutations,Missense_mutations_kb,LoF_mutations_kb,Missense_entropy,LOF_silent_ratio,Splice_silent_ratio,...,Height_of_H3K9ac_peaks,H3K9me2_peak_length,Percentage_of_broad_H3K9me2_peaks,Height_of_H3K9me2_peaks,H3K79me2_peak_length,Percentage_of_broad_H3K79me2_peaks,Height_of_H3K79me2_peaks,H4K20me1_peak_length,Percentage_of_broad_H4K20me1_peaks,Height_of_H4K20me1_peaks
0,NR2F2,48.19,6.98,4.09,0.00,100.40,12.85,0.10,0.27,0.00,...,58.57,267.50,0.00,3.12,8956.96,0.15,32.57,4530.42,0.11,9.83
1,TNRC18,21.56,8.64,5.67,1.00,44.80,5.61,0.08,0.26,0.01,...,73.22,198.00,0.00,3.61,6075.37,0.19,22.80,4401.00,0.16,14.01
2,MEIS1,30.04,6.70,3.17,1.58,73.68,5.72,0.06,0.19,0.05,...,16.98,684.88,0.00,4.11,4030.95,0.05,8.12,1695.84,0.05,8.75
3,ZEB2,51.08,8.75,5.70,1.00,115.59,13.71,0.09,0.27,0.01,...,66.04,454.62,0.03,4.32,7839.54,0.26,28.91,4185.37,0.17,21.50
4,BCL6,33.95,8.06,4.91,0.00,124.94,13.67,0.18,0.40,0.00,...,76.43,179.50,0.00,2.99,7556.35,0.32,27.94,5748.00,0.25,15.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19631,RLTPR,34.78,8.33,5.49,2.81,72.49,9.94,0.10,0.29,0.04,...,54.09,0.00,0.00,0.00,3037.00,0.03,17.89,3776.89,0.10,9.25
19632,SPTY2D1,16.03,7.14,5.04,0.00,68.03,15.55,0.07,0.91,0.00,...,65.70,223.25,0.00,3.66,9435.97,0.47,19.23,5512.28,0.28,14.39
19633,UGT2A2,0.00,2.32,1.58,0.00,2.48,1.24,0.00,0.96,0.04,...,6.20,647.00,0.03,4.31,123.25,0.00,3.38,834.22,0.01,4.35
19634,WDR47,10.06,7.63,5.17,1.00,70.76,12.57,0.06,1.16,0.04,...,61.40,235.00,0.00,3.84,6433.95,0.30,16.73,2820.68,0.09,16.28


In [None]:
# Min-max scale every numeric column of `all_features`; the non-numeric
# columns are appended unchanged afterwards.
numerical_features = all_features.select_dtypes(include=np.number)

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(numerical_features)

all_features_scaled = pd.DataFrame(
    data=scaled_features,
    index=all_features.index,
    columns=numerical_features.columns,
)

# Re-attach the columns that were left out of the scaling
non_numeric_cols = all_features.select_dtypes(exclude=np.number).columns
if len(non_numeric_cols) > 0:
    all_features_scaled = pd.concat(
        [all_features_scaled, all_features.loc[:, non_numeric_cols]], axis=1
    )

all_features_scaled


In [None]:
# Keep only the selected subset of the scaled features, grouped by category,
# plus the 'Gene' column that identifies each row.

columns_to_extract = (
    # --- Mutation features ---
    [
        'log_Total_N_LoF_mutations',
        'log_Total_N_missense_mutations',
        'Missense_mutations_kb',
        'LoF_mutations_kb',
        'Missense_damaging_benign_ratio',
        'LoF_o_e_constraint',
        'Missense_entropy',
        'Silent_fraction',
        'NonSilent_silent_ratio',
        'HiFI_missense_LoFI_missense_ratio',
        'Missense_silent_ratio',
        'Missense_benign_ratio',
        'Missense_o_e_constraint',
        'LOF_silent_ratio',
        'LOF_benign_ratio',
        'LOF_total_ratio',
        'Silent_mutations_kb',
        'LOF_missense_ratio',
        'log_Total_N_of_splicing_mutations',
        'Missense_total_ratio',
    ]
    # --- Epigenetics features ---
    + [
        'Height_of_H4K20me1_peaks',
        'H3K4me1_peak_length',
        'H3K4me2_peak_length',
        'H4K20me1_peak_length',
        'H3K4me3_peak_length',
        'H3K9ac_peak_length',
        'H3K79me2_peak_length',
        'H3K27ac_peak_length',
        'Height_of_H3K79me2_peaks',
        'Height_of_H3K36me3_peaks',
        'H3K36me3_peak_length',
        'Height_of_H3K27ac_peaks',
        'Height_of_H3K9ac_peaks',
        'Height_of_H3K4me2_peaks',
        'Height_of_H3K4me1_peaks',
        'Gene_body_hypermethylation_in_cancer',
    ]
    # --- Genomics features ---
    + [
        'RVIS_percentile',
        'ncGERP_score',
        'Exon_conservation_phastCons_score',
        'log_CDS_length',
        'Primate_dN_dS_ratio',
        'log_gene_length',
        'Missense_MGAentropy',
    ]
    # --- Phenotype feature ---
    + ['VEST_score']
    # --- Row identifier ---
    + ['Gene']
)


all_features_scaled = all_features_scaled.loc[:, columns_to_extract]
all_features_scaled

Unnamed: 0,log_Total_N_LoF_mutations,log_Total_N_missense_mutations,Missense_mutations_kb,LoF_mutations_kb,Missense_damaging_benign_ratio,LoF_o_e_constraint,Missense_entropy,Silent_fraction,NonSilent_silent_ratio,HiFI_missense_LoFI_missense_ratio,...,Gene_body_hypermethylation_in_cancer,RVIS_percentile,ncGERP_score,Exon_conservation_phastCons_score,log_CDS_length,Primate_dN_dS_ratio,log_gene_length,Missense_MGAentropy,VEST_score,Gene
0,0.309614,0.453541,0.001330,0.001674,0.004529,0.465856,0.018116,0.30,0.007161,0.009508,...,0.832432,0.2623,0.770833,0.733333,0.614833,0.0002,0.644749,0.158940,0.739130,NR2F2
1,0.429220,0.561404,0.000593,0.000731,0.005586,0.561792,0.014493,0.29,0.007365,0.004028,...,0.124324,0.5001,0.400463,0.722222,0.784689,0.0022,0.796121,0.400662,0.478261,TNRC18
2,0.239970,0.435348,0.000976,0.000745,0.004831,0.490738,0.010870,0.27,0.007900,0.007545,...,0.805405,0.3146,0.869213,0.900000,0.625000,0.0020,0.807947,0.149007,0.771739,MEIS1
3,0.431491,0.568551,0.001531,0.001786,0.006568,0.537462,0.016304,0.28,0.007747,0.005316,...,0.210811,0.1003,0.813657,0.833333,0.709330,0.0012,0.807001,0.298013,0.684783,ZEB2
4,0.371688,0.523717,0.001655,0.001780,0.010568,0.486038,0.032609,0.20,0.011570,0.006932,...,0.918919,0.2357,0.828704,0.777778,0.660885,0.0014,0.689215,0.298013,0.630435,BCL6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19631,0.415594,0.541261,0.000960,0.001295,0.004227,0.503732,0.018116,0.29,0.007416,0.004601,...,0.778378,0.0661,0.444444,0.722222,0.724282,0.0040,0.644749,0.384106,0.489130,RLTPR
19632,0.381529,0.463938,0.000901,0.002025,0.006266,0.491015,0.012681,0.16,0.014399,0.003087,...,0.767568,0.7458,0.592593,0.744444,0.658493,0.0084,0.699149,0.377483,0.336957,SPTY2D1
19633,0.119606,0.150747,0.000033,0.000161,0.003850,0.375449,0.000000,0.00,0.014017,0.002311,...,0.491892,0.9641,0.305556,0.744444,0.636962,0.0028,0.738884,0.294702,0.543478,UGT2A2
19634,0.391370,0.495776,0.000937,0.001637,0.016759,0.500415,0.010870,0.11,0.021662,0.008874,...,0.491892,0.1345,0.689815,0.833333,0.684211,0.0008,0.763009,0.231788,0.750000,WDR47


In [None]:
# The 44 comprehensive features used in the second experiment: keep the rows
# whose gene is a row of the incidence matrix, index them by gene, and put
# them in the incidence matrix's row order (genes without features get 0).
filtered_features = (
    all_features_scaled.loc[all_features['Gene'].isin(incidence_df.index)]
    .set_index("Gene")
    .reindex(incidence_df.index, fill_value=0)
)

print(filtered_features)

In [None]:
# Pancancer: the comprehensive features are stored on their own.
dataset_OncoPlex['pancancer']['comp_features'] = torch.FloatTensor(filtered_features.values)

# Every entry of `cancers`: that cancer's core-feature columns followed by the
# comprehensive features. No core column name contains 'pancancer', so for
# that entry the result is again the comprehensive features alone.
for cancer in cancers:
    matching_columns = [col for col in core_features.columns if cancer in col]
    cancer_features = core_features.loc[:, matching_columns]
    combined_features = pd.concat((cancer_features, filtered_features), axis=1)
    dataset_OncoPlex[cancer]['comp_features'] = torch.FloatTensor(combined_features.values)

    print(f"Comprehensive features for {cancer}:\n", combined_features.head())


# Persist the whole dataset dictionary
with open('/content/drive/MyDrive/OncoPlex/cancer_data/TCGA/processed/OncoPlex_dataset.pkl', 'wb') as f:
    pickle.dump(dataset_OncoPlex, f)

###  Now, we have the weighted hypergraph, the two features sets, labels, the train/val/test split for both pancancer and cancer specific in this data dict.