In [1]:
import os
import random
from copy import deepcopy

import pandas as pd
# Data directory. Defaults to the original local path, but can be redirected
# with the GRAPH_OMICS_DATA_DIR environment variable so the notebook is not
# tied to one machine. Joining with "" guarantees a trailing separator, which
# the other cells rely on when they build file paths as `base + filename`.
base = os.path.join(
    os.environ.get(
        "GRAPH_OMICS_DATA_DIR",
        "/home/colombelli/Documents/datasets/graph-omics/data/",
    ),
    "",
)

In [2]:
# Read the three processed matrices from the data directory, using the first
# CSV column as the row index of each frame.
gene, mirna, meth = (
    pd.read_csv(base + filename, index_col=0)
    for filename in ("gene_proc.csv", "mirna_proc.csv", "meth_proc.csv")
)

In [32]:
def convert_matrix_to_list(df, features):
    """Reshape a samples-by-features matrix into a long table.

    The result has one row per (feature, sample) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. Index labels are dash-separated strings; the last
        dash-separated field is dropped to form ``sample_id``. Must contain a
        ``class`` column: 0 is reported as "normal", any other value as
        "tumoral".
    features : iterable of str
        Column names of ``df`` to export. A name of the form ``"symbol|id"``
        is split on ``|`` into symbol and id; a name without ``|`` gets
        ``'-'`` as its id.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample_id``, ``symbol``, ``id``, ``class`` and
        ``expression``. Rows are grouped by feature in the order given, and
        within a feature they follow the row order of ``df``. An empty
        ``features`` yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If a feature (or ``class``) is not a column of ``df``.
    """
    samples_len = len(df)
    # Strip the trailing dash-separated field from every index label.
    sid = ['-'.join(label.split('-')[0:-1]) for label in df.index]
    cls = ["normal" if value == 0 else "tumoral" for value in df['class'].values]

    adj_list = {
        'sample_id': [],
        'symbol': [],
        'id': [],
        'class': [],
        'expression': []
    }

    for feature in features:
        # Explicit length check instead of a bare `except:` around an index
        # lookup, so unrelated errors are never silently swallowed.
        parts = feature.split('|')
        identifier = parts[1] if len(parts) > 1 else '-'

        # extend() appends in place; rebuilding each list with `+` on every
        # iteration made the loop quadratic in the number of features.
        adj_list['sample_id'].extend(sid)
        adj_list['symbol'].extend([parts[0]] * samples_len)
        adj_list['id'].extend([identifier] * samples_len)
        adj_list['class'].extend(cls)
        adj_list['expression'].extend(df[feature].values)

    adj_df = pd.DataFrame.from_dict(adj_list)
    print("Number of edges: ", len(adj_df))

    return adj_df
    
    
def save_class_separately(gene, mirna, meth, number_of_features, seed=None):
    """Sample features per omics layer and save one adjacency list per class.

    For each of the three matrices, ``number_of_features`` columns are drawn
    at random, the rows are split by ``class`` (0 and 1), and each split is
    converted with ``convert_matrix_to_list`` and written to
    ``{base}{normal|tumoral}_{gene|mirna|meth}_adj_list.csv``.

    Parameters
    ----------
    gene, mirna, meth : pandas.DataFrame
        Sample-by-feature matrices with a ``class`` column. The last column
        is excluded from sampling (presumably it is ``class`` — confirm
        against the input files).
    number_of_features : int
        How many feature columns to sample from each matrix.
    seed : int or None, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        selection is reproducible. When ``None`` (default) the module-level
        ``random`` state is used, exactly as before this parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        ``[normal_gene, tumoral_gene, normal_mirna, tumoral_mirna,
        normal_meth, tumoral_meth]``.

    Raises
    ------
    ValueError
        From ``random.sample`` if ``number_of_features`` exceeds the number
        of candidate columns of a matrix.
    """
    rng = random if seed is None else random.Random(seed)

    layers = [("gene", gene), ("mirna", mirna), ("meth", meth)]

    # Draw every feature subset up front (gene, mirna, meth order) so an
    # invalid number_of_features fails before any conversion work starts.
    sampled = [
        (name, matrix, rng.sample(list(matrix.columns[:-1].values), number_of_features))
        for name, matrix in layers
    ]

    adj_lists = []
    for name, matrix, features in sampled:
        for label, class_value in (("normal", 0), ("tumoral", 1)):
            subset = matrix.loc[matrix['class'] == class_value]
            adj_lists.append(
                (f"{label}_{name}_adj_list", convert_matrix_to_list(subset, features))
            )

    # Save only after all conversions succeeded.
    print("Saving....")
    for stem, adj_df in adj_lists:
        adj_df.to_csv(f"{base}{stem}.csv")

    return [adj_df for _, adj_df in adj_lists]

In [33]:
# Build and save the six per-class adjacency lists, sampling 500 feature
# columns from each of the three matrices.
dfs = save_class_separately(gene, mirna, meth, number_of_features=500)

Number of edges:  36000
Number of edges:  267000
Number of edges:  35500
Number of edges:  127500
Number of edges:  12000
Number of edges:  228000
Saving....


In [50]:
# Index 2 of the list returned by save_class_separately is the normal-class
# miRNA adjacency list (return order: gene, mirna, meth; normal before tumoral).
dfs[2]

Unnamed: 0,sample_id,symbol,id,class,expression
0,TCGA-A3-3358,hsa-miR-6877-5p,MIMAT0027654,normal,
1,TCGA-A3-3387,hsa-miR-6877-5p,MIMAT0027654,normal,
2,TCGA-B0-4700,hsa-miR-6877-5p,MIMAT0027654,normal,
3,TCGA-B0-4712,hsa-miR-6877-5p,MIMAT0027654,normal,
4,TCGA-B0-5402,hsa-miR-6877-5p,MIMAT0027654,normal,
...,...,...,...,...,...
35495,TCGA-CZ-5985,hsa-miR-6805-5p,MIMAT0027510,normal,
35496,TCGA-CZ-5986,hsa-miR-6805-5p,MIMAT0027510,normal,
35497,TCGA-CZ-5987,hsa-miR-6805-5p,MIMAT0027510,normal,
35498,TCGA-CZ-5988,hsa-miR-6805-5p,MIMAT0027510,normal,


In [37]:
# Load the tab-separated file ppi_processed.txt from the data directory.
ppi = pd.read_csv(base+"ppi_processed.txt", sep='\t')

In [38]:
# Number of rows in the table loaded from ppi_processed.txt.
len(ppi)

11759454

In [41]:
# NOTE(review): `meth_adj_list` is not assigned in any cell visible here; the
# methylation results are only available as dfs[4] (normal) and dfs[5]
# (tumoral). This cell presumably relies on kernel state left over from an
# earlier, since-edited cell and would raise NameError after
# Restart Kernel -> Run All. TODO confirm and replace or remove.
meth_adj_list

Unnamed: 0,sample_id,symbol,id,class,expression
0,TCGA-3Z-A93Z-01,RDX,-,tumoral,0.196393
1,TCGA-6D-AA2E-01,RDX,-,tumoral,0.236285
2,TCGA-A3-3357-11,RDX,-,tumoral,0.201089
3,TCGA-A3-3357-01,RDX,-,tumoral,0.191568
4,TCGA-A3-3358-01,RDX,-,tumoral,0.197600
...,...,...,...,...,...
239995,TCGA-MM-A563-01,HRK,-,tumoral,0.203289
239996,TCGA-MM-A564-01,HRK,-,tumoral,0.187076
239997,TCGA-MM-A84U-01,HRK,-,tumoral,0.213330
239998,TCGA-MW-A4EC-01,HRK,-,tumoral,0.193401
