In [1]:
import os
import sys
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import torch
from sklearn.model_selection import KFold

In [2]:
mrna_data = pd.read_table("data/TCGA_mrna_counts_match_iso.tsv", sep='\t', index_col=[0])
mirna_data = pd.read_table("data/TCGA_mirna_counts_match_iso.tsv", sep='\t', index_col=[0])

In [14]:
def generate_correlated_miRNA(mRNA_data, n_miRNA=2500, corr_range=(0.6, 0.9), scale_range=(0.01, 0.1), seed=42):
    n_sample, n_gene = mRNA_data.shape
    
    np.random.seed(seed=seed)

    # Extract color and cancer_type columns
    color_column = mRNA_data.iloc[:, -2]
    cancer_type_column = mRNA_data.iloc[:, -1]
    
    # Remove color and cancer_type columns from mRNA data
    mRNA_data = mRNA_data.iloc[:, :-2]
    
    # Generate random correlation coefficients and scaling factors for each miRNA
    corr_coeffs = np.random.uniform(corr_range[0], corr_range[1], size=n_miRNA)
    scale_factors = np.random.uniform(scale_range[0], scale_range[1], size=n_miRNA)
    
    # Initialize miRNA expression matrix
    miRNA_data = np.zeros((n_sample, n_miRNA))
    
    # Generate correlated miRNA expression values
    for i in range(n_miRNA):
        # Randomly select a gene from mRNA data
        gene_idx = np.random.randint(0, n_gene)
        mRNA_expr = mRNA_data.iloc[:, gene_idx].values
        
        # Generate correlated miRNA expression values
        miRNA_expr = corr_coeffs[i] * mRNA_expr + np.random.normal(0, 0.1, size=n_sample)
        miRNA_expr = np.maximum(miRNA_expr, 0)  # Ensure non-negative values
        
        # Scale miRNA expression values
        miRNA_expr = miRNA_expr * scale_factors[i]
        
        # Assign miRNA expression values to the matrix
        miRNA_data[:, i] = miRNA_expr
    
    # Create miRNA expression dataframe
    miRNA_df = pd.DataFrame(miRNA_data, columns=[f'miRNA_{i+1}' for i in range(n_miRNA)])
    
    # Add color and cancer_type columns to the miRNA dataframe
    miRNA_df['color'] = color_column
    miRNA_df['cancer_type'] = cancer_type_column
    
    return miRNA_df

In [15]:
mirna_toy = generate_correlated_miRNA(mrna_data, n_miRNA=200)
mirna_toy.head()

Unnamed: 0,miRNA_1,miRNA_2,miRNA_3,miRNA_4,miRNA_5,miRNA_6,miRNA_7,miRNA_8,miRNA_9,miRNA_10,...,miRNA_193,miRNA_194,miRNA_195,miRNA_196,miRNA_197,miRNA_198,miRNA_199,miRNA_200,color,cancer_type
0,141.159461,0.673873,0.889697,0.0,1208.559049,0.038616,41.301532,409.270529,48.164125,210.071634,...,159.896517,34.710599,223.028069,12.920881,0.867411,0.003698,37.083427,0.322961,,
1,76.822931,60.03831,0.372628,0.029229,651.51141,0.012874,52.131604,132.094907,42.128394,133.484111,...,70.503626,34.273961,32.210285,6.209429,0.046426,0.004453,28.071058,0.209921,,
2,62.169403,16.344252,0.372707,0.191939,1073.968765,0.036245,21.4299,240.368587,180.402272,215.226427,...,134.14607,33.681063,98.569061,20.584254,0.997438,0.0,126.25613,0.958382,,
3,58.731777,6.536708,0.565245,0.238131,1095.076258,0.024701,28.841123,105.70764,314.856082,255.79913,...,125.743513,30.53707,81.566764,9.141672,5.246073,0.002294,19.492799,4.555517,,
4,17.256123,6.98069,3.477388,0.028136,917.602652,0.001621,21.853396,81.037624,364.319372,72.946225,...,78.130225,12.810855,34.535599,4.508193,0.050976,0.005889,34.544521,0.208045,,


In [13]:
mrna_data.head()

Unnamed: 0,ENSG00000186092.4,ENSG00000278566.1,ENSG00000273547.1,ENSG00000187634.11,ENSG00000188976.10,ENSG00000187961.13,ENSG00000187583.10,ENSG00000187642.9,ENSG00000188290.10,ENSG00000187608.8,...,ENSG00000188120.14,ENSG00000205944.11,ENSG00000172352.5,ENSG00000183795.8,ENSG00000187191.14,ENSG00000205916.11,ENSG00000185894.8,ENSG00000172288.7,color,cancer_type
TCGA-EW-A6SA-01A-21R-A32P-07,0.0,0.0,1.0,1413.0,9723.0,1310.0,216.0,84.0,972.0,834.0,...,6.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,#ED1E91,BRCA
TCGA-E2-A14W-01A-11R-A12D-07,0.0,0.0,0.0,1562.0,6455.0,299.0,489.0,249.0,69.0,538.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#ED1E91,BRCA
TCGA-EW-A1PD-01A-11R-A144-07,0.0,0.0,1.0,912.0,7050.0,439.0,86.0,23.0,357.0,1281.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,#ED1E91,BRCA
TCGA-55-1594-01A-01R-0946-07,0.0,0.0,0.0,501.0,4238.0,459.0,29.0,13.0,134.0,1300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#D2C3DF,LUAD
TCGA-49-6742-11A-01R-1858-07,0.0,0.0,0.0,321.0,1791.0,93.0,42.0,18.0,113.0,831.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,#D2C3DF,LUAD
