In [6]:
import numpy as np
import pandas as pd
from scoit import sc_multi_omics
import time
from scipy.sparse import csr_matrix
import seaborn as sns
import hypernetx as hnx
from enum import Enum, unique

def load_data(data_dir="data/sc_GEM"):
    """Load the sc_GEM expression/methylation matrices and integer stage labels.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing ``expression_data.csv``, ``methylation_data.csv``
        and ``cell_stage.csv``. Defaults to the previously hard-coded
        ``data/sc_GEM`` so existing zero-argument calls keep working.

    Returns
    -------
    expression_data : numpy.ndarray
        Contents of ``expression_data.csv`` (first CSV column used as index).
    methylation_data : numpy.ndarray
        Contents of ``methylation_data.csv`` (first CSV column used as index).
    labels : list of int
        One integer code per recognised stage name, in file order:
        BJ=0, d8=1, d16T-/d16T+=2, d24T-/d24T+=3, IPS=4, ES=5.

    Warns
    -----
    UserWarning
        If some stage names are not recognised. Such entries are skipped (as
        before), which makes ``labels`` shorter than the stage list.
    """
    import os
    import warnings

    stage_to_label = {
        "BJ": 0,
        "d8": 1,
        "d16T-": 2,
        "d16T+": 2,
        "d24T-": 3,
        "d24T+": 3,
        "IPS": 4,
        "ES": 5,
    }

    def read_matrix(file_name):
        # The first CSV column is consumed as the index, not as data.
        return np.array(pd.read_csv(os.path.join(data_dir, file_name), index_col=0))

    expression_data = read_matrix("expression_data.csv")
    methylation_data = read_matrix("methylation_data.csv")

    # The stage file has no header; only its first row is used as the stage names.
    cell_stage = np.array(
        pd.read_csv(os.path.join(data_dir, "cell_stage.csv"), header=None)
    )[0]

    labels = [stage_to_label[stage] for stage in cell_stage if stage in stage_to_label]

    # Unknown names used to vanish silently; surface them so a misaligned
    # label list does not go unnoticed.
    unknown = sorted({str(stage) for stage in cell_stage if stage not in stage_to_label})
    if unknown:
        warnings.warn(
            "Skipping unrecognised cell stage(s): " + ", ".join(unknown),
            stacklevel=2,
        )

    return expression_data, methylation_data, labels


In [7]:
@unique
class NodeType(Enum):
    """Category of a node id; this is what ``get_node_type`` returns.

    ``@unique`` makes the class definition fail if two members share a value.
    """

    # Node representing one omics layer.
    OMICS = 0
    # Node representing one gene (feature column).
    GENES = 1
    # Node representing one cell (feature row).
    CELLS = 2


In [8]:
# Load both omics matrices and time how long loading + stacking takes.
start_time = time.time()
expression_data, methylation_data, labels = load_data()
# np.stack requires both matrices to have the same shape and raises a clear
# ValueError otherwise, instead of building a ragged/object array.
data = np.stack([expression_data, methylation_data])
print(data.shape)
# Elapsed wall-clock seconds (displayed as the cell result).
time.time() - start_time

(2, 224, 59)


0.01871657371520996

In [15]:
# Fraction of entries in the expression matrix that are non-zero.
np.count_nonzero(expression_data) / expression_data.size

0.6524667070217918

In [16]:
# Fraction of entries in the methylation matrix that are non-zero.
np.count_nonzero(methylation_data) / methylation_data.size

0.8285411622276029

In [17]:
# Node counts per category. Rows of the feature matrices are treated as cells
# and columns as genes.
# NOTE(review): num_genes adds up the columns of both matrices, while
# extract_hyperedges_from_feature_matrix offsets the columns of every omics by
# the same genes_offset -- confirm whether genes are meant to be shared.
num_omics = 2
num_cells = methylation_data.shape[0]
num_genes = expression_data.shape[1] + methylation_data.shape[1]

total_nodes = num_cells + num_genes + num_omics
# One edge per non-zero matrix entry, summed over both omics.
total_edges = sum(
    np.count_nonzero(matrix) for matrix in (expression_data, methylation_data)
)

# Edge count relative to the squared node count.
print(total_edges / total_nodes**2)
num_cells, num_genes, num_omics, total_nodes, total_edges

0.1654019064359113


(224, 118, 2, 344, 19573)

In [18]:
# Node ids are laid out as consecutive bands: [omics | genes | cells].
# Each offset is the first node id of its band.
omics_offset = 0
genes_offset = omics_offset + num_omics
cells_offset = genes_offset + num_genes
omics_offset, genes_offset, cells_offset

(0, 2, 120)

In [11]:
def get_node_type(node_id: int):
    """Map a global node id to the ``NodeType`` of the band it falls in.

    The bands are half-open intervals read from the notebook-level offsets:
    ``[omics_offset, genes_offset)`` is OMICS, ``[genes_offset, cells_offset)``
    is GENES and ``[cells_offset, total_nodes)`` is CELLS.

    Raises
    ------
    ValueError
        If ``node_id`` lies outside all three bands.
    """
    bands = (
        (omics_offset, genes_offset, NodeType.OMICS),
        (genes_offset, cells_offset, NodeType.GENES),
        (cells_offset, total_nodes, NodeType.CELLS),
    )
    for lower, upper, kind in bands:
        if lower <= node_id < upper:
            return kind
    raise ValueError(node_id)

In [25]:
# Edge format: (Omic, Cell, Gene, Weight)
def extract_hyperedges_from_feature_matrix(feature_matrix, omics_id, return_dict=True):
    """Lazily yield one hyperedge per non-zero entry of ``feature_matrix``.

    Parameters
    ----------
    feature_matrix : array indexed as ``[row, col]``
        Rows are treated as cells and columns as genes.
    omics_id : int
        Index of the omics layer; shifted by ``omics_offset`` to get its node id.
    return_dict : bool, optional
        If truthy, yield dicts keyed ``omics_node``, ``cell_node``,
        ``gene_node``, ``weight``; otherwise yield plain 4-tuples in that order.

    Notes
    -----
    Row and column indices are converted to global node ids with the
    notebook-level ``cells_offset`` and ``genes_offset``. NaN entries count as
    non-zero, so they produce edges whose weight is NaN.
    """
    field_names = ("omics_node", "cell_node", "gene_node", "weight")
    omics_node = omics_id + omics_offset
    # argwhere lists the (row, col) index pairs of non-zero entries in row-major order.
    for row, col in np.argwhere(feature_matrix):
        edge = (
            omics_node,
            row + cells_offset,
            col + genes_offset,
            feature_matrix[row, col],
        )
        if return_dict:
            yield dict(zip(field_names, edge))
        else:
            yield edge

In [27]:
from itertools import chain

# One row per hyperedge. The omics id is the position in the tuple below:
# 0 = expression, 1 = methylation.
df = pd.DataFrame.from_records(chain.from_iterable(
    extract_hyperedges_from_feature_matrix(matrix, omics_id)
    for omics_id, matrix in enumerate((expression_data, methylation_data))))
df

Unnamed: 0,omics_node,cell_node,gene_node,weight
0,0,120,2,
1,0,120,4,
2,0,120,5,
3,0,120,8,
4,0,120,11,
...,...,...,...,...
19568,1,343,56,
19569,1,343,57,
19570,1,343,58,
19571,1,343,59,1.0


In [29]:
# Boolean mask over the hyperedge rows: True where the weight is NaN.
weight_nan_idx = df["weight"].isna()

In [33]:
# Fraction of hyperedges whose weight is NaN. The mean of a boolean mask is
# the share of True values; it is computed vectorised rather than by iterating
# the Series element by element with the builtin sum().
weight_nan_idx.mean()

0.6523271854084709

In [35]:
# (number of hyperedges with a NaN weight, total number of hyperedges)
int(df["weight"].isna().sum()), df.shape[0]

(12768, 19573)

In [36]:
# Persist the hyperedge table, NaN weights included; the row index is written too.
df.to_csv(path_or_buf='HE_sc_GEM_with_na.csv')