# DREAM5

# Data Exploration

First, I want to assert a couple of statements:
- The set of regulating genes present in the reference network is a subset of the set of transcription factors.
- There are elements of the set of transcription factors in the set of the reference network's target genes. 

We need to encapsulate the data wrangling for each source of datasets. Here (TODO: Subject to change), the source of datasets is either 'DREAM5' or 'BEELINE'.

In [1]:
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
from numpy.typing import NDArray

network_id_to_directory_name = {
    1: Path("Network 1 - in silico"),
    2: Path("Network 2 - S. aureus"),
    3: Path("Network 3 - E. coli"),
    4: Path("Network 4 - S. cerevisiae"),
}


def load_dream5(
    root: Path = Path("../data/raw/syn2787209/Gene Network Inference"),
    network_id: int = 1,
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
    assert root.exists(), f"Path {root} does not exist"
    training_data_dir = root / "training data"
    reference_network_dir = root / "test data"
    network_dir = network_id_to_directory_name[network_id]

    gene_expressions = pd.read_csv(
        training_data_dir
        / network_dir
        / f"net{network_id}_expression_data.tsv",
        sep="\t",
        dtype=np.float32,
    )
    gene_ids = pd.read_csv(
        training_data_dir / network_dir / f"net{network_id}_gene_ids.tsv",
        sep="\t",
        dtype=str,
    )
    gene_ids = dict(gene_ids.values)
    transcription_factors = pd.read_csv(
        training_data_dir
        / network_dir
        / f"net{network_id}_transcription_factors.tsv",
        sep="\t",
        header=None,
        dtype=str,
    )

    ref_network = pd.read_csv(
        reference_network_dir
        / f"DREAM5_NetworkInference_GoldStandard_{str(network_dir).replace(f'Network {network_id}', f'Network{network_id}')}.tsv",
        sep="\t",
        header=None,
        dtype={
            0: str,
            1: str,
            2: float,
        },
    )
    # Name columns in accordance to GENIE3
    ref_network.columns = ["regulator_gene", "target_gene", "ground_truth"]

    # Turn 1D DataFrame to Series and name it
    transcription_factors = transcription_factors[0]
    transcription_factors.name = "transcription_factors"
    return {
        "gene_expressions": gene_expressions,
        "transcription_factors": transcription_factors,
        "ref_network": ref_network,
        "gene_ids": gene_ids,
    }


def _get_transcription_factor_indices(
    transcription_factors: pd.Series,
) -> List[int]:
    assert len(set(transcription_factors)) == len(
        transcription_factors
    ), "Transcription factors are not unique"
    transcription_factor_indices = list(range(len(transcription_factors)))
    return transcription_factor_indices


gene_expressions, transcription_factors, ref_network, gene_ids = load_dream5(
    network_id=1
).values()
inputs: NDArray = gene_expressions.values
transcription_factor_indices = _get_transcription_factor_indices(
    transcription_factors
)

In [2]:
# Check if the regulators and targets in the reference network are a subset of the transcription factors
unique_tfs = set(transcription_factors.unique())
unique_regulators_ref_network = set(ref_network["regulator_gene"].unique())
unique_targets_ref_network = set(ref_network["target_gene"].unique())

print("Number of TFs: ", len(unique_tfs))
print(
    f"All transcription factor entries are unique: {len(unique_tfs) == len(transcription_factors)}"
)
print(
    "Number of unique regulators in ref network: ",
    len(unique_regulators_ref_network),
)
print(
    f"All regulator genes in the reference network are unique: {len(unique_regulators_ref_network) == len(ref_network['regulator_gene'].unique())}"
)
print(
    f"Set of regulator genes in the reference network is subset of set of TFs: {unique_regulators_ref_network.issubset(unique_tfs)}"
)
print(
    "Number of unique target genes in ref network: ",
    len(unique_targets_ref_network),
)

print(
    f"Set of regulator genes in the reference network that are not TFs: {unique_regulators_ref_network.difference(unique_tfs)}"
)
print(
    f"Transciption factors are present in the set of target genes: {any(unique_tfs.intersection(unique_targets_ref_network))}"
)

Number of TFs:  195
All transcription factor entries are unique: True
Number of unique regulators in ref network:  178
All regulator genes in the reference network are unique: True
Set of regulator genes in the reference network is subset of set of TFs: True
Number of unique target genes in ref network:  1565
Set of regulator genes in the reference network that are not TFs: set()
Transciption factors are present in the set of target genes: True


In [3]:
from fedgenie3.genie3.modeling import GENIE3

tree_method = "RF"
tree_init_kwargs = {
    "n_estimators":  100,
    "max_features": 'sqrt',
    "random_state": 42,
    "n_jobs": -1,
}
genie3 = GENIE3(tree_method=tree_method, tree_init_kwargs=tree_init_kwargs)

In [6]:
importance_matrix = genie3.compute_importances(inputs, transcription_factor_indices)

Computing importances:   0%|          | 0/1643 [00:00<?, ?gene/s]

In [7]:
gene_ranking = genie3.get_gene_ranking(importance_matrix, transcription_factor_indices)
gene_ranking

Unnamed: 0,regulator_gene,target_gene,importance
0,187,937,0.182317
1,83,589,0.180860
2,94,469,0.169391
3,94,1105,0.167880
4,83,426,0.167562
...,...,...,...
320380,72,72,0.000000
320381,39,39,0.000000
320382,19,19,0.000000
320383,10,10,0.000000


In [8]:
def fn(x, gene_expressions):
    gene_names = gene_expressions.columns
    x = gene_names[x]
    return x

gene_cols = ['regulator_gene', 'target_gene']
gene_ranking[gene_cols] = gene_ranking[gene_cols].apply(lambda x : fn(x, gene_expressions), axis=0)
gene_ranking

Unnamed: 0,regulator_gene,target_gene,importance
0,G188,G938,0.182317
1,G84,G590,0.180860
2,G95,G470,0.169391
3,G95,G1106,0.167880
4,G84,G427,0.167562
...,...,...,...
320380,G73,G73,0.000000
320381,G40,G40,0.000000
320382,G20,G20,0.000000
320383,G11,G11,0.000000


In [None]:
from fedgenie3.genie3.eval import evaluate

evaluate(gene_ranking, ref_network)

{'auroc': 0.8240701674817695,
 'auroc_p': 0.000999000999000999,
 'auprc': 0.2711146041283269,
 'aupr_p': 0.000999000999000999,
 'overall_score': 3.000434077479319}