# Description

For some gene pairs of interest, this notebook reads the probabilities of interactions in predicted networks from GIANT.
Then it writes network statistics into a table in a Markdown file of the manuscript.
Two networks per gene pair are read/written: blood and an autodetected cell type (from GIANT).

From GIANT, we use all interaction data types, the suggested minimum cut and the top 15 genes for each network (these are all default values in the GIANT web app).

# Modules

In [1]:
import re
from functools import partial

import pandas as pd

from ccc import conf

# Settings

In [2]:
# Leading text of a gene's row in the Markdown table; once formatted with the
# gene symbol, it is used to locate the existing row that has to be replaced.
GENE_FILE_MARK_TEMPLATE = "| *{gene}* |"

In [3]:
# Row templates for the first and second gene of a pair. Each row has the
# min/avg/max values for the blood network followed by the min/avg/max values
# for the predicted network. Only the first gene's row has the cell type column.
# NOTE(review): the HTML comments ($rowspan / $removenext) are presumably
# table-layout directives for the manuscript build tool; confirm before editing.
GENE0_STATS_TEMPLATE = '| *{gene}* | {blood_min} | {blood_avg} | {blood_max} | {cell_type}<!-- $rowspan="2" --> | {pred_min} | {pred_avg} | {pred_max} |'
GENE1_STATS_TEMPLATE = '| *{gene}* | {blood_min} | {blood_avg} | {blood_max} | {pred_min} | {pred_avg} | {pred_max}<!-- $removenext="2" --> |'

# Paths

In [4]:
# Stop early if the manuscript directory is not configured, since this notebook
# updates a file that belongs to the manuscript.
assert (
    conf.MANUSCRIPT["BASE_DIR"] is not None
), "The manuscript directory was not configured"

In [5]:
# Markdown file of the manuscript whose table is updated in place; it must
# already exist.
OUTPUT_FILE_PATH = conf.MANUSCRIPT["CONTENT_DIR"] / "20.00.supplementary_material.md"
display(OUTPUT_FILE_PATH)
assert OUTPUT_FILE_PATH.exists()

PosixPath('/opt/manuscript/content/20.00.supplementary_material.md')

In [6]:
# Directory that is searched (recursively, see read_data) for the GIANT network
# files of each gene pair; it must already exist.
INPUT_DIR = conf.GIANT["RESULTS_DIR"] / "intersection_genes"
display(INPUT_DIR)

assert INPUT_DIR.exists()

PosixPath('/opt/data/results/giant/intersection_genes')

# Functions

In [7]:
def read_data(gene0, gene1, tissue_name=None, return_predicted_tissue=False):
    """
    Loads the GIANT network data of a gene pair from an HDF5 file in INPUT_DIR.

    The file is searched recursively with the pattern
    "???-<gene0>_<gene1>[-<tissue_name>].h5" (gene symbols in lowercase). If no
    file is found, the search is repeated with the genes swapped. Exactly one
    file has to match, and both genes have to be present in the "gene1" or
    "gene2" columns of the data; otherwise an AssertionError is raised.

    If tissue_name is None, no tissue suffix is added to the file name, which
    means the autodetected tissue/cell type.

    It returns the dataframe stored under the key "data". If
    return_predicted_tissue is True, it returns a tuple with that dataframe and
    the "tissue" value in the first row of the table stored under "metadata".
    """
    suffix = "" if tissue_name is None else f"-{tissue_name}"
    lower0, lower1 = gene0.lower(), gene1.lower()

    # first try the genes in the given order, then swapped
    matches = []
    for first, second in ((lower0, lower1), (lower1, lower0)):
        matches = list(INPUT_DIR.rglob(f"???-{first}_{second}{suffix}.h5"))
        if len(matches) > 0:
            break

    assert len(matches) == 1, len(matches)
    network_file = matches[0]
    assert network_file.exists()

    network = pd.read_hdf(network_file, key="data")

    def _is_in_network(gene):
        # a gene can be at either end of an interaction
        return (gene in network["gene1"].unique()) or (
            gene in network["gene2"].unique()
        )

    assert _is_in_network(gene0) and _is_in_network(gene1)

    if not return_predicted_tissue:
        return network

    metadata = pd.read_hdf(network_file, key="metadata")
    return network, metadata.iloc[0]["tissue"]

In [8]:
# testing
# blood network of the IFNG/SDS pair
_tmp0 = read_data("IFNG", "SDS", "blood")
assert _tmp0.shape[0] == 127
display(_tmp0.shape)

# no tissue name given: network of the autodetected tissue/cell type
_tmp1 = read_data("IFNG", "SDS")
assert _tmp1.shape[0] == 124
display(_tmp1.shape)

# the predicted tissue is the second element of the returned tuple
_tmp1_tissue = read_data("IFNG", "SDS", return_predicted_tissue=True)[1]
assert _tmp1_tissue == "natural-killer-cell"

# the order in which the genes are given should not matter
_tmp10 = read_data("PRSS36", "CCL18")
assert _tmp10.shape[0] > 1
_tmp11 = read_data("CCL18", "PRSS36")
assert _tmp11.shape == _tmp10.shape

(127, 3)

(124, 3)

In [9]:
def format_number(number):
    """
    Returns the number as a string in fixed-point notation with two decimals.
    """
    return format(number, ".2f")

In [10]:
# testing
# numbers are rounded to two decimals
assert format_number(0.222222) == "0.22"
assert format_number(0.225222) == "0.23"

In [11]:
def get_gene_stats(df, gene_name):
    """
    Returns descriptive stats (the output of pandas' describe, squeezed) for
    the rows of df where the gene appears in either the "gene1" or the "gene2"
    column.
    """
    involves_gene = df["gene1"].eq(gene_name) | df["gene2"].eq(gene_name)
    summary = df.loc[involves_gene].describe()
    return summary.squeeze()

In [12]:
# testing
# stats of IFNG in the blood network of the IFNG/SDS pair (_tmp0)
_tmp0_stats = get_gene_stats(_tmp0, "IFNG")
assert _tmp0_stats["min"].round(2) == 0.19
assert _tmp0_stats["mean"].round(2) == 0.42
assert _tmp0_stats["max"].round(2) == 0.54

In [13]:
def get_gene_content(blood_stats, pred_stats, gene_name, gene_template, cell_type=None):
    """
    Returns gene_template filled in with the gene name and the formatted
    min/mean/max values taken from blood_stats and pred_stats.

    cell_type is only used if it is not None and the template has a
    "{cell_type}" field. If the template has that field and cell_type is None,
    formatting the template raises a KeyError.
    """
    fields = {"gene": gene_name}
    for prefix, stats in (("blood", blood_stats), ("pred", pred_stats)):
        fields[f"{prefix}_min"] = format_number(stats["min"])
        fields[f"{prefix}_avg"] = format_number(stats["mean"])
        fields[f"{prefix}_max"] = format_number(stats["max"])

    if cell_type is not None and "{cell_type}" in gene_template:
        fields["cell_type"] = cell_type

    return gene_template.format(**fields)

In [14]:
# testing
# first gene's template: it has a cell type field, so "blood" must be in the row
_tmp_gene_cont = get_gene_content(
    _tmp0_stats, _tmp0_stats, "IFNG", GENE0_STATS_TEMPLATE, "blood"
)
assert "IFNG" in _tmp_gene_cont
assert "0.19" in _tmp_gene_cont
assert "0.42" in _tmp_gene_cont
assert "0.54" in _tmp_gene_cont
assert "blood" in _tmp_gene_cont

In [15]:
# testing
# second gene's template: no cell type is needed
_tmp_gene_cont = get_gene_content(
    _tmp0_stats, _tmp0_stats, "IFNG", GENE1_STATS_TEMPLATE
)
assert "IFNG" in _tmp_gene_cont
assert "0.19" in _tmp_gene_cont
assert "0.42" in _tmp_gene_cont
assert "0.54" in _tmp_gene_cont

In [16]:
# testing
# second gene's template: a given cell type is ignored because the template
# has no cell type field
_tmp_gene_cont = get_gene_content(
    _tmp0_stats, _tmp0_stats, "IFNG", GENE1_STATS_TEMPLATE, "blood"
)
assert "IFNG" in _tmp_gene_cont
assert "0.19" in _tmp_gene_cont
assert "0.42" in _tmp_gene_cont
assert "0.54" in _tmp_gene_cont
assert "blood" not in _tmp_gene_cont

In [17]:
def write_content(gene0_text, gene1_text, text_replacement):
    """
    It replaces the two table rows of a gene pair in the output file
    (OUTPUT_FILE_PATH), which is read and written as UTF-8.

    The rows are located by matching gene0_text (literally) followed by the
    rest of its line, and then, at the start of the next line, gene1_text
    (literally) followed by the rest of that line. Both lines, including their
    newline characters, are replaced by text_replacement, which is inserted
    verbatim. All occurrences in the file are replaced.

    If the rows are not found, a warning is issued and the file is left
    untouched.
    """
    import warnings

    with open(OUTPUT_FILE_PATH, "r", encoding="utf8") as f:
        file_content = f.read()

    # re.DOTALL is not used on purpose: ".+" has to stop at the end of each
    # row's line.
    rows_pattern = re.escape(gene0_text) + ".+\n" + re.escape(gene1_text) + ".+\n"

    # A callable is used as replacement so the text is inserted as is; a plain
    # string would be processed as a template (backslash escapes and group
    # references).
    new_file_content, n_replaced = re.subn(
        rows_pattern,
        lambda _match: text_replacement,
        file_content,
    )

    if n_replaced == 0:
        # nothing to write; let the user know the table was not updated
        warnings.warn(
            f"Table rows not found in {OUTPUT_FILE_PATH} "
            f"(searched for {gene0_text!r} followed by {gene1_text!r}); "
            "the file was not modified",
            stacklevel=2,
        )
        return

    with open(OUTPUT_FILE_PATH, "w", encoding="utf8") as f:
        f.write(new_file_content)

In [18]:
def format_tissue_name(tissue_name):
    """
    Returns the tissue name ready to be shown in the table: hyphens are
    replaced by spaces and the first character is converted to uppercase (the
    rest of the name is left unchanged). An empty name is returned as an empty
    string.
    """
    s = tissue_name.replace("-", " ")
    # slicing (instead of indexing) so an empty name does not raise an error
    return s[:1].upper() + s[1:]

In [19]:
# testing
# hyphens become spaces and only the first letter is capitalized
assert format_tissue_name("blood") == "Blood"
assert format_tissue_name("natural-killer-cell") == "Natural killer cell"

In [20]:
def process_genes(gene0, gene1):
    """
    Given a gene pair, it updates their two rows in the table of the Markdown
    file with stats computed from their GIANT network data.

    For each gene, stats are computed (and displayed) on the blood network and
    on the network of the predicted tissue/cell type. The first gene's row uses
    GENE0_STATS_TEMPLATE and the second gene's row uses GENE1_STATS_TEMPLATE.
    """
    blood_network = read_data(gene0, gene1, "blood")
    pred_network, pred_tissue = read_data(gene0, gene1, return_predicted_tissue=True)

    new_rows = []
    old_row_marks = []

    gene_templates = ((gene0, GENE0_STATS_TEMPLATE), (gene1, GENE1_STATS_TEMPLATE))
    for gene, row_template in gene_templates:
        blood_summary = get_gene_stats(blood_network, gene).rename(f"{gene} - blood")
        display(blood_summary)

        pred_summary = get_gene_stats(pred_network, gene).rename(f"{gene} - pred")
        display(pred_summary)

        row = get_gene_content(
            blood_summary,
            pred_summary,
            gene,
            row_template,
            format_tissue_name(pred_tissue),
        )
        new_rows.append(row + "\n")

        # text that identifies the row currently in the file for this gene
        old_row_marks.append(GENE_FILE_MARK_TEMPLATE.format(gene=gene))

    write_content(old_row_marks[0], old_row_marks[1], "".join(new_rows))

# Run

Here I update the table for some gene pairs of interest in the manuscript.

## IFNG - SDS

In [21]:
process_genes("IFNG", "SDS")

count    15.000000
mean      0.416329
std       0.084663
min       0.186702
25%       0.384790
50%       0.416825
75%       0.466691
max       0.539424
Name: IFNG - blood, dtype: float64

count    15.000000
mean      0.895806
std       0.087539
min       0.742973
25%       0.831290
50%       0.920374
75%       0.972659
max       0.990921
Name: IFNG - pred, dtype: float64

count    7.000000
mean     0.287983
std      0.073519
min      0.180309
25%      0.253745
50%      0.274899
75%      0.319880
max      0.413422
Name: SDS - blood, dtype: float64

count    13.000000
mean      0.805204
std       0.099726
min       0.645825
25%       0.762556
50%       0.812201
75%       0.867130
max       0.944408
Name: SDS - pred, dtype: float64

## PRSS36 - CCL18

In [22]:
process_genes("PRSS36", "CCL18")

count    11.000000
mean      0.098798
std       0.023163
min       0.071462
25%       0.083544
50%       0.090087
75%       0.110335
max       0.141485
Name: PRSS36 - blood, dtype: float64

count    11.000000
mean      0.053890
std       0.013835
min       0.035443
25%       0.041956
50%       0.058732
75%       0.060471
max       0.082049
Name: PRSS36 - pred, dtype: float64

count    16.000000
mean      0.735091
std       0.182433
min       0.071462
25%       0.748864
50%       0.784038
75%       0.804505
max       0.855855
Name: CCL18 - blood, dtype: float64

count    16.000000
mean      0.689526
std       0.183743
min       0.053119
25%       0.667870
50%       0.726080
75%       0.781688
max       0.901092
Name: CCL18 - pred, dtype: float64

## UTY - KDM6A

In [23]:
process_genes("UTY", "KDM6A")

count    12.000000
mean      0.364744
std       0.349849
min       0.028706
25%       0.033048
50%       0.279625
75%       0.689936
max       0.841813
Name: UTY - blood, dtype: float64

count    8.000000
mean     0.026860
std      0.009897
min      0.013992
25%      0.018840
50%      0.026702
75%      0.032803
max      0.043308
Name: UTY - pred, dtype: float64

count    11.000000
mean      0.424317
std       0.197021
min       0.032555
25%       0.450170
50%       0.490905
75%       0.533079
max       0.577282
Name: KDM6A - blood, dtype: float64

count    16.000000
mean      0.382091
std       0.117187
min       0.035619
25%       0.361125
50%       0.375271
75%       0.400988
max       0.605991
Name: KDM6A - pred, dtype: float64

## DDX3Y - KDM6A

In [24]:
process_genes("DDX3Y", "KDM6A")

count    12.000000
mean      0.329387
std       0.320418
min       0.052588
25%       0.063631
50%       0.099080
75%       0.639423
max       0.779259
Name: DDX3Y - blood, dtype: float64

count    7.000000
mean     0.114175
std      0.037560
min      0.066251
25%      0.091901
50%      0.109789
75%      0.129124
max      0.181132
Name: DDX3Y - pred, dtype: float64

count    10.000000
mean      0.505220
std       0.047750
min       0.429831
25%       0.475295
50%       0.505804
75%       0.533458
max       0.577282
Name: KDM6A - blood, dtype: float64

count    15.000000
mean      0.336037
std       0.054447
min       0.267148
25%       0.291450
50%       0.345921
75%       0.360573
max       0.477126
Name: KDM6A - pred, dtype: float64

## RASSF2 - CYTIP

In [25]:
process_genes("RASSF2", "CYTIP")

count    13.000000
mean      0.773071
std       0.060301
min       0.689823
25%       0.718106
50%       0.772075
75%       0.798691
max       0.901809
Name: RASSF2 - blood, dtype: float64

count    14.000000
mean      0.744624
std       0.057027
min       0.662964
25%       0.705589
50%       0.741745
75%       0.773550
max       0.882205
Name: RASSF2 - pred, dtype: float64

count    15.000000
mean      0.853255
std       0.050279
min       0.735844
25%       0.819891
50%       0.869117
75%       0.891643
max       0.912679
Name: CYTIP - blood, dtype: float64

count    15.000000
mean      0.843293
std       0.044863
min       0.759377
25%       0.819578
50%       0.843218
75%       0.869837
max       0.914080
Name: CYTIP - pred, dtype: float64

## MYOZ1 - TNNI2

In [26]:
process_genes("MYOZ1", "TNNI2")

count    10.000000
mean      0.171077
std       0.103562
min       0.091422
25%       0.102670
50%       0.113056
75%       0.240818
max       0.367188
Name: MYOZ1 - blood, dtype: float64

count    10.000000
mean      0.108229
std       0.003438
min       0.105677
25%       0.106864
50%       0.106883
75%       0.106883
max       0.115654
Name: MYOZ1 - pred, dtype: float64

count    14.000000
mean      0.219337
std       0.083911
min       0.102468
25%       0.179536
50%       0.221226
75%       0.230911
max       0.436661
Name: TNNI2 - blood, dtype: float64

count    15.000000
mean      0.114366
std       0.003559
min       0.104635
25%       0.114070
50%       0.114303
75%       0.114303
max       0.122063
Name: TNNI2 - pred, dtype: float64

## SCGB3A1 - C19orf33

In [27]:
process_genes("SCGB3A1", "C19orf33")

count    10.000000
mean      0.193822
std       0.025327
min       0.155134
25%       0.176369
50%       0.193143
75%       0.217271
max       0.228326
Name: SCGB3A1 - blood, dtype: float64

count    7.000000
mean     0.110496
std      0.004439
min      0.105638
25%      0.109367
50%      0.109585
75%      0.109763
max      0.119991
Name: SCGB3A1 - pred, dtype: float64

count    8.000000
mean     0.192866
std      0.039739
min      0.148565
25%      0.169996
50%      0.189305
75%      0.200146
max      0.279338
Name: C19orf33 - blood, dtype: float64

count    14.000000
mean      0.119744
std       0.019328
min       0.105638
25%       0.108910
50%       0.115052
75%       0.118435
max       0.174446
Name: C19orf33 - pred, dtype: float64