# Description

Reads the top gene pairs prioritized by each correlation coefficient and, for each pair, predicts a tissue-specific network using the web services provided by GIANT/HumanBase (https://hb.flatironinstitute.org/). The predicted networks are then saved to files for later processing.

# Modules

In [1]:
import pandas as pd
from tqdm import tqdm

from clustermatch import conf
from clustermatch.giant import get_network

# Settings

In [2]:
DATASET_CONFIG = conf.GTEX

In [3]:
N_TOP_GENE_PAIRS = 100

# Paths

In [4]:
INPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
display(INPUT_DIR)

assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections')

In [5]:
OUTPUT_DIR = conf.GIANT["RESULTS_DIR"] / "intersection_genes"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/giant/intersection_genes')

In [6]:
# Maps a tissue key (the value given as `force_tissue` to process_tissue_networks)
# to the tuple that is passed to `get_network` as its `tissue` argument.
# NOTE(review): each tuple looks like (tissue name, HumanBase API endpoint for that
# tissue) -- confirm against clustermatch.giant.get_network.
TISSUE_SPECIFIC_URLS = {
    "blood": ("blood", "http://hb.flatironinstitute.org/api/integrations/blood/"),
}

# Load gene maps

These gene mappings include only query genes (gene pairs).

In [7]:
gene_id_mappings = pd.read_pickle(OUTPUT_DIR / "gene_map-symbol_to_entrezid.pkl")

In [8]:
gene_id_mappings.shape

(2878, 2)

In [9]:
gene_id_mappings.head()

Unnamed: 0,SYMBOL,ENTREZID
1,RBPJ,3516
2,NPC1,4864
3,NR4A1,3164
4,CCL4,6351
5,IGHA1,3493


In [10]:
# Gene symbol -> Entrez ID lookup. The column is selected explicitly instead of
# relying on .squeeze(), which only returns a Series when exactly one column is
# left and the table has more than one row.
gene_symbol_to_entrezid = gene_id_mappings.set_index("SYMBOL")["ENTREZID"].to_dict()

In [11]:
# Entrez ID -> gene symbol lookup. The column is selected explicitly instead of
# relying on .squeeze(), which only returns a Series when exactly one column is
# left and the table has more than one row.
gene_entrezid_to_symbol = gene_id_mappings.set_index("ENTREZID")["SYMBOL"].to_dict()

In [12]:
gene_id_mappings.set_index("SYMBOL").loc["ZDHHC12"]

ENTREZID    84885
Name: ZDHHC12, dtype: object

# Functions

In [13]:
def convert_gene_pairs(gene_pairs, convert_to_entrezid=False):
    """
    Converts gene pair information (as dataframe) into a suitable format for the
    function process_tissue_networks.

    The two genes of each pair are read from the first two levels of the
    dataframe's index. Levels are accessed by position, so the conversion works
    whether or not the index levels have names.

    Args:
        gene_pairs: pandas object whose index has at least two levels; the first
            two levels hold the genes of each pair.
        convert_to_entrezid: if True, each gene is translated using the
            module-level dictionary gene_symbol_to_entrezid; genes missing from
            that dictionary are kept unchanged.

    Returns:
        A list of 2-tuples (one per row, in row order) with the genes of each pair.

    Raises:
        KeyError: if the index has fewer than two levels.
    """
    index = gene_pairs.index

    if index.nlevels < 2:
        raise KeyError(
            "gene_pairs must be indexed by at least two levels (one per gene)"
        )

    first_genes = index.get_level_values(0)
    second_genes = index.get_level_values(1)

    if convert_to_entrezid:
        # fall back to the original value when a gene has no mapping
        first_genes = [gene_symbol_to_entrezid.get(g, g) for g in first_genes]
        second_genes = [gene_symbol_to_entrezid.get(g, g) for g in second_genes]

    return list(zip(first_genes, second_genes))

In [14]:
def process_tissue_networks(gene_pairs, output_directory, force_tissue=None):
    """
    Given a list of tuples with gene pairs, it uses the GIANT web services to predict a
    relevant tissue for each gene pair and its gene network. Then it saves all the genes
    in the networks with their edges' values.

    Gene pairs are visited in order until N_TOP_GENE_PAIRS networks are available
    (already on disk or newly predicted) or the list is exhausted. Each network is
    stored in output_directory as an HDF5 file named
    "{position:03d}-{gene0}_{gene1}[-{force_tissue}].h5" (gene names in lowercase)
    with two tables: "data" (the network) and "metadata" (one row with the tissue
    and mincut returned by get_network). Pairs whose file already exists are not
    queried again, so an interrupted run can be resumed.

    If force_tissue is None, then autodetect the cell type for gene pairs.
    Otherwise, force_tissue should be a string, which will be used as key to query in
    dictionary TISSUE_SPECIFIC_URLS.

    Args:
        gene_pairs: sequence of 2-tuples of gene symbols (strings).
        output_directory: pathlib.Path of the directory where files are written;
            it is created if a network needs to be predicted.
        force_tissue: None, or a key of TISSUE_SPECIFIC_URLS.

    Raises:
        KeyError: if force_tissue is not a key of TISSUE_SPECIFIC_URLS (raised when
            the first network has to be requested).
        AssertionError: if a predicted network contains missing values.
    """
    # the file name suffix only depends on the arguments, so compute it once
    suffix = f"-{force_tissue}" if force_tissue is not None else ""

    with tqdm(total=min(N_TOP_GENE_PAIRS, len(gene_pairs)), ncols=100) as pbar:
        for gp_idx, gp in enumerate(gene_pairs):
            # pbar.n counts the networks available so far; stop when enough
            if pbar.n >= N_TOP_GENE_PAIRS:
                break

            pbar.set_description(",".join(gp))

            # check whether file already exists
            output_filepath = (
                output_directory
                / f"{gp_idx:03d}-{gp[0].lower()}_{gp[1].lower()}{suffix}.h5"
            )
            if output_filepath.exists():
                pbar.update(1)
                continue

            output_directory.mkdir(exist_ok=True, parents=True)

            # predict a network for a gene pair
            _res = get_network(
                gene_symbols=gp,
                gene_ids_mappings=gene_id_mappings,
                tissue=TISSUE_SPECIFIC_URLS[force_tissue]
                if force_tissue is not None
                else None,
            )
            if _res is None:
                # no result for this pair: move on to the next one without
                # counting it towards the N_TOP_GENE_PAIRS networks
                continue

            df, tissue, mincut = _res

            assert (
                not df.isna().any().any()
            ), f"Network for {gp[0]},{gp[1]} contains missing values"

            try:
                with pd.HDFStore(output_filepath, mode="w", complevel=4) as store:
                    store.put("data", df, format="table")

                    metadata = pd.DataFrame(
                        {
                            "tissue": tissue,
                            "mincut": mincut,
                        },
                        index=[0],
                    )
                    store.put("metadata", metadata, format="table")
            except BaseException:
                # A failed or interrupted write leaves a partial file behind, and
                # the existence check above would treat it as a finished network
                # on the next run; remove it before propagating the error.
                if output_filepath.exists():
                    output_filepath.unlink()
                raise

            pbar.update(1)

# Predict tissue for each gene pair

## Custom gene pairs from Figure 3

In [64]:
gene_pairs = [
    ("IFNG", "SDS"),
    ("JUN", "APOC1"),
]

display(len(gene_pairs))

2

### Autodetected cell type

In [65]:
output_dir = OUTPUT_DIR / "custom" / "autopredicted_cell_type"

In [66]:
process_tissue_networks(gene_pairs, output_dir)

JUN,APOC1: 100%|█████████████████████████████████████████████████████| 2/2 [00:00<00:00, 321.34it/s]


### Blood

In [67]:
output_dir = OUTPUT_DIR / "custom" / "blood"

In [70]:
process_tissue_networks(
    gene_pairs
    + [
        ("ZDHHC12", "CCL18"),
        ("RASSF2", "CYTIP"),
        ("MYOZ1", "TNNI2"),
        ("PYGM", "TPM2"),
    ],
    output_dir,
    force_tissue="blood",
)

PYGM,TPM2: 100%|██████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.47it/s]


## Clustermatch vs Pearson

In [20]:
output_dir = OUTPUT_DIR / "clustermatch_vs_pearson"

In [21]:
# read gene pairs
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson.pkl").sort_values(
    "clustermatch", ascending=False
)

In [22]:
data.shape

(20951, 6)

In [23]:
data.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank
TBCC,IFNG,0.468202,0.076129,0.544895,11989835.0,2595765.0,8531829.0
NRADDP,SERPINH1,0.460965,0.111842,0.540108,11949532.0,3594819.0,8473305.0
CCL18,ZDHHC12,0.446659,0.099853,0.560171,11863840.5,3274368.0,8717179.0
NRADDP,BAG3,0.43959,0.096174,0.50206,11817984.0,3173235.0,8003678.0
GLIPR1,IFNG,0.43959,0.096601,0.532489,11817984.0,3184937.0,8380146.0


In [24]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

20951

In [25]:
gene_pairs[:10]

[('TBCC', 'IFNG'),
 ('NRADDP', 'SERPINH1'),
 ('CCL18', 'ZDHHC12'),
 ('NRADDP', 'BAG3'),
 ('GLIPR1', 'IFNG'),
 ('NRADDP', 'DNAJA4'),
 ('TNFSF14', 'SERPINH1'),
 ('TMEM80', 'IFNG'),
 ('LXN', 'IFNG'),
 ('NRADDP', 'HSPB1')]

In [26]:
process_tissue_networks(gene_pairs, output_dir)

ZDHHC12,SPP1: 100%|███████████████████████████████████████████████| 100/100 [09:46<00:00,  5.86s/it]


## Clustermatch vs Pearson/Spearman

In [27]:
output_dir = OUTPUT_DIR / "clustermatch_vs_pearson_spearman"

In [28]:
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson_spearman.pkl").sort_values(
    "clustermatch", ascending=False
)

In [29]:
data.shape

(8, 6)

In [30]:
data.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank
DNAJC5,GTPBP1,0.193464,0.107576,0.156998,8979610.0,3482451.0,3018450.0
RPL32P3,PRDX3P1,0.193464,0.073375,0.186682,8979610.0,2512718.0,3534806.0
BLOC1S3,RP11-4B16.4,0.188812,0.028556,0.192653,8897293.5,1043401.0,3636575.0
AC009950.2,PRDX3P1,0.184279,0.070892,0.197203,8823932.0,2437032.0,3713281.0
CYTH4,GTPBP1,0.184217,0.087254,0.183848,8813629.5,2922303.0,3486726.0


In [31]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

8

In [32]:
gene_pairs[:10]

[('DNAJC5', 'GTPBP1'),
 ('RPL32P3', 'PRDX3P1'),
 ('BLOC1S3', 'RP11-4B16.4'),
 ('AC009950.2', 'PRDX3P1'),
 ('CYTH4', 'GTPBP1'),
 ('KLHL21', 'AC068580.6'),
 ('C17orf53', 'TPX2'),
 ('KIAA0232', 'PRDX3P1')]

In [33]:
process_tissue_networks(gene_pairs, output_dir)

KIAA0232,PRDX3P1:  25%|███████████▊                                   | 2/8 [00:20<01:00, 10.04s/it]


## Clustermatch vs Spearman

In [34]:
output_dir = OUTPUT_DIR / "clustermatch_vs_spearman"

In [35]:
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_spearman.pkl").sort_values(
    "clustermatch", ascending=False
)

In [36]:
data.shape

(28, 6)

In [37]:
data.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank
KDM6A,UTY,0.294391,0.23987,0.100621,10436674.0,6265429.0,1987841.0
CYTIP,KIAA0040,0.205803,0.158792,0.110038,9212181.0,4717121.0,2164247.0
CYTIP,RASSF2,0.201962,0.15606,0.107882,9132185.0,4656609.0,2124338.0
RPS4X,PRKY,0.19859,0.361447,0.197205,9071073.0,7945595.0,3713308.0
KDM6A,DDX3Y,0.193557,0.237462,0.03535,8989374.0,6225427.0,716931.0


In [38]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

28

In [39]:
gene_pairs[:10]

[('KDM6A', 'UTY'),
 ('CYTIP', 'KIAA0040'),
 ('CYTIP', 'RASSF2'),
 ('RPS4X', 'PRKY'),
 ('KDM6A', 'DDX3Y'),
 ('RIPK1', 'GTPBP1'),
 ('CNN2', 'GTPBP1'),
 ('CCSER2', 'CYB561D1'),
 ('TECPR2', 'GTPBP1'),
 ('IRGQ', 'CYB561D1')]

In [40]:
process_tissue_networks(gene_pairs, output_dir)

CCSER2,WBP1L:  86%|██████████████████████████████████████████       | 24/28 [02:32<00:25,  6.36s/it]


## Pearson vs Clustermatch

In [41]:
output_dir = OUTPUT_DIR / "pearson_vs_clustermatch"

In [42]:
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch.pkl").sort_values(
    "pearson", ascending=False
)

In [43]:
data.shape

(1075, 6)

In [44]:
data.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank
TNNI2,MYOZ1,0.034593,0.967834,0.284206,3714425.5,12497377.0,5069769.0
MYBPC2,TNNI2,0.034593,0.965012,0.314172,3714425.5,12497343.0,5504793.0
MYOZ1,PYGM,0.032751,0.964681,0.248365,3556384.0,12497334.0,4532919.0
MYBPC2,PYGM,0.032234,0.957893,0.234647,3499276.0,12497205.0,4320035.0
MYLPF,PYGM,0.027118,0.955315,0.248569,2995235.0,12497111.0,4536023.0


In [45]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

1075

In [46]:
gene_pairs[:10]

[('TNNI2', 'MYOZ1'),
 ('MYBPC2', 'TNNI2'),
 ('MYOZ1', 'PYGM'),
 ('MYBPC2', 'PYGM'),
 ('MYLPF', 'PYGM'),
 ('TNNI2', 'CKM'),
 ('TNNI2', 'ACTA1'),
 ('COX7A1', 'TNNI2'),
 ('MYH2', 'PYGM'),
 ('PYGM', 'MYL2')]

In [47]:
process_tissue_networks(gene_pairs, output_dir)

TMEM38A,ENO3: 100%|███████████████████████████████████████████████| 100/100 [11:02<00:00,  6.62s/it]


## Pearson vs Clustermatch/Spearman

In [48]:
output_dir = OUTPUT_DIR / "pearson_vs_clustermatch_spearman"

In [49]:
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch_spearman.pkl").sort_values(
    "pearson", ascending=False
)

In [50]:
data.shape

(531, 6)

In [51]:
data.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank
CCL20,SCGB3A1,0.031833,0.988292,0.187233,3471638.5,12497493.0,3544359.0
CXCL1,SCGB3A1,0.007038,0.975534,0.078267,457775.5,12497441.0,1561424.0
TNNI2,TPM2,0.026588,0.948215,0.193457,2930710.0,12496755.0,3650290.0
TPM2,PYGM,0.034912,0.94443,0.029852,3745874.0,12496416.0,605603.0
KRT19,CXCL1,0.008663,0.939751,0.097291,664617.0,12495782.0,1924833.0


In [52]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

531

In [53]:
gene_pairs[:10]

[('CCL20', 'SCGB3A1'),
 ('CXCL1', 'SCGB3A1'),
 ('TNNI2', 'TPM2'),
 ('TPM2', 'PYGM'),
 ('KRT19', 'CXCL1'),
 ('ENO3', 'PYGM'),
 ('TACSTD2', 'SCGB3A1'),
 ('CCL20', 'KRT19'),
 ('S100A16', 'CXCL1'),
 ('CCL20', 'TPPP3')]

In [54]:
process_tissue_networks(gene_pairs, output_dir)

CAPS,ID1: 100%|███████████████████████████████████████████████████| 100/100 [08:46<00:00,  5.27s/it]
