# Description

This notebook reads the top gene pairs prioritized by each correlation coefficient and, for each pair, predicts a tissue-specific gene network using the web services provided by GIANT/HumanBase (https://hb.flatironinstitute.org/). The predicted networks are then saved to files for later processing.

# Modules

In [1]:
import pandas as pd
from tqdm import tqdm

from ccc import conf
from ccc.giant import get_network

# Settings

In [2]:
DATASET_CONFIG = conf.GTEX

In [3]:
N_TOP_GENE_PAIRS = 100

# Paths

In [4]:
INPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
display(INPUT_DIR)

assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections')

In [5]:
OUTPUT_DIR = conf.GIANT["RESULTS_DIR"] / "intersection_genes"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/giant/intersection_genes')

In [6]:
# Maps a tissue key (the value accepted by the `force_tissue` argument of
# process_tissue_networks) to the tuple that is forwarded as the `tissue`
# argument of get_network. Each tuple presumably holds (tissue name, API URL).
# NOTE(review): this URL uses http while the description at the top of the
# notebook links to the https site -- confirm which scheme is intended.
TISSUE_SPECIFIC_URLS = {
    "blood": ("blood", "http://hb.flatironinstitute.org/api/integrations/blood/"),
}

# Load gene maps

These gene mappings include only the query genes, i.e., the genes that are part of the gene pairs.

In [7]:
gene_id_mappings = pd.read_pickle(OUTPUT_DIR / "gene_map-symbol_to_entrezid.pkl")

In [8]:
gene_id_mappings.shape

(2878, 2)

In [9]:
gene_id_mappings.head()

Unnamed: 0,SYMBOL,ENTREZID
2,C6orf89,221477
3,TRMT1L,81627
5,SMCHD1,23347
6,LIPF,8513
7,FLNC,2318


In [10]:
# Build a plain dict for symbol -> Entrez ID lookups: indexing by SYMBOL leaves
# a single ENTREZID column, which squeeze() turns into a Series before to_dict().
# This dict is read by convert_gene_pairs when convert_to_entrezid=True.
gene_symbol_to_entrezid = gene_id_mappings.set_index("SYMBOL").squeeze().to_dict()

In [11]:
# Reverse lookup (Entrez ID -> symbol), built the same way as the
# symbol -> Entrez ID dict but indexed by ENTREZID instead.
# NOTE(review): not referenced in the visible part of this notebook.
gene_entrezid_to_symbol = gene_id_mappings.set_index("ENTREZID").squeeze().to_dict()

In [12]:
gene_id_mappings.set_index("SYMBOL").loc["ZDHHC12"]

ENTREZID    84885
Name: ZDHHC12, dtype: object

# Functions

In [13]:
def convert_gene_pairs(gene_pairs, convert_to_entrezid=False):
    """
    Turns a dataframe indexed by gene pairs into a list of (gene0, gene1) tuples,
    which is the input format expected by the function process_tissue_networks.

    The dataframe index is moved into columns with reset_index(), and the two
    resulting columns "level_0" and "level_1" (the names pandas assigns to the
    levels of an unnamed two-level index) are read as the genes of each pair.

    If convert_to_entrezid is True, the values in those two columns are replaced
    using the module-level dictionary gene_symbol_to_entrezid; values without an
    entry in that dictionary are left unchanged.
    """
    pair_columns = ["level_0", "level_1"]

    flat = gene_pairs.reset_index()

    if convert_to_entrezid:
        # same symbol -> Entrez ID mapping applied to both members of the pair
        flat = flat.replace({column: gene_symbol_to_entrezid for column in pair_columns})

    first_genes, second_genes = (flat[column] for column in pair_columns)

    return list(zip(first_genes, second_genes))

In [14]:
def process_tissue_networks(
    gene_pairs, output_directory, force_tissue=None, n_top_gene_pairs=None
):
    """
    Given a list of tuples with gene pairs, it uses the GIANT web services (through
    get_network) to predict a relevant tissue for each gene pair and its gene
    network. Then it saves all the genes in the networks with their edges' values.

    Gene pairs are processed in order until n_top_gene_pairs networks are available
    on disk (either found from a previous run or saved now) or the list is
    exhausted. Gene pairs for which get_network returns None are skipped and do
    not count towards that limit.

    Args:
        gene_pairs: list of tuples with two gene symbols (strings) each.
        output_directory: pathlib.Path of the directory where one HDF5 file per
            gene pair is written; it is created if it does not exist. Each file
            has a "data" key (the network) and a "metadata" key (tissue and
            mincut as returned by get_network).
        force_tissue: if None, the cell type is autodetected for each gene pair.
            Otherwise, it should be a string, which will be used as key to query
            in dictionary TISSUE_SPECIFIC_URLS (a KeyError is raised if missing).
        n_top_gene_pairs: maximum number of gene pairs with a saved network. If
            None (default), the module-level N_TOP_GENE_PAIRS is used.

    Returns:
        None. Results are written to output_directory.
    """
    if n_top_gene_pairs is None:
        n_top_gene_pairs = N_TOP_GENE_PAIRS

    # these do not depend on the gene pair, so compute them once
    suffix = "" if force_tissue is None else f"-{force_tissue}"
    tissue_query = (
        None if force_tissue is None else TISSUE_SPECIFIC_URLS[force_tissue]
    )

    # explicit counter of gene pairs with a network on disk; the loop does not
    # rely on the progress bar's internal state for control flow
    n_done = 0

    with tqdm(total=min(n_top_gene_pairs, len(gene_pairs)), ncols=100) as pbar:
        for gp_idx, gp in enumerate(gene_pairs):
            if n_done >= n_top_gene_pairs:
                break

            pbar.set_description(",".join(gp))

            # check whether file already exists
            output_filepath = (
                output_directory
                / f"{gp_idx:03d}-{gp[0].lower()}_{gp[1].lower()}{suffix}.h5"
            )
            if output_filepath.exists():
                output_filepath.touch()

                n_done += 1
                pbar.update(1)
                continue

            output_directory.mkdir(exist_ok=True, parents=True)

            # predict a network for a gene pair
            _res = get_network(
                gene_symbols=gp,
                gene_ids_mappings=gene_id_mappings,
                tissue=tissue_query,
            )
            if _res is None:
                # no network for this gene pair: skip it without counting it
                continue

            df, tissue, mincut = _res

            assert not df.isna().any().any(), "network data contains missing values"

            metadata = pd.DataFrame(
                {
                    "tissue": tissue,
                    "mincut": mincut,
                },
                index=[0],
            )

            # Write to a temporary file and rename it once complete. Writing
            # directly to output_filepath would leave a partial file behind if
            # the write fails or is interrupted, and the existence check above
            # would then treat that file as a finished result in later runs.
            tmp_filepath = output_filepath.with_name(output_filepath.name + ".tmp")
            try:
                with pd.HDFStore(tmp_filepath, mode="w", complevel=4) as store:
                    store.put("data", df, format="table")
                    store.put("metadata", metadata, format="table")

                tmp_filepath.replace(output_filepath)
            except BaseException:
                # also covers KeyboardInterrupt (e.g. interrupting the notebook
                # cell); clean up and let the original error propagate
                if tmp_filepath.exists():
                    tmp_filepath.unlink()
                raise

            n_done += 1
            pbar.update(1)

# Predict tissue for each gene pair

## Custom gene pairs from Figure 3

In [15]:
# Hand-picked gene pairs (see the section title: taken from Figure 3).
gene_pairs = [
    ("IFNG", "SDS"),
    ("PRSS36", "CCL18"),
    ("UTY", "KDM6A"),
    # Left out of this base list on purpose: the pair below is appended only
    # in the blood-specific call to process_tissue_networks further down.
    # ("DDX3Y", "KDM6A"),
    ("RASSF2", "CYTIP"),
    ("MYOZ1", "TNNI2"),
    ("SCGB3A1", "C19orf33"),
]

display(len(gene_pairs))

6

### Autodetected cell type

In [16]:
output_dir = OUTPUT_DIR / "custom" / "autopredicted_cell_type"

In [17]:
process_tissue_networks(gene_pairs, output_dir)

SCGB3A1,C19orf33: 100%|███████████████████████████████████████████████| 6/6 [00:08<00:00,  1.46s/it]


### Blood

In [18]:
output_dir = OUTPUT_DIR / "custom" / "blood"

In [19]:
process_tissue_networks(
    gene_pairs
    + [
        ("DDX3Y", "KDM6A"),
    ],
    output_dir,
    force_tissue="blood",
)

DDX3Y,KDM6A: 100%|████████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.96it/s]


## CCC vs Pearson

In [20]:
output_dir = OUTPUT_DIR / "clustermatch_vs_pearson"

In [21]:
# read gene pairs
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson.pkl").sort_values(
    "ccc", ascending=False
)

In [22]:
data.shape

(20951, 11)

In [23]:
data.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank,ccc_fdr,pearson_fdr,spearman_fdr,understudied_score,min_n_pubs
TBCC,IFNG,0.468202,0.076129,0.544895,11989835.0,2595765.0,8531829.0,1e-06,0.047159,2.91092e-59,-inf,1619.0
NRADDP,SERPINH1,0.460965,0.111842,0.540108,11949532.0,3594819.0,8473305.0,1e-06,0.003633,4.605633e-58,0.07289,110.0
CCL18,ZDHHC12,0.446659,0.099853,0.560171,11863840.5,3274368.0,8717179.0,1e-06,0.009325,3.237273e-63,0.187653,11.0
NRADDP,BAG3,0.43959,0.096174,0.50206,11817984.0,3173235.0,8003678.0,1e-06,0.012227,3.571766e-49,-0.576042,153.0
GLIPR1,IFNG,0.43959,0.096601,0.532489,11817984.0,3184937.0,8380146.0,1e-06,0.011862,3.421468e-56,0.002336,32.0


In [24]:
# make sure gene pairs are statistically significant
data = data[data["ccc_fdr"] < 0.05]

In [25]:
data.shape

(1944, 11)

From this point on, only the index of the `data` dataframe (the gene pairs) is used.
The columns hold the correlation values, their rankings, and other per-pair statistics.

In [26]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

1944

In [27]:
gene_pairs[:10]

[('TBCC', 'IFNG'),
 ('NRADDP', 'SERPINH1'),
 ('CCL18', 'ZDHHC12'),
 ('NRADDP', 'BAG3'),
 ('GLIPR1', 'IFNG'),
 ('NRADDP', 'DNAJA4'),
 ('TNFSF14', 'SERPINH1'),
 ('TMEM80', 'IFNG'),
 ('LXN', 'IFNG'),
 ('NRADDP', 'HSPB1')]

In [28]:
process_tissue_networks(gene_pairs, output_dir)

ZDHHC12,SPP1: 100%|███████████████████████████████████████████████| 100/100 [02:22<00:00,  1.42s/it]


## CCC vs Pearson/Spearman

In [29]:
output_dir = OUTPUT_DIR / "clustermatch_vs_pearson_spearman"

In [30]:
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson_spearman.pkl").sort_values(
    "ccc", ascending=False
)

In [31]:
data.shape

(8, 11)

In [32]:
data.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank,ccc_fdr,pearson_fdr,spearman_fdr,understudied_score,min_n_pubs
DNAJC5,GTPBP1,0.193464,0.107576,0.156998,8979610.0,3482451.0,3018450.0,1e-06,0.005132,1.810946e-05,0.332355,27.0
RPL32P3,PRDX3P1,0.193464,0.073375,0.186682,8979610.0,2512718.0,3534806.0,1e-06,0.055651,3.071763e-07,-inf,inf
BLOC1S3,RP11-4B16.4,0.188812,0.028556,0.192653,8897293.5,1043401.0,3636575.0,1e-06,0.463019,1.243621e-07,-0.033741,14.0
AC009950.2,PRDX3P1,0.184279,0.070892,0.197203,8823932.0,2437032.0,3713281.0,1e-06,0.064512,6.12107e-08,-inf,inf
CYTH4,GTPBP1,0.184217,0.087254,0.183848,8813629.5,2922303.0,3486726.0,1e-06,0.022883,4.6695e-07,0.593615,16.0


In [33]:
# make sure gene pairs are statistically significant
data = data[data["ccc_fdr"] < 0.05]

In [34]:
data.shape

(8, 11)

In [35]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

8

In [36]:
gene_pairs[:10]

[('DNAJC5', 'GTPBP1'),
 ('RPL32P3', 'PRDX3P1'),
 ('BLOC1S3', 'RP11-4B16.4'),
 ('AC009950.2', 'PRDX3P1'),
 ('CYTH4', 'GTPBP1'),
 ('KLHL21', 'AC068580.6'),
 ('C17orf53', 'TPX2'),
 ('KIAA0232', 'PRDX3P1')]

In [37]:
process_tissue_networks(gene_pairs, output_dir)

KIAA0232,PRDX3P1:  25%|███████████▊                                   | 2/8 [00:03<00:10,  1.81s/it]


## CCC vs Spearman

In [38]:
output_dir = OUTPUT_DIR / "clustermatch_vs_spearman"

In [39]:
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_spearman.pkl").sort_values(
    "ccc", ascending=False
)

In [40]:
data.shape

(28, 11)

In [41]:
data.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank,ccc_fdr,pearson_fdr,spearman_fdr,understudied_score,min_n_pubs
KDM6A,UTY,0.294391,0.23987,0.100621,10436674.0,6265429.0,1987841.0,1e-06,6.547223e-11,0.006483454,-0.326795,21
CYTIP,KIAA0040,0.205803,0.158792,0.110038,9212181.0,4717121.0,2164247.0,1e-06,3.044918e-05,0.002858802,0.330448,20
CYTIP,RASSF2,0.201962,0.15606,0.107882,9132185.0,4656609.0,2124338.0,1e-06,4.304138e-05,0.00346978,0.330448,20
RPS4X,PRKY,0.19859,0.361447,0.197205,9071073.0,7945595.0,3713308.0,1e-06,2.865687e-24,6.120758e-08,0.033703,86
KDM6A,DDX3Y,0.193557,0.237462,0.03535,8989374.0,6225427.0,716931.0,1e-06,1.045469e-10,0.3467519,-0.326795,40


In [42]:
# make sure gene pairs are statistically significant
data = data[data["ccc_fdr"] < 0.05]

In [43]:
data.shape

(28, 11)

In [44]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

28

In [45]:
gene_pairs[:10]

[('KDM6A', 'UTY'),
 ('CYTIP', 'KIAA0040'),
 ('CYTIP', 'RASSF2'),
 ('RPS4X', 'PRKY'),
 ('KDM6A', 'DDX3Y'),
 ('RIPK1', 'GTPBP1'),
 ('CNN2', 'GTPBP1'),
 ('CCSER2', 'CYB561D1'),
 ('TECPR2', 'GTPBP1'),
 ('IRGQ', 'CYB561D1')]

In [46]:
process_tissue_networks(gene_pairs, output_dir)

CCSER2,WBP1L:  86%|██████████████████████████████████████████       | 24/28 [00:32<00:05,  1.37s/it]


## Pearson vs CCC

In [47]:
output_dir = OUTPUT_DIR / "pearson_vs_clustermatch"

In [48]:
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch.pkl").sort_values(
    "pearson", ascending=False
)

In [49]:
data.shape

(1075, 11)

In [50]:
data.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank,ccc_fdr,pearson_fdr,spearman_fdr,understudied_score,min_n_pubs
TNNI2,MYOZ1,0.034593,0.967834,0.284206,3714425.5,12497377.0,5069769.0,1e-06,0.0,2.583831e-15,-0.107437,21.0
MYBPC2,TNNI2,0.034593,0.965012,0.314172,3714425.5,12497343.0,5504793.0,1e-06,0.0,1.486414e-18,0.185269,16.0
MYOZ1,PYGM,0.032751,0.964681,0.248365,3556384.0,12497334.0,4532919.0,5e-06,0.0,6.505173e-12,-0.107437,21.0
MYBPC2,PYGM,0.032234,0.957893,0.234647,3499276.0,12497205.0,4320035.0,3e-06,0.0,9.601925e-11,0.185269,16.0
MYLPF,PYGM,0.027118,0.955315,0.248569,2995235.0,12497111.0,4536023.0,4e-06,0.0,6.243948e-12,0.544017,12.0


In [51]:
# make sure gene pairs are statistically significant
data = data[data["pearson_fdr"] < 0.05]

In [52]:
data.shape

(876, 11)

In [53]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

876

In [54]:
gene_pairs[:10]

[('TNNI2', 'MYOZ1'),
 ('MYBPC2', 'TNNI2'),
 ('MYOZ1', 'PYGM'),
 ('MYBPC2', 'PYGM'),
 ('MYLPF', 'PYGM'),
 ('TNNI2', 'CKM'),
 ('TNNI2', 'ACTA1'),
 ('COX7A1', 'TNNI2'),
 ('MYH2', 'PYGM'),
 ('PYGM', 'MYL2')]

In [55]:
process_tissue_networks(gene_pairs, output_dir)

TMEM38A,ENO3: 100%|███████████████████████████████████████████████| 100/100 [02:16<00:00,  1.37s/it]


## Pearson vs CCC/Spearman

In [56]:
output_dir = OUTPUT_DIR / "pearson_vs_clustermatch_spearman"

In [57]:
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch_spearman.pkl").sort_values(
    "pearson", ascending=False
)

In [58]:
data.shape

(531, 11)

In [59]:
data.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman,clustermatch_rank,pearson_rank,spearman_rank,ccc_fdr,pearson_fdr,spearman_fdr,understudied_score,min_n_pubs
CCL20,SCGB3A1,0.031833,0.988292,0.187233,3471638.5,12497493.0,3544359.0,4e-06,0.0,2.829948e-07,-0.268826,41.0
CXCL1,SCGB3A1,0.007038,0.975534,0.078267,457775.5,12497441.0,1561424.0,0.057541,0.0,0.03490012,-0.268826,41.0
TNNI2,TPM2,0.026588,0.948215,0.193457,2930710.0,12496755.0,3650290.0,8e-06,0.0,1.098464e-07,0.134047,38.0
TPM2,PYGM,0.034912,0.94443,0.029852,3745874.0,12496416.0,605603.0,1e-06,0.0,0.4276798,0.134047,69.0
KRT19,CXCL1,0.008663,0.939751,0.097291,664617.0,12495782.0,1924833.0,0.019337,0.0,0.008512263,-0.142761,187.0


In [60]:
# make sure gene pairs are statistically significant
data = data[data["pearson_fdr"] < 0.05]

In [61]:
data.shape

(531, 11)

In [62]:
gene_pairs = convert_gene_pairs(data)
display(len(gene_pairs))

531

In [63]:
gene_pairs[:10]

[('CCL20', 'SCGB3A1'),
 ('CXCL1', 'SCGB3A1'),
 ('TNNI2', 'TPM2'),
 ('TPM2', 'PYGM'),
 ('KRT19', 'CXCL1'),
 ('ENO3', 'PYGM'),
 ('TACSTD2', 'SCGB3A1'),
 ('CCL20', 'KRT19'),
 ('S100A16', 'CXCL1'),
 ('CCL20', 'TPPP3')]

In [64]:
process_tissue_networks(gene_pairs, output_dir)

CAPS,ID1: 100%|███████████████████████████████████████████████████| 100/100 [02:13<00:00,  1.34s/it]
