# Description

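This notebook queries the GIANT web API (hb.flatironinstitute.org) to predict the most relevant tissue for a pair of genes and to download the corresponding tissue-specific gene network.

TODO: expand this description.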

# Modules

In [1]:
# %load_ext rpy2.ipython

In [2]:
import json
import tempfile
from pathlib import Path

import requests
import pandas as pd

from clustermatch import conf

# Settings

In [3]:
# DATASET_CONFIG = conf.GTEX
# GTEX_TISSUE = "whole_blood"
# GENE_SEL_STRATEGY = "var_pc_log2"

In [4]:
# # this is used for the cumulative histogram
# GENE_PAIRS_PERCENT = 0.70

In [5]:
# CLUSTERMATCH_LABEL = "Clustermatch"
# PEARSON_LABEL = "Pearson"
# SPEARMAN_LABEL = "Spearman"

# Paths

In [6]:
# INPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
# display(INPUT_DIR)

# assert INPUT_DIR.exists()

In [7]:
# Directory holding the "intersection_genes" files used below; it is created
# (including missing parents) when it does not exist yet.
OUTPUT_DIR = conf.GIANT["RESULTS_DIR"] / "intersection_genes"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/giant/intersection_genes')

# Load gene maps

In [8]:
# Table with one row per gene and the columns SYMBOL and ENTREZID.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
gene_id_mappings = pd.read_pickle(OUTPUT_DIR / "gene_map-symbol_to_entrezid.pkl")

In [9]:
# (number of genes, number of columns) of the mapping table
gene_id_mappings.shape

(2878, 2)

In [10]:
# Preview the first rows of the symbol/Entrez ID mapping table
gene_id_mappings.head()

Unnamed: 0,SYMBOL,ENTREZID
1,KDM6A,7403
2,CCDC107,203260
3,PTK2B,2185
4,RTN2,6253
5,ST20,400410


In [11]:
# Gene symbol -> Entrez ID lookup.
# The value column is selected explicitly instead of relying on squeeze(), which
# returns a scalar for a one-row table and leaves a DataFrame untouched when the
# table has additional columns.
gene_symbol_to_entrezid = gene_id_mappings.set_index("SYMBOL")["ENTREZID"].to_dict()

In [12]:
# Entrez ID -> gene symbol lookup.
# The value column is selected explicitly instead of relying on squeeze(), which
# returns a scalar for a one-row table and leaves a DataFrame untouched when the
# table has additional columns.
gene_entrezid_to_symbol = gene_id_mappings.set_index("ENTREZID")["SYMBOL"].to_dict()

# Functions

In [13]:
# Base URL of the gene-information endpoint; the Entrez ID of the gene is
# appended to it when a gene is looked up.
URL_GENE_INFO = "https://hb.flatironinstitute.org/api/genes/"

In [14]:
# Endpoint that receives a list of Entrez IDs (HTTP POST) and answers with the
# tissues/contexts used for the tissue prediction.
URL_TISSUE_PREDICTION = "https://hb.flatironinstitute.org/api/integrations/relevant/"

In [15]:
def gene_exists(gene_entrez_id):
    """
    Check whether a gene is known to the gene-information endpoint.

    Args:
        gene_entrez_id: Entrez ID of the gene (anything convertible with
            ``str``); it is appended to ``URL_GENE_INFO``.

    Returns:
        True if the request answers with HTTP 200 and the JSON payload
        contains both the "entrez" and "standard_name" keys, False otherwise.
    """
    response = requests.get(URL_GENE_INFO + str(gene_entrez_id))

    if response.status_code == 200:
        payload = response.json()
        # a valid gene record must carry both fields
        return all(field in payload for field in ("entrez", "standard_name"))

    return False

In [16]:
# testing
assert gene_exists(3458)
assert not gene_exists(100129354)

In [17]:
def predict_tissue(gene_pair_tuple):
    """
    Predict the most relevant tissue for a set of genes.

    Args:
        gene_pair_tuple: iterable with the Entrez IDs of the genes (typically
            a pair).

    Returns:
        A tuple ``(slug, url)`` taken from the first entry of the response
        whose context term belongs to the "BRENDA Ontology" database, or None
        if any of the genes does not exist (see ``gene_exists``) or if the
        response has no BRENDA entry.
    """
    # stop at the first unknown gene: no prediction can be made for it
    if not all(gene_exists(gene) for gene in gene_pair_tuple):
        return None

    params = {"entrez": list(gene_pair_tuple)}
    r = requests.post(URL_TISSUE_PREDICTION, json=params)
    data = r.json()

    # The top entry is not always a BRENDA term, so take the first one that is
    # (presumably entries are ordered by relevance - TODO confirm). Iterating
    # over the entries, instead of incrementing an index without a bound,
    # avoids an IndexError when no BRENDA entry is present.
    for tissue in data:
        if tissue["context"]["term"]["database"]["name"] == "BRENDA Ontology":
            return tissue["slug"], tissue["url"]

    return None

In [18]:
# testing
assert predict_tissue(("6903", "3458")) == (
    "nervous-system",
    "http://hb.flatironinstitute.org/api/integrations/nervous-system/",
)
assert predict_tissue(("100129354", "871")) is None

# cases where the top tissue is not brenda
assert predict_tissue(("3458", "10993")) == (
    "natural-killer-cell",
    "http://hb.flatironinstitute.org/api/integrations/natural-killer-cell/",
)
# FIXME: more tests needed here!

In [19]:
def rank_genes(all_genes, edges, query_gene_symbols):
    """
    Rank genes by the average weight of their edges to the query genes.

    Args:
        all_genes: iterable with the genes to rank.
        edges: DataFrame with one row per edge and the columns "gene1",
            "gene2" and "weight".
        query_gene_symbols: list-like with the query genes.

    Returns:
        A Series named "rank" (index named "gene") with the 0-based rank of
        each gene; rank 0 is the gene with the highest average weight to the
        query genes. Genes without any edge to a query gene are ranked last.
        Ties keep the iteration order of ``all_genes``. No degree correction
        is applied.
    """
    # these masks do not depend on the gene being ranked, so compute them once
    gene1_is_query = edges["gene1"].isin(query_gene_symbols)
    gene2_is_query = edges["gene2"].isin(query_gene_symbols)

    genes_query_degrees = {}

    for g in all_genes:
        # connections to query genes
        g_query_genes = edges[
            ((edges["gene1"] == g) & gene2_is_query)
            | ((edges["gene2"] == g) & gene1_is_query)
        ]

        n_query_edges = g_query_genes.shape[0]

        if n_query_edges > 0:
            genes_query_degrees[g] = g_query_genes["weight"].sum() / n_query_edges
        else:
            # no edge to any query gene: 0/0 would give NaN, which makes the
            # sort order below undefined; -inf always sorts last instead
            genes_query_degrees[g] = float("-inf")

    # no degree correction; sorted() is stable, so ties keep insertion order
    ranked_genes = sorted(
        genes_query_degrees, key=lambda gene: -genes_query_degrees[gene]
    )

    # build the Series directly: DataFrame(...).squeeze() returns a scalar when
    # there is a single gene and fails when there are none
    return pd.Series(
        range(len(ranked_genes)),
        index=pd.Index(ranked_genes, name="gene"),
        name="rank",
        dtype="int64",
    )

In [20]:
def get_network(gene_entrezids=None, gene_symbols=None, max_genes=15):
    """
    Download the network of a gene pair in its predicted tissue and reduce it
    to the genes most strongly connected to the pair.

    Args:
        gene_entrezids: pair with the Entrez IDs of the two genes. When given,
            it takes precedence over ``gene_symbols``.
        gene_symbols: pair with the symbols of the two genes; only used when
            ``gene_entrezids`` is None.
        max_genes: maximum number of top-ranked genes (see ``rank_genes``) to
            keep in addition to the two query genes.

    Returns:
        A DataFrame with the columns "gene1", "gene2" and "weight" containing
        the edges among the selected genes whose weight is greater than the
        "mincut" value reported by the API. None is returned if a gene is not
        present in the local symbol/Entrez ID maps or if no tissue could be
        predicted for the pair.

    Raises:
        ValueError: if neither ``gene_entrezids`` nor ``gene_symbols`` is
            provided.
    """
    if gene_entrezids is None and gene_symbols is None:
        raise ValueError("No arguments provided")

    if gene_entrezids is not None:
        if (
            gene_entrezids[0] not in gene_entrezid_to_symbol
            or gene_entrezids[1] not in gene_entrezid_to_symbol
        ):
            return None
        gene_symbols = (
            gene_entrezid_to_symbol[gene_entrezids[0]],
            gene_entrezid_to_symbol[gene_entrezids[1]],
        )
    else:
        # unknown symbols return None, as unknown Entrez IDs do above,
        # instead of raising a KeyError
        if (
            gene_symbols[0] not in gene_symbol_to_entrezid
            or gene_symbols[1] not in gene_symbol_to_entrezid
        ):
            return None
        gene_entrezids = (
            gene_symbol_to_entrezid[gene_symbols[0]],
            gene_symbol_to_entrezid[gene_symbols[1]],
        )

    tissue_prediction = predict_tissue(gene_entrezids)
    if tissue_prediction is None:
        return None

    print(tissue_prediction[0])

    # the network endpoint lives under the URL of the predicted tissue
    url = tissue_prediction[1] + "network/"
    params = [("entrez", gene_entrezids[0]), ("entrez", gene_entrezids[1])]
    r = requests.get(url, params)
    data = r.json()

    mincut = data["mincut"]
    print(mincut)

    # The JSON payload is parsed with pandas through temporary files; the
    # context manager removes the directory afterwards (it used to be left
    # behind on every call).
    with tempfile.TemporaryDirectory(prefix="giant-") as temp_dir_name:
        temp_dir = Path(temp_dir_name)
        genes_json_file = temp_dir / "genes.json"
        edges_json_file = temp_dir / "edges.json"
        with open(genes_json_file, "w") as gf, open(edges_json_file, "w") as ef:
            json.dump(data["genes"], gf)
            json.dump(data["edges"], ef)

        genes = pd.read_json(genes_json_file)["standard_name"]
        edges = pd.read_json(edges_json_file)[["source", "target", "weight"]]

    # "source"/"target" are looked up in the index of the genes table to
    # obtain the gene names of each edge
    df = edges.join(genes.rename("gene1"), on="source", how="left").join(
        genes.rename("gene2"), on="target", how="left"
    )[["gene1", "gene2", "weight"]]

    # df = df[df["weight"] > mincut]

    # prioritize genes
    all_genes = set(df["gene1"]).union(set(df["gene2"]))
    # discard() instead of remove(): a query gene missing from the returned
    # network must not raise a KeyError
    all_genes.discard(gene_symbols[0])
    all_genes.discard(gene_symbols[1])

    genes_ranks = rank_genes(all_genes, df, gene_symbols)
    top_genes = set(genes_ranks.head(max_genes).index)
    top_genes.update(gene_symbols)
    df = df[(df["gene1"].isin(top_genes)) & (df["gene2"].isin(top_genes))]

    return df[df["weight"] > mincut].reset_index(drop=True)

In [21]:
# testing
assert get_network(("IFNG", "SDS")) is None
# TODO

In [22]:
# Regression check against the live API for the IFNG/GLIPR1 pair.
gene_symbols = ("IFNG", "GLIPR1")
df = get_network(gene_symbols=gene_symbols).round(4)
assert df.shape[0] == 134

# spot-check the first, one intermediate and the last edge of the network
for _row_pos, _expected_edge in (
    (0, ["HLA-DPA1", "GBP2", 0.8386]),
    (54, ["LCP2", "CASP1", 0.7856]),
    (-1, ["ITGB2", "HLA-DQB1", 0.8782]),
):
    pd.testing.assert_series_equal(
        df.iloc[_row_pos],
        pd.Series(_expected_edge),
        check_names=False,
        check_index=False,
    )

blood
0.16794658994089195


In [23]:
# Regression check against the live API for the ZDHHC12/CCL18 pair.
gene_symbols = ("ZDHHC12", "CCL18")
df = get_network(gene_symbols=gene_symbols).round(4)
assert df.shape[0] == 129

# spot-check the first, one intermediate and the last edge of the network
for _row_pos, _expected_edge in (
    (0, ["CCL3", "SCAMP2", 0.1110]),
    (72, ["ZDHHC12", "CTSB", 0.1667]),
    (-1, ["C1QA", "HLA-DQB1", 0.4485]),
):
    pd.testing.assert_series_equal(
        df.iloc[_row_pos],
        pd.Series(_expected_edge),
        check_names=False,
        check_index=False,
    )

macrophage
0.028307818501499497


# Predict tissue for each gene pair

## Clustermatch vs Pearson

In [26]:
# Load the "clustermatch_vs_pearson" gene pairs, sorted by the "clustermatch"
# column in descending order.
# NOTE(review): in the saved run this cell failed with FileNotFoundError because
# the file is not in OUTPUT_DIR; the other loading cells of this section read
# from INPUT_DIR - TODO confirm which directory is the right one.
data = pd.read_pickle(OUTPUT_DIR / "clustermatch_vs_pearson.pkl").sort_values(
    "clustermatch", ascending=False
)

FileNotFoundError: [Errno 2] No such file or directory: '/opt/data/results/giant/intersection_genes/clustermatch_vs_pearson.pkl'

In [25]:
# Preview the first rows of the loaded gene pairs
data.head()

NameError: name 'data' is not defined

In [52]:
# Build a list with one (gene1, gene2) record per row of `data`: the index is
# moved into the columns "level_0"/"level_1" and the gene symbols found in the
# symbol map are replaced by their Entrez IDs.
_gene_columns = ["level_0", "level_1"]

gene_pairs = list(
    data.reset_index()
    .replace({column: gene_symbol_to_entrezid for column in _gene_columns})[
        _gene_columns
    ]
    .to_records(index=False)
)

In [53]:
# Show the first ten gene pairs
gene_pairs[:10]

[('6903', '3458'),
 ('100129354', '871'),
 ('6362', '84885'),
 ('100129354', '9531'),
 ('11010', '3458'),
 ('100129354', '55466'),
 ('8740', '871'),
 ('283232', '3458'),
 ('56925', '3458'),
 ('100129354', '3315')]

## Clustermatch vs Pearson/Spearman

In [10]:
# NOTE(review): INPUT_DIR is only assigned in a commented-out cell in the
# visible part of this notebook, so this line raises NameError on a fresh
# kernel unless it is defined elsewhere - TODO confirm.
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson_spearman.pkl")

2888

## Clustermatch vs Spearman

In [11]:
# NOTE(review): INPUT_DIR is only assigned in a commented-out cell in the
# visible part of this notebook, so this line raises NameError on a fresh
# kernel unless it is defined elsewhere - TODO confirm.
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_spearman.pkl")

2892

## Pearson vs Clustermatch

In [12]:
# NOTE(review): INPUT_DIR is only assigned in a commented-out cell in the
# visible part of this notebook, so this line raises NameError on a fresh
# kernel unless it is defined elsewhere - TODO confirm.
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch.pkl")

3224

## Pearson vs Clustermatch/Spearman

In [13]:
# NOTE(review): INPUT_DIR is only assigned in a commented-out cell in the
# visible part of this notebook, so this line raises NameError on a fresh
# kernel unless it is defined elsewhere - TODO confirm.
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch_spearman.pkl")

3243