In [1]:
import os
import glob
import pandas as pd
from pathlib import Path
import requests

In [2]:
# Location of the per-module consensus tables scanned later in the notebook.
# NOTE(review): absolute, machine-specific path — presumably it should be derived
# from `workdir` like `input_dir` below; confirm before running elsewhere.
module_path = Path("/home/bbc8731/HSV/3_module_expansion/data/categories_methods")

# `workdir` is the parent of the current working directory; input data is resolved against it.
workdir = os.path.dirname(os.getcwd())
input_dir = Path(f"{workdir}/7_protein_drugability/data/")

# UniProt -> Ensembl gene ID conversion table: the modules list UniProt IDs,
# whereas the druggability lookup is made by Ensembl gene ID.
ensembll_gene_ids = pd.read_csv(input_dir / "ensembl_gene_id.csv")

In [3]:
# Open Targets Platform GraphQL endpoint.
URL = "https://api.platform.opentargets.org/api/v4/graphql"

# Query for a single target, looked up by Ensembl gene ID. It returns the target's id,
# approved symbol and its tractability entries (label, modality, value).
# The query itself is not limited to one modality: get_sm_tractability keeps only the
# entries whose modality is "SM" (small molecule).
QUERY = """
query targetQuery($ensemblId: String!) {
  target(ensemblId: $ensemblId) {
    id
    approvedSymbol
    tractability {
      label
      modality
      value
    }
  }
}
"""

def get_sm_tractability(ensembl_id):
    """Fetch the small-molecule ("SM") tractability entries of one target.

    Posts QUERY to the GraphQL endpoint URL with ``ensembl_id`` as the
    ``ensemblId`` variable and keeps the tractability entries whose modality
    is "SM".

    Parameters
    ----------
    ensembl_id : str
        Ensembl gene ID of the target to look up.

    Returns
    -------
    list[dict] or None
        One dict per SM entry with the keys ``ensembl_id``, ``symbol``,
        ``label`` and ``value``. ``None`` when the request fails, the body is
        not valid JSON, the response contains no target, or the target has no
        SM tractability entries.
    """
    try:
        response = requests.post(
            URL,
            json={
                "query": QUERY,
                "variables": {"ensemblId": ensembl_id}
            },
            timeout=20
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers requests versions whose .json() raises a plain
        # JSON decode error rather than a RequestException subclass.
        print(f"Request failed for {ensembl_id}: {e}")
        return None

    # A GraphQL response can carry an explicit null for "data" (query-level error)
    # or for "tractability"; dict.get's default is only used when the key is
    # absent, so nulls have to be normalised by hand.
    target = (data.get("data") or {}).get("target")
    if target is None:
        return None

    sm_entries = [
        {
            "ensembl_id": target["id"],
            "symbol": target["approvedSymbol"],
            "label": t["label"],
            "value": t["value"]
        }
        for t in (target.get("tractability") or [])
        if t["modality"] == "SM"
    ]

    return sm_entries if sm_entries else None

In [4]:
"""
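assign_rank checks the labels in `priority` from top to bottom and returns the
1-based position of the first one that is True (1 = best). If none is True,
it returns len(priority) + 1.

Example:

Approved Drug → False
Advanced Clinical → False
Phase 1 Clinical → False
High-Quality Ligand → True ← STOP

High-Quality Ligand is the 4th label, so assign_rank returns 4.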
"""

# Tractability labels ordered from best (first) to worst (last). assign_rank walks this
# list in order and returns the 1-based position of the first label that is set for a row.
priority = [
    "Approved Drug",
    "Advanced Clinical",
    "Phase 1 Clinical",
    "High-Quality Ligand",
    "Structure with Ligand",
    "High-Quality Pocket",
    "Med-Quality Pocket",
    "Druggable Family"
]

def assign_rank(row):
    """Return the druggability rank of one row (1 = best).

    The labels in ``priority`` are checked in order and the 1-based position
    of the first label whose value is set is returned.

    Parameters
    ----------
    row : mapping-like
        Anything with ``.get(label, default)`` (e.g. a pandas Series or a
        dict) holding one flag per tractability label.

    Returns
    -------
    int
        Position (1..len(priority)) of the first label that is set, or
        ``len(priority) + 1`` when none is.
    """
    for rank, label in enumerate(priority, start=1):
        flag = row.get(label, False)
        # NaN/NA mean "no information" but NaN is truthy in Python, so missing
        # values have to be skipped explicitly instead of counting as evidence.
        if pd.isna(flag):
            continue
        if flag:
            return rank   # 1 = best
    return len(priority) + 1  # worst

In [5]:
# For every module's consensus table: keep the genes supported by at least `cutoff`
# methods, look up their small-molecule tractability, rank them with assign_rank and
# write one CSV per cutoff into <module dir>/drugability/.
module_files = sorted(module_path.rglob("*/consensus/uniprot_ppi.tsv"))

thresholds = [4]  # minimum value of n_methods for a gene to be kept

# Successful lookups are memoised so a gene that appears in several modules is only
# queried once. Empty results are not stored because get_sm_tractability also returns
# None for a failed request, which should be retried the next time the gene shows up.
tractability_cache = {}

for p in module_files:
    base_path = p.parents[1]
    print(base_path)

    modules = pd.read_csv(p, sep=None, engine='python', comment='#')

    for cutoff in thresholds:
        filtered = modules.loc[modules['n_methods'] >= cutoff, :]
        module_genes = filtered['gene'].dropna().astype(str)
        module_genes_df = module_genes.to_frame(name="uniprot_id")
        module_genes_df = pd.merge(module_genes_df, ensembll_gene_ids, on="uniprot_id", how="left")

        tractability_df_wide = []
        for uniprot_id, ensembl_id in zip(module_genes_df["uniprot_id"], module_genes_df["ensembl_id"]):

            # The left merge leaves NaN where a UniProt ID has no Ensembl mapping.
            if pd.isna(ensembl_id):
                continue

            tractability = tractability_cache.get(ensembl_id)
            if tractability is None:
                tractability = get_sm_tractability(ensembl_id)
                if tractability:
                    tractability_cache[ensembl_id] = tractability

            if not tractability:
                continue

            # Long (one row per label) -> wide (one column per label) for this target.
            tractability_df = pd.DataFrame(tractability)
            df_wide = tractability_df.pivot(index=["ensembl_id", "symbol"], columns="label", values="value").reset_index()

            df_wide["uniprot_id"] = uniprot_id
            tractability_df_wide.append(df_wide)

        # pd.concat raises ValueError on an empty list, so skip this cutoff instead of
        # aborting the remaining modules.
        if not tractability_df_wide:
            print(f"No tractability data for {base_path} at cutoff {cutoff}; nothing written")
            continue

        final_df = pd.concat(tractability_df_wide, ignore_index=True)

        # Targets may come back with different label sets; concat fills the gaps with NaN.
        # Missing values are normalised only here, after concatenation, so that they count
        # as False both in the filter below and in assign_rank.
        tract_cols = final_df.columns.difference(["ensembl_id", "symbol", "uniprot_id"])
        final_df[tract_cols] = final_df[tract_cols].fillna(False).astype(bool)

        # Keep rows where at least one tractability column is True
        filtered_df = final_df[final_df[tract_cols].any(axis=1)].copy()

        # apply(axis=1) on an empty frame returns a DataFrame, which cannot be assigned
        # to a single column.
        if filtered_df.empty:
            print(f"No tractable target for {base_path} at cutoff {cutoff}; nothing written")
            continue

        filtered_df["druggability_rank"] = filtered_df.apply(assign_rank, axis=1)
        ranked_df = filtered_df.sort_values("druggability_rank")

        output_dir = base_path / "drugability"
        output_dir.mkdir(parents=True, exist_ok=True)

        output_file = output_dir / f"protein_drugability_cutoff_{cutoff}.csv"
        ranked_df.to_csv(output_file, index=False)


/home/bbc8731/HSV/3_module_expansion/data/categories_methods/BP_Assembly_and_Packaging
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/BP_Egress_and_Envelopment
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/BP_Entry_and_Uncoating
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/BP_Immune_Evasion
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/BP_Replication_and_Transcription
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/BP_Uncharacterized
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/CC_Host_Cytoplasm
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/CC_Host_Membrane
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/CC_Host_Nucleus
/home/bbc8731/HSV/3_module_expansion/data/categories_methods/CC_Virion_Components


In [6]:
# Shows the table left in `ranked_df` by the loop above, i.e. the result of the last
# module directory / cutoff that was processed — not a combination of all modules.
ranked_df

label,ensembl_id,symbol,Advanced Clinical,Approved Drug,Druggable Family,High-Quality Ligand,High-Quality Pocket,Med-Quality Pocket,Phase 1 Clinical,Structure with Ligand,uniprot_id,druggability_rank
54,ENSG00000068024,HDAC4,False,True,True,True,False,True,False,True,P56524,1
51,ENSG00000196591,HDAC2,False,True,True,True,False,True,False,True,Q92769,1
36,ENSG00000165916,PSMC3,False,True,False,True,False,False,False,True,P17980,1
53,ENSG00000116478,HDAC1,False,True,True,True,False,False,False,True,Q13547,1
48,ENSG00000120899,PTK2B,True,False,True,True,True,False,False,True,Q14289,2
49,ENSG00000102096,PIM2,False,False,True,True,True,False,True,True,Q9P1W9,3
52,ENSG00000168036,CTNNB1,False,False,True,True,False,False,False,True,P35222,4
39,ENSG00000165280,VCP,False,False,True,True,True,False,False,True,P55072,4
22,ENSG00000112237,CCNC,False,False,False,True,False,False,False,True,P24863,4
21,ENSG00000177463,NR2C2,False,False,True,True,False,True,False,False,P49116,4
