<a href="https://colab.research.google.com/github/connyyu/find_PDBs/blob/main/find_PDBs_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Find all PDBs associated with a UniProt AC.

Conny Yu, 08/2024<br><br>
Using the UniProt API, find all PDB entries associated with the UniProt entry.<br>
Using the RCSB PDB API, find all references (including preprints) and their PMIDs associated with each PDB entry.<br>
Input: UniProt accession (AC).

In [None]:
# @title Enter UniProt AC. {"vertical-output":true}
# UniProt accession to look up; main() reads this module-level variable.
# NOTE(review): the trailing "@param" marker looks like a Colab form-field
# annotation -- keep it on the same line as the assignment.
uniprot_ac = "P67775" # @param {"type":"string"}
import requests
from collections import defaultdict

def get_pdb_codes(uniprot_ac):
    """Return the PDB codes cross-referenced by a UniProt entry.

    Downloads the entry's XML record from UniProt and collects the ``id``
    attribute of every ``dbReference`` element whose ``type`` is ``"PDB"``.

    Args:
        uniprot_ac: UniProt accession, e.g. ``"P67775"``.

    Returns:
        A sorted list of unique PDB codes, spelled as they appear in the
        record. The list is empty when the entry has no PDB cross-references
        or when the record could not be fetched or parsed; in the failure
        case an error message is printed.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import xml.etree.ElementTree as ET

    uniprot_url = f"https://www.uniprot.org/uniprot/{uniprot_ac}.xml"
    error_message = f"\033[1mError fetching data from UniProt for AC {uniprot_ac}\033[0m"

    # Without a timeout, requests waits indefinitely on a stalled connection.
    try:
        response = requests.get(uniprot_url, timeout=30)
    except requests.RequestException:
        print(error_message)
        return []

    if response.status_code != 200:
        print(error_message)
        return []

    # Parse the raw bytes so the XML declaration decides the encoding.
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError:
        print(error_message)
        return []

    # A real XML parse does not depend on how the record is broken into lines
    # or on the order of the attributes inside each element.
    pdb_codes = set()
    for element in root.iter():
        # Tags are namespace-qualified ("{namespace}dbReference"); compare
        # only the local name so the namespace URI does not matter.
        if element.tag.rsplit("}", 1)[-1] != "dbReference":
            continue
        if element.get("type") != "PDB":
            continue
        pdb_code = element.get("id")
        if pdb_code:
            pdb_codes.add(pdb_code)

    # Sorted so repeated runs produce the same order.
    return sorted(pdb_codes)

def get_pdb_pmids(pdb_code):
    """Return the PubMed IDs cited by a PDB entry.

    Fetches the entry's JSON record from the RCSB PDB data API and reads the
    ``pdbx_database_id_pub_med`` field of ``rcsb_primary_citation`` and of
    every item in ``citation``.

    Args:
        pdb_code: PDB entry code, e.g. ``"2IAE"``.

    Returns:
        A list of unique PMIDs in the order they were encountered, with the
        primary citation first. The list is empty when the entry has no PMIDs
        or when the record could not be fetched or decoded; in the failure
        case an error message is printed.
    """
    pdb_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_code}"
    error_message = f"\033[1mError fetching data from RCSB PDB for PDB code {pdb_code}\033[0m"

    # Without a timeout, requests waits indefinitely on a stalled connection.
    try:
        response = requests.get(pdb_url, timeout=30)
    except requests.RequestException:
        print(error_message)
        return []

    if response.status_code != 200:
        print(error_message)
        return []

    # requests raises a ValueError subclass when the body is not valid JSON.
    try:
        pdb_data = response.json()
    except ValueError:
        print(error_message)
        return []

    # "or" fallbacks also cover keys that are present but null.
    citations = [pdb_data.get('rcsb_primary_citation') or {}]
    citations.extend(pdb_data.get('citation') or [])

    # A dict de-duplicates while keeping first-seen order, so the result is
    # the same on every run.
    pmids = {}
    for citation in citations:
        pmid = citation.get('pdbx_database_id_pub_med')
        if pmid is not None:
            pmids.setdefault(pmid, None)

    return list(pmids)

def main():
    """Print the PDB entries and PMIDs linked to the global ``uniprot_ac``.

    Looks up the PDB codes for the accession, fetches the PMIDs of each PDB
    entry, and prints four sections: PMIDs with their PDB codes, all PDB codes
    (lower case), all PMIDs, and a ready-to-paste PyMOL ``fetch`` command.
    Prints a single notice instead when no PDB codes are found.
    """
    # The accession is set at module level; to prompt for it instead, assign
    # uniprot_ac from input() here.
    global uniprot_ac

    # Sorted so every section prints in the same order on every run.
    pdb_codes = sorted(get_pdb_codes(uniprot_ac))
    if not pdb_codes:
        print(f"No PDB codes found for UniProt AC {uniprot_ac}.")
        return

    # Group PDB codes by PMID; keys keep first-seen order.
    pmid_to_pdb = defaultdict(list)
    for pdb_code in pdb_codes:
        for pmid in get_pdb_pmids(pdb_code):
            pmid_to_pdb[pmid].append(pdb_code)

    print(f"\033[1mPMIDs and associated PDB codes for UniProt AC {uniprot_ac}:\033[0m")
    for pmid, pdb_list in pmid_to_pdb.items():
        print(f"PMID: {pmid}")
        print(f"  Associated PDBs: {', '.join(pdb_list)}")

    lowercase_codes = [pdb_code.lower() for pdb_code in pdb_codes]

    # All associated PDBs
    print("\n\033[1mAll associated PDBs:\033[0m")
    print(", ".join(lowercase_codes))

    # Output for Asterix GM: the keys of pmid_to_pdb are exactly the set of
    # PMIDs seen, so no separate collection is needed.
    print("\n\033[1mAll associated references:\033[0m")
    print(", ".join(str(pmid) for pmid in pmid_to_pdb))

    # Output for PyMOL command
    print("\n\033[1mTo use in Pymol command:\033[0m")
    print("fetch " + "; fetch ".join(lowercase_codes))

# Run the lookup when this code is executed directly (where __name__ is
# "__main__"), but not when it is imported as a module.
if __name__ == "__main__":
    main()


[1mPMIDs and associated PDB codes for UniProt AC P67775:[0m
PMID: 29273778
  Associated PDBs: 5W0W
PMID: 17055435
  Associated PDBs: 2IE3, 2IE4
PMID: 38570683
  Associated PDBs: 8RBX, 8RC4, 8RBZ
PMID: 35924897
  Associated PDBs: 7SOY
PMID: 33633399
  Associated PDBs: 7K36
PMID: 17174897
  Associated PDBs: 2NYL, 2NPP, 2NYM
PMID: 38123684
  Associated PDBs: 8TTB, 8TWI, 8TWE, 8SO0
PMID: 18922469
  Associated PDBs: 3DW8
PMID: 19916524
  Associated PDBs: 3K7W, 3K7V
PMID: 29872004
  Associated PDBs: 8TWI, 8TWE, 8SO0
PMID: 19716788
  Associated PDBs: 3FGA
PMID: 18394995
  Associated PDBs: 3C5W
PMID: 24100351
  Associated PDBs: 4LAC
PMID: 38150499
  Associated PDBs: 8U1X, 8U89
PMID: 23752926
  Associated PDBs: 4I5L, 4I5N
PMID: 23591866
  Associated PDBs: 4IYP
PMID: 32315618
  Associated PDBs: 6NTS
PMID: 38582449
  Associated PDBs: 8UWB
PMID: 33243860
  Associated PDBs: 7CUN
PMID: 25003389
  Associated PDBs: 4NY3
PMID: 17086192
  Associated PDBs: 2IAE
PMID: 36869814
  Associated PDBs: 7YCX
PM