# Find all PDBs associated with UniProt AC.

Conny Yu, 10/2024<br><br>
Using PDBe API, find all the PDB associated with a UniProt entry.<br>
This includes structures released by PDB but not yet available in the UniProt entries.<br>
Using UniProt API, find all PDBs associated to the UniProt entry.<br>
Using RCSB PDB API, find all references (including preprints) and their PMIDs associated to each PDB.<br>
Input: UniProt AC.<br><br>

In [9]:
import requests
from collections import defaultdict

# Function to fetch PDB codes from PDBe
def get_pdb_codes_from_pdbe(uniprot_ac):
    pdbe_search_url = f"https://www.ebi.ac.uk/pdbe/search/pdb/select?q={uniprot_ac}&wt=json&rows=100"
    response = requests.get(pdbe_search_url)
    
    if response.status_code != 200:
        print(f"\033[1mError fetching data from PDBe for AC {uniprot_ac}\033[0m")
        return []
    
    data = response.json()
    pdb_codes = set()

    if 'response' in data and 'docs' in data['response']:
        for entry in data['response']['docs']:
            if 'pdb_id' in entry:
                pdb_codes.add(entry['pdb_id'].lower())  # Store PDB codes in lowercase for uniformity
    
    return list(pdb_codes)

# Function to fetch PDB codes from UniProt
def get_pdb_codes_from_uniprot(uniprot_ac):
    uniprot_url = f"https://rest.uniprot.org/uniprot/{uniprot_ac}.xml"
    response = requests.get(uniprot_url)
    
    if response.status_code != 200:
        print(f"\033[1mError fetching data from UniProt for AC {uniprot_ac}\033[0m")
        return []
    
    xml_data = response.text
    pdb_codes = set()
    
    for line in xml_data.splitlines():
        if 'dbReference type="PDB"' in line:
            start = line.find('id="') + 4
            end = line.find('"', start)
            pdb_code = line[start:end]
            pdb_codes.add(pdb_code.lower())  # Store PDB codes in lowercase for uniformity
    
    return list(pdb_codes)

# Function to fetch PMIDs from RCSB PDB API
def get_pdb_pmids_from_rcsb(pdb_code):
    pdb_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_code}"
    response = requests.get(pdb_url)
    
    if response.status_code != 200:
        print(f"\033[1mError fetching data from RCSB PDB for PDB code {pdb_code}\033[0m")
        return []
    
    pdb_data = response.json()
    pmids = set()
    
    if 'rcsb_primary_citation' in pdb_data:
        primary_citation = pdb_data['rcsb_primary_citation']
        if 'pdbx_database_id_pub_med' in primary_citation:
            pmids.add(primary_citation['pdbx_database_id_pub_med'])
    
    if 'citation' in pdb_data:
        for citation in pdb_data['citation']:
            if 'pdbx_database_id_pub_med' in citation:
                pmids.add(citation['pdbx_database_id_pub_med'])
    
    return list(pmids)

def main():
    # Prompt the user for the UniProt AC number
    uniprot_ac = input("Enter UniProt AC: ").strip()
    
    # Step 1: Get all PDB codes associated with the given UniProt AC from PDBe
    pdb_codes_pdbe = set(get_pdb_codes_from_pdbe(uniprot_ac))
    
    if not pdb_codes_pdbe:
        print(f"No PDB codes found for UniProt AC {uniprot_ac} in PDBe.")
        return
    
    # Step 2: Get all PDB codes associated with the given UniProt AC from UniProt
    pdb_codes_uniprot = set(get_pdb_codes_from_uniprot(uniprot_ac))
    
    # Normalize to lowercase for consistent comparison
    pdb_codes_pdbe = {pdb_code.lower() for pdb_code in pdb_codes_pdbe}
    pdb_codes_uniprot = {pdb_code.lower() for pdb_code in pdb_codes_uniprot}
    
    # PDB codes that are in PDBe but not yet in UniProt
    pdb_not_in_uniprot = pdb_codes_pdbe - pdb_codes_uniprot
    
    # PDB codes that are common between PDBe and UniProt
    pdb_in_uniprot = pdb_codes_pdbe & pdb_codes_uniprot
    
    pmid_to_pdb = defaultdict(list)
    pdb_to_pmid = {}
    all_pmids = set()

    # Step 3: Process PDB codes that are already in UniProt
    if pdb_in_uniprot:
        for pdb_code in pdb_in_uniprot:
            pmids = get_pdb_pmids_from_rcsb(pdb_code)
            for pmid in pmids:
                pmid_to_pdb[pmid].append(pdb_code)
                pdb_to_pmid[pdb_code] = pmid
                all_pmids.add(pmid)

        # Print PMIDs and associated PDB codes for those in UniProt
        print(f"\033[1mPMIDs and associated PDB codes for UniProt AC {uniprot_ac}:\033[0m")
        
        # Sort PMIDs in descending order
        for pmid in sorted(pmid_to_pdb.keys(), reverse=True):
            print(f"PMID: {pmid}")
            print(f"  Associated PDBs: {', '.join(pdb.upper() for pdb in pmid_to_pdb[pmid])}")  # Uppercase for PDB codes

        # Sort PDBs by descending order of their associated PMIDs
        sorted_pdbs = sorted(pdb_in_uniprot, key=lambda pdb: pdb_to_pmid.get(pdb, 0), reverse=True)
        
        # Print all PDBs from UniProt
        print("\n\033[1mPDBs curated in UniProt:\033[0m")
        print(", ".join(sorted_pdbs))
    
    # Step 4: Print PDB codes not in UniProt
    if pdb_not_in_uniprot:
        print("\n\033[1m\033[31mPDBs not yet available in UniProt:\033[0m")
        print(", ".join(pdb_not_in_uniprot))
    
    # Step 5: Print all associated references (PMIDs)
    if all_pmids:
        print("\n\033[1mAll associated references (PMIDs):\033[0m")
        print(", ".join(str(pmid) for pmid in sorted(all_pmids, reverse=True)))
        
        # Output for PyMOL command
        print("\n\033[1mPyMOL fetch command for PDBs in UniProt:\033[0m")
        print("fetch " + "; fetch ".join(str(pdb_code).lower() for pdb_code in sorted_pdbs))

if __name__ == "__main__":
    main()


Enter UniProt AC:  P40967


[1mPMIDs and associated PDB codes for UniProt AC P40967:[0m
PMID: 35985289
  Associated PDBs: 7PHR
PMID: 33468649
  Associated PDBs: 6VM8
PMID: 26917722
  Associated PDBs: 5EU3, 5EU6, 5EU4, 5EU5
PMID: 24108701
  Associated PDBs: 4IS6
PMID: 19789338
  Associated PDBs: 3CC5
PMID: 15814707
  Associated PDBs: 1TVB, 1TVH

[1mPDBs curated in UniProt:[0m
7phr, 6vm8, 5eu3, 5eu6, 5eu4, 5eu5, 4is6, 3cc5, 1tvb, 1tvh, 6vma, 6vm9, 6vmc, 6vm7

[1m[31mPDBs not yet available in UniProt:[0m
9jsw, 9jsv, 9jsx, 9jst, 9jsu

[1mAll associated references (PMIDs):[0m
35985289, 33468649, 26917722, 24108701, 19789338, 15814707

[1mPyMOL fetch command for PDBs in UniProt:[0m
fetch 7phr; fetch 6vm8; fetch 5eu3; fetch 5eu6; fetch 5eu4; fetch 5eu5; fetch 4is6; fetch 3cc5; fetch 1tvb; fetch 1tvh; fetch 6vma; fetch 6vm9; fetch 6vmc; fetch 6vm7
