Jupyter notebook (.ipynb) for fetching contextual information and relevant PDB structures for cN-forming Asn/Gln residues. The notebook queries contextual information (gene, protein name, etc.) from the UniProt API and relevant PDB entries from the PDBe (Protein Data Bank in Europe) API.

For each provided UniProt accession and amino acid position, the script fetches all PDB codes with a resolution of 3.5 angstroms or finer whose mapped UniProt range contains the specified amino acid. The output JSON file containing the PDB codes is then used for automated ChimeraX analysis of each residue in relSESA_distance_calculation.py.

In [1]:
"""
Path to the input Excel spreadsheet containing UniProt IDs and amino acid positions.
The spreadsheet should have at least two columns:
    uniprot_id: The UniProt accession for the protein containing the relevant Asn/Gln residue
    aa_position: The amino acid position of the Asn/Gln residue in the protein
"""
# Fill this in before running the notebook; it is the path handed to pd.read_excel.
INPUT_EXCEL_FILEPATH = ""

# Filepath for .json output file. You can omit the .json tag
# NOTE(review): this value is passed unchanged to df.to_json, so presumably no
# extension is appended automatically - confirm before omitting it.
OUTPUT_FILENAME = ""

# Output filepath for csv error file. Include the .csv tag
# This is the file the final cell opens for writing the UniProt lookup errors.
PDB_FETCHING_ERRORS_CSV_FILENAME = ""

In [None]:
# Install dependencies (notebook `%pip` magics, one package per line)

# requests: HTTP calls to the PDBe and UniProt APIs
%pip install requests
# pandas: loads the input spreadsheet and holds the results table
%pip install pandas
# openpyxl / xlrd: never imported directly in this notebook - presumably the
# Excel reader backends used by pd.read_excel (TODO confirm which one is needed)
%pip install openpyxl
%pip install xlrd
# tqdm: progress bars for the per-row API lookups
%pip install tqdm

In [3]:
import requests

def get_sifts(uniprot_id):
    """
    Fetch the PDBe SIFTS "best structures" mapping for a UniProt accession.

    The endpoint lists the PDB structures mapping to the accession, sorted by
    coverage of the protein and, where coverage is the same, by resolution.
    https://www.ebi.ac.uk/pdbe/graph-api/pdbe_doc/#api-SIFTS-GetBestStructures

    Returns the decoded JSON payload, or None when the payload is an empty
    object (no SIFTS data in the database for this accession).
    """
    endpoint = f"https://www.ebi.ac.uk/pdbe/graph-api/mappings/best_structures/{uniprot_id}"
    payload = requests.get(endpoint).json()

    # An empty JSON object signals that nothing is known for this accession.
    return None if payload == {} else payload

def get_best_pdb(uniprot_id, aa_position):
    """
    List the PDB chains that cover a residue at acceptable resolution.

    Looks up the SIFTS best-structures mapping for ``uniprot_id`` and keeps
    every entry whose mapped UniProt range contains ``aa_position`` and whose
    resolution is a number no greater than 3.5 angstroms. Entries whose
    resolution is not numeric are dropped.

    Args:
        uniprot_id: UniProt accession to look up; also the key under which the
            structure list is read from the SIFTS payload.
        aa_position: Residue position that the structure must cover.

    Returns:
        None when there is no SIFTS data or the structure list is empty.
        Otherwise a list (possibly empty) of dicts with the keys ``pdb_id``,
        ``chain_id``, ``resolution`` and ``experimental_method``, in the order
        the API returned the entries.
    """
    print(uniprot_id)

    sifts_data = get_sifts(uniprot_id)

    # Return None if SIFTS data does not exist
    if not sifts_data:
        return None

    pdb_list = sifts_data[uniprot_id]

    # Return None if no (experimental) structures exist
    if not pdb_list:
        return None

    def is_usable(entry):
        resolution = entry['resolution']
        # Entries without a numeric resolution cannot be compared against the cutoff.
        if not isinstance(resolution, (int, float)):
            return False
        if resolution > 3.5:
            return False
        # Inclusive on both ends: a residue sitting exactly on unp_start or
        # unp_end is still part of the mapped segment. The previous strict
        # comparison silently discarded structures for such boundary residues.
        return entry['unp_start'] <= aa_position <= entry['unp_end']

    # Keep only the fields the downstream analysis needs.
    return [
        {
            'pdb_id': entry['pdb_id'],
            'chain_id': entry['chain_id'],
            'resolution': entry['resolution'],
            'experimental_method': entry['experimental_method'],
        }
        for entry in pdb_list
        if is_usable(entry)
    ]
    
def get_pdb_properties(pdb_code):
    """Return the decoded JSON body of the PDBe entry-summary endpoint for ``pdb_code``."""
    summary_url = f"https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/{pdb_code}"
    return requests.get(summary_url).json()

In [4]:
import pandas as pd

# Read the reference spreadsheet (path configured in INPUT_EXCEL_FILEPATH) into
# the dataframe that every later cell extends.
df = pd.read_excel(io=INPUT_EXCEL_FILEPATH)

In [None]:
# Look up candidate PDB structures for every row through the API and store the
# result of each lookup in a new 'pdb_data_obj' column.
from tqdm import tqdm

# Called before progress_apply is used below; the label is shown on the bar.
tqdm.pandas(desc="Progress")


def _fetch_row_structures(row):
    # One lookup per spreadsheet row, keyed by accession and residue position.
    return get_best_pdb(row.uniprot_id, row.aa_position)


df['pdb_data_obj'] = df.progress_apply(_fetch_row_structures, axis=1)

In [6]:
def extract_pdb_obj_value(pdb_obj, index):
    """Return ``pdb_obj.get(index)``, or None when ``pdb_obj`` itself is None."""
    # Guard first so a missing object never reaches the .get() call.
    if pdb_obj is None:
        return None
    return pdb_obj.get(index)

In [None]:
# Collects one (uniprot_id, error message) pair per failed UniProt lookup; the
# list is written out as a CSV report at the end of the notebook.
errors = []

def get_gene_protein_info(uniprot_id):
    """
    Look up the gene names and recommended protein name for a UniProt accession.

    This function does not raise: any failure (network error, timeout, HTTP
    error status, or a field missing from the response) is appended to the
    module-level ``errors`` list as ``(uniprot_id, message)``.

    Args:
        uniprot_id: UniProt accession to query.

    Returns:
        ``(True, genes, protein)`` on success, where ``genes`` is the list of
        gene-name values and ``protein`` is the recommended full name;
        ``(False, None, None)`` on any failure.
    """
    try:
        # Query Uniprot API for gene and protein names
        url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
        # Bounded wait so a single stalled connection cannot hang the whole run;
        # a timeout is caught below and reported like any other failure.
        response = requests.get(url, timeout=30)
        # Report HTTP failures with their status instead of letting them show
        # up as an opaque KeyError from the field lookups below.
        response.raise_for_status()
        response_data = response.json()

        protein = response_data['proteinDescription']['recommendedName']['fullName']['value']
        genes = [entry['geneName']['value'] for entry in response_data['genes']]

        return (True, genes, protein)
    except Exception as e:
        # Deliberately broad: lookups are best-effort per row, and every failure
        # is kept for the error report rather than aborting the run.
        errors.append((uniprot_id, str(e)))
        return (False, None, None)

tqdm.pandas(desc="Progress")


# Populate the gene and protein descriptions in the dataframe: each lookup
# returns a 3-tuple, which is wrapped in a Series so that it spreads across
# the three target columns.
def _uniprot_lookup_row(row):
    return pd.Series(get_gene_protein_info(row['uniprot_id']))


uniprot_columns = ['uniprot_query_successful', 'uniprot_genes', 'uniprot_protein']
df[uniprot_columns] = df.progress_apply(_uniprot_lookup_row, axis=1)

In [8]:
# Add the columns that the chimera processing step fills in later, each
# initialised to NaN.
# Note: NaN values are converted to null when DF is converted to JSON
# If manually adding relevant columns for processing, add them as 'null' values in JSON and NOT as blank columns in Excel!
for _placeholder_column in (
    'analyzed_position',
    'analyzed_aa',
    'calculated_SES',
    'relSESA',
    'distance',
    'is_experimental',
    'pdb_used',
    'pdb_chain',
    'pdb_resolution',
    'pdb_experimental_method',
    'ramachandran_phi',
    'ramachandran_psi',
    'ramachandran_switched_to_alphafold',
    'alphafold_switch_aa_used',
):
    df[_placeholder_column] = float('nan')

In [9]:
# Serialise the finished dataframe to the configured output path, indented by
# two spaces.
df.to_json(path_or_buf=OUTPUT_FILENAME, indent=2)

In [10]:
# Report any errors from the UniProt lookups as a CSV file (one row per failed
# accession: uniprot_id, error message). This can be cross-checked via manual
# search in the uniprot database

import csv

# newline='' is what the csv module expects for files it writes to; without it,
# platforms that translate '\n' on write (e.g. Windows) end up with an extra
# blank line after every row.
with open(PDB_FETCHING_ERRORS_CSV_FILENAME, 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(errors)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=08806ca9-3319-4bb4-9ddc-71a137575411' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>