In [None]:
# General python libraries and packages
import pandas as pd
import numpy as np
import redo
from tqdm.auto import tqdm
from IPython.display import HTML
import pprint
import os
import re
import warnings # As suggested by https://github.com/mwaskom/seaborn/issues/3486
warnings.simplefilter(action="ignore", category=FutureWarning)
import datetime
today = datetime.datetime.now()
# Zero-padded YYYYMMDD stamp. Concatenating the raw year/month/day integers is
# ambiguous (e.g. '2024111' could be 11 January or 1 November) and does not sort.
date = today.strftime('%Y%m%d')
pd.set_option('display.max_columns', 75)
pd.set_option('max_colwidth', 1000)

# Processing chemical data
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, Descriptors, rdMolDescriptors
from rdkit.Chem.MolStandardize import rdMolStandardize

# Accessing Databases
import requests
import requests_cache
import biotite.database.rcsb as rcsb
# from pypdb.clients.data.data_types import DataFetcher, DataType
# from bs4 import BeautifulSoup 
from urllib.request import urlretrieve 

# Data Visualization
import math
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
sns.set(style="whitegrid",font_scale=2)
import matplotlib.collections as clt
import ptitprince as pt

# Exploiting the PDB Search API
Code inspired by https://github.com/volkamerlab/teachopencadd/blob/master/teachopencadd/talktorials/T008_query_Data/PDB/talktorial.ipynb
and https://www.biotite-python.org/apidoc/biotite.database.rcsb.FieldQuery.html

In [None]:
# Install a cache for the HTTP requests made through the `requests` library.
# The cache is registered under the name 'rcsb_pdb' with the 'memory' backend.
requests_cache.install_cache(cache_name="rcsb_pdb", backend="memory")

In [None]:
def fetch_pdb_ids_by_ec_number_nochimera(ec_number, pfam):
    """
    Search the PDB for non-chimeric entries that match an EC number and a Pfam name.

    The number of hits is printed (together with the module-level `date` stamp)
    before the identifiers themselves are fetched.

    Parameters:
        ec_number (str): The EC number that is to be queried.
        pfam (str):      The Protein Family (Pfam) name that is to be queried.

    Returns:
        (list):          PDB identifiers matching the query request.
    """

    # All searchable attributes are listed at
    # https://search.rcsb.org/structure-search-attributes.html
    criteria = [
        # Entities annotated with the requested EC number
        rcsb.FieldQuery("rcsb_polymer_entity.rcsb_ec_lineage.id", exact_match=ec_number),
        # Entities annotated with the requested Pfam name
        rcsb.FieldQuery("rcsb_polymer_entity_annotation.name", exact_match=pfam),
        # Chimeras possess a polymer entity source count greater than 1; `~` negates the match
        ~rcsb.FieldQuery("rcsb_polymer_entity.rcsb_source_part_count", greater=1),
    ]

    # An entry has to satisfy all three criteria at once
    query = rcsb.CompositeQuery(criteria, "and")

    # Report how many entries match as of today
    hit_count = rcsb.count(query)
    print(f"{hit_count} structures (excluding chimeras) with the EC Number {ec_number}, belong to the protein family {pfam}, are deposited in the PDB as of {date}. \n")

    # Fetch and hand back the matching identifiers
    return rcsb.search(query)

In [None]:
# Retrieve all PDB identifiers for the given EC number and Pfam name
# via fetch_pdb_ids_by_ec_number_nochimera
pdb_ids = fetch_pdb_ids_by_ec_number_nochimera(
    ec_number="2.7.11.11",
    pfam="Protein kinase domain (Pkinase)",
)

# Show the identifiers on a single, space-separated line
print(*pdb_ids, sep=" ")

# Exploiting the PDB Data API to retrieve .pdb metadata

We use a GraphQL-based approach to retrieve the metadata associated with the .pdb structures. <br>
Note that we only fetch meta information on PDB structures here, we do not fetch the structures (3D coordinates).

In [None]:
def retrieve_pdb_info(pdb_id):

    """
    Retrieve further information about a PDB structure based on its PDB ID, exploiting the PDB-API.

    Parameters:
        pdb_id (str): The PDB ID of the structure to be queried.

    Returns:
        (dict):       The parsed JSON body of the GraphQL response.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout:   If the server does not answer within the timeout.
    """

    # Define the GraphQL query, requesting various information about the PDB structure, as a multiline string.
    query = f""" {{ entry(entry_id: "{pdb_id}") {{ 
    rcsb_id 
    struct {{title}}
    exptl {{ method }} 
    exptl_crystal {{ density_percent_sol }} 
    exptl_crystal_grow {{ method pH pdbx_details }} 
    rcsb_accession_info {{ deposit_date }} 
    rcsb_entry_container_identifiers {{ entry_id }} 
    rcsb_entry_info {{ deposited_nonpolymer_entity_instance_count deposited_polymer_entity_instance_count 
    resolution_combined structure_determination_methodology }} 
    rcsb_primary_citation {{ pdbx_database_id_DOI }} 
    polymer_entities {{ entity_poly {{ pdbx_seq_one_letter_code_can rcsb_entity_polymer_type rcsb_sample_sequence_length rcsb_mutation_count}} 
    rcsb_polymer_entity {{ pdbx_number_of_molecules formula_weight pdbx_description pdbx_mutation rcsb_enzyme_class_combined {{ ec }} }}             
    rcsb_entity_source_organism {{ ncbi_scientific_name ncbi_taxonomy_id rcsb_gene_name {{ value }} }} }} 
    assemblies {{ pdbx_struct_assembly {{ oligomeric_count }} }} 
    refine {{ ls_R_factor_R_free }} 
    struct_keywords {{ pdbx_keywords }} 
    }} }}
        """

    # Hand the query over as a URL parameter so that `requests` percent-encodes the
    # braces, quotes and line breaks, instead of pasting the raw text into the URL.
    # The timeout keeps a stalled connection from blocking the whole download loop.
    response = requests.get(
        "https://data.rcsb.org/graphql",
        params={"query": query},
        timeout=30,
    )

    # Raise an exception if the request failed.
    response.raise_for_status()

    # Parse the JSON response and return it.
    return response.json()

Each structure's metadata is returned as a dictionary, i.e. `pdb_infos` is a list of dictionaries.

In [None]:
# Fetch the metadata of every structure listed in `pdb_ids` with `retrieve_pdb_info`.
# tqdm wraps the identifiers to display a progress bar while the requests run.
pdb_infos = list(map(retrieve_pdb_info, tqdm(pdb_ids, desc="Retrieve PDB Infos")))

In [None]:
def replace_none_values(obj):
    """
    Replaces None values in a nested object with 'NaN' strings.

    None is replaced wherever it occurs: as a dictionary value as well as
    directly as an item of a list, at any nesting depth.

    Parameters:
        obj (dict or list): The nested object to be processed.

    Returns:
        None. The input object is modified in-place.
    """
    if isinstance(obj, dict):
        # Only values of existing keys are reassigned, so iterating the dict is safe
        for key in obj:
            value = obj[key]
            if value is None:
                obj[key] = 'NaN'
            elif isinstance(value, (dict, list)):
                replace_none_values(value)

    elif isinstance(obj, list):
        # Items have to be replaced by position; recursing into a None item would be a no-op
        for index, item in enumerate(obj):
            if item is None:
                obj[index] = 'NaN'
            elif isinstance(item, (dict, list)):
                replace_none_values(item)

In [None]:
def save_info_to_file(filename, retrieve_info, pdb_ids):
    """
    Save the received information on each structure to a file.

    Missing parent folders of `filename` are created. The file is written as
    UTF-8 and overwritten if it already exists.

    Parameters:
        filename (str):             The name of the file to save the information to.
        retrieve_info (function):   A function that retrieves the information for a given PDB ID.
        pdb_ids (list):             A list of PDB IDs to retrieve the information for.
    """

    # Make sure the target folder exists; open() would raise FileNotFoundError otherwise
    parent_folder = os.path.dirname(filename)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    # An explicit encoding keeps non-ASCII metadata from failing on platforms
    # whose default encoding cannot represent it
    with open(filename, "w", encoding="utf-8") as output_file:

        # Iterate over every PDB ID in the list
        for pdb_id in pdb_ids:

            # Retrieve the information for the current PDB ID
            pdb_info = retrieve_info(pdb_id)

            # Write the PDB ID and its information to the output file
            output_file.write(f"PDB ID: {pdb_id}\n")
            pprint.pprint(pdb_info, stream=output_file, width=1)

            # Add two line breaks between each element of the list
            output_file.write("\n\n")

# Folder layout: Data/PDB for the data, Data/PDB/Temp for intermediate files
filepath = os.path.join('Data', 'PDB')
tmp_folder = os.path.join(filepath, 'Temp')

# Dump the metadata of every structure into a plain-text file in the temp folder
save_info_to_file(
    filename=os.path.join(tmp_folder, 'PDB_API_StructureMetadata.txt'),
    retrieve_info=retrieve_pdb_info,
    pdb_ids=pdb_ids,
)

## Create dataframe
Next, we extract the metadata using the helper function `extract_from_dict` and create a pandas data frame with all the relevant metadata per structure.

In [None]:
def extract_from_dict(dictionary, keyword):

    """
    Helper function to collect every value stored under a keyword in a nested dictionary.

    Dictionaries are searched depth-first: a match in a dictionary is recorded
    before its values are visited. Lists found as dictionary values are scanned
    item by item; only items that are dictionaries are searched further.

    Parameters:
        dictionary (dict): The dictionary from which the values need to be extracted.
        keyword (str):     The keyword associated with the values that need to be extracted.

    Returns:
        (list):            The extracted values in the order they were found.
                           A list is always returned; it is ['NaN'] if the keyword does not occur.
    """

    def iter_matches(node):
        """Yield the values stored under the keyword in `node` and in everything nested below it."""

        # Anything that is not a dictionary cannot hold the keyword
        if not isinstance(node, dict):
            return

        # A match on the current level comes first
        if keyword in node:
            yield node[keyword]

        # Then descend into nested dictionaries and into the items of nested lists
        for child in node.values():
            if isinstance(child, dict):
                yield from iter_matches(child)
            elif isinstance(child, list):
                for element in child:
                    yield from iter_matches(element)

    matches = list(iter_matches(dictionary))

    # Fall back to a placeholder if nothing was found
    return matches if matches else ['NaN']

In [None]:
def create_pdb_info_df(pdb_ids, pdb_infos):
    """
    Build a data frame with one row of selected metadata per PDB entry.

    Parameters:
        pdb_ids   (List[str]):  List of PDB IDs (not used inside the function).
        pdb_infos (List[dict]): List of dictionaries containing PDB information.

    Returns:
        pd.DataFrame containing PDB information.

    """
    records = []

    for entry in pdb_infos:

        def first(keyword):
            """Return the first value found for the keyword in the current entry."""
            return extract_from_dict(entry, keyword)[0]

        def every(keyword):
            """Return all values found for the keyword in the current entry."""
            return extract_from_dict(entry, keyword)

        records.append({
            'PDB_ID':                  first("rcsb_id"),
            'Title':                   first("title"),
            'Nonpolymer_Entity_Count': first("deposited_nonpolymer_entity_instance_count"),
            'Protein_Entity_Count':    first("deposited_polymer_entity_instance_count"),
            'Exp_Method':              every('exptl'),
            'Cryst_Method':            first("exptl_crystal_grow"),
            'Deposit_Date':            first("deposit_date"),
            'Resolution':              first("resolution_combined"),
            'DOI':                     first("pdbx_database_id_DOI"),
            'Polymer_Entity_Dict':     every("entity_poly"),
            'Source_Organism':         every("rcsb_entity_source_organism"),
            'Polymer_Entity':          every("rcsb_polymer_entity"),
            'R_Free':                  every("ls_R_factor_R_free"),
            'Keywords':                every("pdbx_keywords"),
        })

    return pd.DataFrame(records)

In [None]:
# Assemble the pandas data frame holding the PDB information via create_pdb_info_df
pdb_info_df = create_pdb_info_df(pdb_ids=pdb_ids, pdb_infos=pdb_infos)
pdb_info_df.head(n=1)

# Parse .pdb Files to retrieve further info

https://www.wwpdb.org/documentation/file-format-content/format33/remarks1.html#REMARK%20280 <br>
**REMARK 280** presents information about the crystal. [...] Crystallization conditions are in free text.

https://www.wwpdb.org/documentation/file-format-content/format32/remarks2.html#REMARK%20610 <br>
**REMARK 610** enumerates non-polymer residues with missing atoms.

In [None]:
def extract_remarks_from_pdb_file(pdb_ids, remark):
    """
    Collect the text of one remark type from the PDB files of the given entries.

    Each file is read from https://files.rcsb.org/view/ without being saved locally.
    Parameters:
        pdb_ids (list):  List of PDB IDs to extract remarks for
        remark (str):    Type of remark to extract (e.g. 'REMARK 280', 'REMARK 610')
    Returns:
        pd.DataFrame: One row per PDB ID with the columns 'PDB_ID' and the remark type
                      (spaces replaced by underscores). If the download fails, the
                      remark column holds an error message instead.
    """
    column = remark.replace(' ', '_')
    prefix_length = len(remark)
    rows = []

    for pdb_id in pdb_ids:
        response = requests.get(f'https://files.rcsb.org/view/{pdb_id}.pdb')

        # Record an error message for entries whose file could not be fetched
        if response.status_code != 200:
            rows.append({'PDB_ID': pdb_id, column: f'Failed to retrieve {pdb_id}.pdb. Status code: {response.status_code}'})
            continue

        # Keep the lines of the requested remark type, cut off the remark tag itself
        payload = [
            line[prefix_length:]
            for line in response.text.splitlines()
            if line.startswith(remark)
        ]

        # Join the lines with single spaces and collapse runs of spaces
        rows.append({'PDB_ID': pdb_id, column: re.sub(' +', ' ', ' '.join(payload))})

    return pd.DataFrame(rows)

# Attach one column per remark type to pdb_info_df, matched on the PDB ID
for remark_type in ('REMARK 280', 'REMARK 610'):
    remark_df = extract_remarks_from_pdb_file(pdb_ids, remark_type)
    pdb_info_df = pdb_info_df.merge(remark_df, on='PDB_ID', how='left')

#pdb_info_df.head(1)

# Post-Processing of the pdb_info_df data frame

In [None]:
# Strip the fixed header text from the REMARK_610 column
substring = "MISSING HETEROATOM THE FOLLOWING RESIDUES HAVE MISSING ATOMS (M=MODEL NUMBER; RES=RESIDUE NAME; C=CHAIN IDENTIFIER; SSEQ=SEQUENCE NUMBER; I=INSERTION CODE): M RES C SSEQI"
pdb_info_df['REMARK_610'] = pdb_info_df['REMARK_610'].str.replace(substring, '', regex=False)

# Pattern for the number following the 'PH' substring in the REMARK_280 column:
#   PH                  the literal 'PH' substring
#   (?:=| )?            an optional '=' or ' ' character (non-capturing group)
#   (-?\d+(?:\.\d+)?)   the number, with optional negative sign and decimal part
ph_pattern = re.compile(r'PH(?:=| )?(-?\d+(?:\.\d+)?)')


def _ph_from_remark(text):
    """Return the first number following 'PH' in the text, or '' if there is none."""
    hit = ph_pattern.search(text)
    return hit.group(1) if hit else ''


pdb_info_df['pH'] = pdb_info_df['REMARK_280'].apply(_ph_from_remark)

# Drop the original columns
#pdb_info_df = pdb_info_df.drop(['REMARK_280', 'REMARK_610'], axis=1)

pdb_info_df.head(1)

In [None]:
def extract_values_from_dict_in_column(df, column_name, search_term):
    """
    Extracts, for every row of a DataFrame, the values stored under a search term
    in the dictionary (or list of dictionaries) held in the specified column.

    Parameters:
        df (pandas.DataFrame): The DataFrame to extract values from.
        column_name (str):     The name of the column to search for dictionaries.
        search_term (str):     The term to search for in the dictionaries.

    Returns:
        list: One entry per row, in row order. An entry is the result of
              `extract_values` for that row, or the string 'NaN' if that result is None.
              For a DataFrame without rows, the list ['NaN'] is returned.
    """
    # Initialize an empty list to store the extracted values
    new_list = []

    # Walk the column directly; building a full row Series for every index
    # just to read a single cell is needlessly slow
    for data in df[column_name]:
        extracted_values = extract_values(data, search_term)
        # Identity check: `!= None` would invoke the value's own comparison operator
        new_list.append('NaN' if extracted_values is None else extracted_values)

    # If no values were extracted at all, append 'NaN' to the list
    if len(new_list) == 0:
        new_list.append('NaN')

    return new_list


def extract_values(data, search_term):
    """
    Helper function to look up a search term in a dictionary or in a (nested) list of dictionaries.

    Parameters:
        data (list or dict):  The list or dictionary to extract values from.
        search_term (str):    The term to search for in the list or dictionary.

    Returns:
        For a dictionary: the value stored under the search term, or an empty list
                          if the search term is not a key of the dictionary.
        For a list:       a list with the result of this function for each item.
        Otherwise:        the string 'NaN'.
    """

    # A dictionary is queried directly
    if isinstance(data, dict):
        return data.get(search_term, [])

    # A list is processed item by item
    if isinstance(data, list):
        return [extract_values(entry, search_term) for entry in data]

    # Everything else cannot be searched
    return 'NaN'

In [None]:
# NOTE: 'R_Free' is deliberately not exploded on its own here. It is exploded further
# below together with 'Exp_Method' and 'Resolution', which requires the list-like cells
# of a row to have matching lengths. Exploding 'R_Free' beforehand would make that step
# fail for structures with several methods.

# Call the extract_values_from_dict_in_column function to extract the values
pdb_info_df['Exp_Method'] = extract_values_from_dict_in_column(pdb_info_df, 'Exp_Method', 'method')

# Flatten the list of lists in the Exp_Method column to a simple list 
pdb_info_df['Exp_Method'] = pdb_info_df['Exp_Method'].apply(lambda x: [item for sublist in x for item in sublist])

# If there is only one value in the Exp_Method list, extract the value, else return the original list
pdb_info_df['Exp_Method'] = [(x[0]) if len(x) == 1 else x for x in pdb_info_df['Exp_Method']]

# Cells with several methods still hold (unhashable) lists at this point,
# so count their string representation
print(pdb_info_df['Exp_Method'].astype(str).value_counts())

# Explode to separate for each data collection method, e.g. in case of joint XRD/ND
pdb_info_df = pdb_info_df.explode(['R_Free', 'Exp_Method', 'Resolution'], ignore_index=True)

# Drop the rows for all structures that were determined by other methods than X-Ray crystallography
# na=False treats cells without a string value as "no match" instead of producing an invalid mask
pdb_info_df = pdb_info_df[pdb_info_df['Exp_Method'].str.contains('X-RAY DIFFRACTION', case=False, na=False)]
pdb_info_df = pdb_info_df.reset_index(drop=True)

# pdb_info_df.head(1)

In [None]:
def _flatten_cryst_entry(entry):
    """Reduce one extracted crystallization entry to a single string."""
    # A list holding exactly one value is replaced by that value
    if len(entry) == 1 and type(entry) is list:
        entry = entry[0]

    # None and 'None' (also as the only item of a list) become 'NaN'
    if entry in [None, 'NaN', 'None'] or (
        type(entry) is list
        and len(entry) == 1
        and (entry[0] is None or entry[0] == 'NaN' or entry[0] == 'None')
    ):
        entry = 'NaN'

    # Remaining lists are joined into one comma-separated string
    if type(entry) is list:
        entry = ', '.join(map(str, entry))

    return entry


# Extract the crystallization method and the free-text details into two helper columns
pdb_info_df['Cryst_Method_Method']  = extract_values_from_dict_in_column(pdb_info_df, 'Cryst_Method', 'method')
pdb_info_df['Cryst_Method_Details'] = extract_values_from_dict_in_column(pdb_info_df, 'Cryst_Method', 'pdbx_details')

# Turn every entry of the helper columns into a plain string
for col in ['Cryst_Method_Method', 'Cryst_Method_Details']:
    pdb_info_df[col] = [_flatten_cryst_entry(entry) for entry in pdb_info_df[col]]

# Combine both helper columns into 'Cryst_Method', method first, separated by '; '
pdb_info_df['Cryst_Method'] = pdb_info_df['Cryst_Method_Method'].str.cat(pdb_info_df['Cryst_Method_Details'], sep='; ')

# The helper columns are no longer needed
pdb_info_df = pdb_info_df.drop(columns=['Cryst_Method_Method', 'Cryst_Method_Details'])

pdb_info_df.head(1)

**Crystallization technique (co-crystallized vs. soaked)** <br>
First, search title, Remark 280 and crystallization fields for keywords "soak" or "cocrystal" and variations. <br>
Next, for those structures with no annotation found and a doi present, manually check the original publications and add the info to the dataframe. 

In [None]:
def soaking_or_cocrystallization(x):
    """
    Classify a free-text annotation as 'soaked' or 'co-crystallized' by keyword search.

    The search is case-insensitive; 'soak' takes precedence over
    'cocrystal' / 'co-crystal' if both occur.

    Parameters:
        x (str): The text to search. Any non-string value (e.g. None or a float NaN)
                 is treated as text without annotation.

    Returns:
        (str): 'soaked', 'co-crystallized', or 'NaN' if no keyword was found.
    """
    # Missing cells are not strings and cannot be searched
    if not isinstance(x, str):
        return 'NaN'

    # Lower-case once instead of once per keyword
    text = x.lower()

    if 'soak' in text:
        return 'soaked'
    if any(var in text for var in ['cocrystal', 'co-crystal']):
        return 'co-crystallized'
    return 'NaN'

def _first_cryst_annotation(row):
    """Return the first annotation found in 'Cryst_Method', 'Title' or 'REMARK_280' of a row."""
    for column in ('Cryst_Method', 'Title', 'REMARK_280'):
        technique = soaking_or_cocrystallization(row[column])
        # soaking_or_cocrystallization signals "nothing found" with the *string* 'NaN'.
        # pd.isna() does not recognise that string, so it cannot be used to skip it.
        if technique != 'NaN':
            return technique
    return 'NaN'


# For each row, search the three columns in order and keep the first annotation found
pdb_info_df['Cryst_Technique'] = pdb_info_df.apply(_first_cryst_annotation, axis=1)

# Print the value counts of the 'Cryst_Technique' column
print(pdb_info_df['Cryst_Technique'].value_counts())

In [None]:
# Lists originate from manually checking the original publications
cocrystallized = [ "1CDK", "1CMK", "2GNF", "2GNH", "2GNI", "2GNJ", "2GNL", "2QUR", "2UZT", "2UZU", "2UZV", "2UZW", "3AG9", "3AGL", "3BWJ", "3FJQ", "3L9L", "3L9M", "3L9N", "3MVJ", "3QAL", "3QAM", "3VQH", "4C34", "4DFX", "4DFZ", "4DG0", "4DG2", "4DH1", "4DH3", "4DH5", "4DH7", "4DH8", "4HPT", "4HPU", "4IAC", "4IAD", "4IAF", "4IAI", "4IAK", "4IAY", "4IAZ", "4IB0", "4IB1", "4IB3", "4O21", "4UJ1", "4UJ2", "4UJ9", "4UJA", "4UJB", "4WB5", "4WB6", "4WB8", "4XW4", "4XW5", "4XW6", "4Z83", "4Z84", "5IZF", "5IZJ", "5J5X", "5NW8", "5O0E", "5O5M", "5OK3", "5OL3", "5OT3", "6C0U", "6E21", "6E99", "6E9L", "6EH2", "6EM6", "6EM7", "6EMA", "6ERT", "6ERW", "6ESA", "6I2A", "6I2B", "6I2C", "6I2D", "7PID", "7PIE", "7PIF", "7PIG", "7PIH", "7PNS", "7UJX", "7V0G", "7Y1G", "8SF8" ]
soaked = [ "2JDS", "2JDT", "2JDV", "2OH0", "2OJF", "2UW3", "2UW4", "2UW5", "2UW8", "2VNW", "2VNY", "2VO0", "2VO3", "2VO6", "2VO7", "3AMA", "3AMB", "3DND", "3DNE", "3NX8", "3X2U", "3X2V", "3X2W", "3ZO1", "3ZO2", "3ZO3", "3ZO4", "4AXA", "4C35", "4C36", "4C37", "4C38", "4IE9", "4IJ9", "4NTT", "4YXR", "4YXS", "5N33", "5N3C", "5N3D", "5N3E", "5N3H", "5N3J", "5N3Q", "5N3S", "6EM2", "6SNN", "6SNX", "6SOX", "6SPM", "6SPS", "6SPU", "6SPY", "6YPS", "6ZN0" ]

# Set the Cryst_Technique of every manually checked structure
# ('soaked' is applied first, 'co-crystallized' second)
for technique, curated_ids in (('soaked', soaked), ('co-crystallized', cocrystallized)):
    is_curated = pdb_info_df['PDB_ID'].isin(curated_ids)
    pdb_info_df.loc[is_curated, 'Cryst_Technique'] = technique

# Print the value counts of the 'Cryst_Technique' column
print(pdb_info_df['Cryst_Technique'].value_counts())

In [None]:
def _doi_as_link(doi):
    """Prefix a DOI with the doi.org resolver; missing values become 'NaN'."""
    if doi in [None, 'NaN', 'None']:
        return 'NaN'
    return f'https://doi.org/{doi}'


def _unwrap_single_resolution(value):
    """Convert a list holding exactly one resolution to a float; pass on anything else."""
    if isinstance(value, list) and len(value) == 1:
        return float(value[0])
    return value


# The first 10 characters hold the date; the time that follows is dropped
pdb_info_df['Deposit_Date'] = pdb_info_df['Deposit_Date'].str.slice(stop=10)

# Turn the plain DOI strings into DOI links, except for missing values
pdb_info_df['DOI'] = pdb_info_df['DOI'].apply(_doi_as_link)

# Single-element resolution lists become a float, everything else stays as it is
pdb_info_df['Resolution'] = [_unwrap_single_resolution(value) for value in pdb_info_df['Resolution']]

# Display data frame
pdb_info_df.head(1)

In [None]:
# First pass: one row per item of the three per-entity columns, exploded jointly.
# Second pass: 'Source_Organism' needs another round of flattening.
pdb_info_df = (
    pdb_info_df
    .explode(['Polymer_Entity_Dict', 'Polymer_Entity', 'Source_Organism'], ignore_index=True)
    .explode(['Source_Organism'], ignore_index=True)
)

In [None]:
# Values taken from the 'Polymer_Entity_Dict' column: new column -> dictionary key
polymer_dict_fields = {
    'Sequence':        'pdbx_seq_one_letter_code_can',
    'Sequence_Length': 'rcsb_sample_sequence_length',
    'Mutation_Count':  'rcsb_mutation_count',
}
for new_column, key in polymer_dict_fields.items():
    pdb_info_df[new_column] = extract_values_from_dict_in_column(pdb_info_df, 'Polymer_Entity_Dict', key)
pdb_info_df = pdb_info_df.drop(columns='Polymer_Entity_Dict')

# 'Gene_Name' has to be read before 'Source_Organism' is overwritten with the organism name
pdb_info_df['Gene_Name']       = extract_values_from_dict_in_column(pdb_info_df, 'Source_Organism', 'rcsb_gene_name')
pdb_info_df['Source_Organism'] = extract_values_from_dict_in_column(pdb_info_df, 'Source_Organism', 'ncbi_scientific_name')

# Values taken from the 'Polymer_Entity' column: new column -> dictionary key
polymer_entity_fields = {
    'MW':        'formula_weight',
    'Name':      'pdbx_description',
    'EC':        'rcsb_enzyme_class_combined',
    'Mutations': 'pdbx_mutation',
}
for new_column, key in polymer_entity_fields.items():
    pdb_info_df[new_column] = extract_values_from_dict_in_column(pdb_info_df, 'Polymer_Entity', key)
pdb_info_df = pdb_info_df.drop(columns='Polymer_Entity')

pdb_info_df.head(2)

In [None]:
# One row per item of the 'EC' column
pdb_info_df = pdb_info_df.explode(column='EC', ignore_index=True)

# Read the value stored under 'ec' into the new 'EC_Number' column
pdb_info_df['EC_Number'] = extract_values_from_dict_in_column(pdb_info_df, 'EC', 'ec')

# The original column is no longer needed
pdb_info_df = pdb_info_df.drop(columns='EC')

pdb_info_df.head(2)

In [None]:
# One row per item of the 'Gene_Name' column
pdb_info_df = pdb_info_df.explode(column='Gene_Name', ignore_index=True)

# Replace each entry by the value stored under its 'value' key
pdb_info_df['Gene_Name'] = extract_values_from_dict_in_column(pdb_info_df, 'Gene_Name', 'value')

# Normalise the spelling, step by step:
#   1. upper-case all gene names
#   2. 'PRKACA' -> 'PKACA', as these are synonyms
#   3. collapse the duplicate 'PKACA PKACA' into a single 'PKACA'
gene_names = pdb_info_df['Gene_Name'].str.upper()
gene_names = gene_names.str.replace('PRKACA', 'PKACA')
gene_names = gene_names.str.replace('PKACA PKACA', 'PKACA')
pdb_info_df['Gene_Name'] = gene_names

In [None]:
# Flatten the entries in the specified column from a list comprising a single dictionary to the dictionary itself
pdb_info_df = pdb_info_df.explode('Keywords', ignore_index=True)

# String Manipulation: First letter uppercase, all others lower case, for the specified columns
# Exceptions are the words 'NaN', 'pH' and 'cAMP'
cols_string_manipulation = ['Name', 'Cryst_Method', 'REMARK_280', 'Keywords', 'Exp_Method' ]

# Restore an abbreviation only where it is not part of a longer word. A plain
# substring replacement would also rewrite e.g. 'Phospholamban' to 'pHospholamban'
# or 'Nanobody' to 'NaNobody'. Digits and punctuation may follow ('Ph7.5', 'Camp-Dependent').
abbreviation_patterns = [
    (re.compile(r'(?<![A-Za-z])Nan(?![A-Za-z])'), 'NaN'),
    (re.compile(r'(?<![A-Za-z])Ph(?![A-Za-z])'), 'pH'),
    (re.compile(r'(?<![A-Za-z])Camp(?![A-Za-z])'), 'cAMP'),
]


def _title_case_keep_abbreviations(value):
    """Title-case a string and restore the spelling of 'NaN', 'pH' and 'cAMP'; pass on non-strings."""
    if not isinstance(value, str):
        return value
    text = value.title()
    for pattern, replacement in abbreviation_patterns:
        text = pattern.sub(replacement, text)
    return text


pdb_info_df[cols_string_manipulation] = pdb_info_df[cols_string_manipulation].map(_title_case_keep_abbreviations)

In [None]:
# Map empty cells, pd.NA and differently spelled 'NaN' strings onto the string 'NaN'
placeholder_map = {'': 'NaN', pd.NA: 'NaN', 'Nan': 'NaN', 'NAN': 'NaN'}
pdb_info_df = pdb_info_df.replace(to_replace=placeholder_map)

# print(len(pdb_info_df))

# # Drop rows where all values in the specified columns are NaN (missing values)
# pdb_info_df = pdb_info_df.dropna(
#     subset=['Sequence', 'Sequence_Length', 'Mutation_Count', 'MW', 'Name', 'EC_Number', 'Gene_Name', 'Source_Organism'], 
#     how='all')

# # Reset index
# pdb_info_df.reset_index(drop=True, inplace=True)

# Number of rows at this stage
print(len(pdb_info_df))

In [None]:
def string_join_list(val):
    """Join the items of a list into one comma-separated string; return any other value unchanged."""
    return ', '.join(str(item) for item in val) if isinstance(val, list) else val
    
# Run string_join_list on every cell of the data frame (element-wise DataFrame.map)
pdb_info_df = pdb_info_df.map(func=string_join_list)

pdb_info_df.head(1)

In [None]:
# Of each group of identical rows, only the first occurrence is kept
is_repeated_row = pdb_info_df.duplicated(keep='first')
pdb_info_df = pdb_info_df[~is_repeated_row].copy()

print(len(pdb_info_df))
pdb_info_df.head(2)

### Cross Check: Which EC-Numbers are present?

In [None]:
# Distinct EC numbers occurring in the data frame, in order of first appearance
ec_numbers_present = pdb_info_df['EC_Number'].unique().tolist()
print(ec_numbers_present)

'2.7.11.11' corresponds to Protein Kinase A

'2.7.1.37' is for protein kinases in general. 
Outdated - Dual labelling was only used for structures resolved before 2007!

In [None]:
# Rows without an EC number, heaviest entity first
# => e.g. the peptidic inhibitor present in nearly every structure
lacks_ec_number = pdb_info_df['EC_Number'].eq('NaN')
pdb_info_df_no_EC = pdb_info_df[lacks_ec_number].sort_values(by='MW', ascending=False)
print(len(pdb_info_df_no_EC))
#pdb_info_df_no_EC.head(25)

In [None]:
# Find a row in the data frame based on the entry in the PDB_ID column
# pdb_info_df[pdb_info_df['PDB_ID'] == '6ESA'] 

## Write beautified data frame to file

In [None]:
pdb_info_df.to_excel(os.path.join(tmp_folder, f"PDB_API_StructureMetadata.xlsx"), index=False)
pdb_info_df.to_csv(os.path.join(tmp_folder, f"PDB_API_StructureMetadata.csv"))

# Exclude structures, not comprising any non-polymer entity 

In [None]:
# Number of rows before the filter
print(len(pdb_info_df))

# Keep only the rows whose 'Nonpolymer_Entity_Count' is not 0,
# i.e. discard structures without any non-polymer entity, and renumber the rows
pdb_info_df = (
    pdb_info_df
    .loc[pdb_info_df['Nonpolymer_Entity_Count'].ne(0)]
    .reset_index(drop=True)
)

# Number of rows after the filter
print(len(pdb_info_df))

# Filtering based on Gene Names 

In [None]:
genenames = sorted(list(pdb_info_df.Gene_Name.unique()))

print(genenames)

| Gene Name     | Encoded protein                                                                | Protein Type       |
|---------------|--------------------------------------------------------------------------------|--------------------|
| PKA           | too generic, check the corresponding structures!                               | (generic)          |
| PKACA         | Protein Kinase A, *catalytic* subunit alpha                                    | catalytic subunit! |
| PKIA          | cAMP-dependent protein kinase *inhibitor* alpha                                | Peptidic Inhibitor |
| PKIA PRKACN1  | cAMP-dependent protein kinase *inhibitor* alpha                                | Peptidic Inhibitor |
| PLN           | *Phospholamban*, a calcium transporter protein (1.7 kDa)                       | Substrate of PKA   |
| PRKACN1       | cAMP-dependent protein kinase *inhibitor* alpha                                | Peptidic Inhibitor |
| PRKAR1A       | cAMP-dependent protein kinase type I-alpha *regulatory* subunit                | Regulatory Subunit |
| PRKAR1B       | cAMP-dependent protein kinase type I-beta *regulatory* subunit                 | Regulatory Subunit |
| PRKAR2B       | cAMP-dependent protein kinase type II-beta *regulatory* subunit                | Regulatory Subunit |
| RYR2          | *Ryanodine Receptor* 2 (24.3 kDa!)                                             | Substrate of PKA   |

=> Exclude structures, that correspond to/comprise the regulatory subunit of PKA or represent complexes with protein substrates!

In [None]:
# Gene names that are retained in the data set ('NaN' = no gene name annotated)
genenames_to_keep = ['PKA', 'PKACA', 'NaN', 'PKIA', 'PRKACN1', 'PKIA PRKACN1']

# Every other gene name observed in the data goes to the genenames_to_drop list
# (these correspond to the regulatory subunit or to protein substrates, which are not of interest here)
genenames_to_drop = list(filter(lambda name: name not in genenames_to_keep, genenames))

print('Gene Names we are not interested in, as they resemble the regulatory subunit or substrate proteins of PKA =')
print(genenames_to_drop)

In [None]:
# As the gene name 'PKA' is too generic, we check which structures are annotated with it
pdb_info_df_PKAgenename = pdb_info_df[pdb_info_df['Gene_Name'] == 'PKA'] 
# Show all rows belonging to these PDB entries (not only the rows annotated with 'PKA')
pdb_info_df[pdb_info_df['PDB_ID'].isin(pdb_info_df_PKAgenename['PDB_ID'].tolist())]

These 3 PDB-IDs (3E8C, 3E8E, 3MVJ) are all structures of the PKA catalytic subunit alpha. For each of them, another row with Gene_Name = PKACA is present in the data frame, for which all other information is the same. Therefore, we can simply drop the rows with Gene_Name = PKA, too.

In [None]:
# Number of rows before the filter
print(len(pdb_info_df))

# Discard the rows carrying the generic 'PKA' gene name annotation and renumber the remaining rows
pdb_info_df = (
    pdb_info_df
    .loc[pdb_info_df['Gene_Name'].ne('PKA')]
    .reset_index(drop=True)
)

# Number of rows after the filter
print(len(pdb_info_df))

## Exclude structures, that are/comprise the regulatory subunit or are substrates of PKA based on gene names
Has to be performed on the PDB IDs, as there might be multiple rows per ID and they might differ on the gene name, e.g. if a peptidic inhibitor is bound. Hence, first extract the PDB IDs to be excluded based on gene name, then exclude all entries with the given PDB IDs.

In [None]:
# Collect the PDB-IDs of all entries that contain at least one row whose gene name
# is in the genenames_to_drop list.
# A boolean mask is used for the selection, so pdb_info_df itself is left untouched
# (no temporary helper column has to be added to and dropped from the data frame).
has_unwanted_gene_name = pdb_info_df['Gene_Name'].isin(genenames_to_drop)
pdb_ids_to_drop = sorted(set(pdb_info_df.loc[has_unwanted_gene_name, 'PDB_ID']))
print(f'{len(pdb_ids_to_drop)} PDB-IDs are not considered as a consequence of the gene name filtering = \n {pdb_ids_to_drop} \n')

**Manual Cross-Check via PDB GUI**

PDB-ID | Reason for exclusion | 
| --- | --- |
2QCS | contains a regulatory subunit | 
3FHI | contains a regulatory subunit | 
3IDB | contains a regulatory subunit | 
3IDC | contains a regulatory subunit | 
3O7L | in complex with phospholamban-derived peptide | 
3PVB | contains a regulatory subunit | 
4DIN | contains a regulatory subunit | 
4WBB | contains a regulatory subunit | 
4X6R | contains a regulatory subunit | 
5JR7 | contains a regulatory subunit | 
6MM6 | in complex with RyR2 | 
6MM7 | in complex with RyR2 | 
6MM8 | in complex with RyR2 | 
6NO7 | contains a regulatory subunit | 

In [None]:
# Remove every row that belongs to one of the PDB-IDs in the pdb_ids_to_drop list
# and renumber the remaining rows
pdb_info_df = (
    pdb_info_df
    .loc[~pdb_info_df['PDB_ID'].isin(pdb_ids_to_drop)]
    .reset_index(drop=True)
)

# Cross-Check: report the gene names and the number of rows left after the gene name filtering
print(f'Gene Names after Filtering = \n {list(pdb_info_df["Gene_Name"].unique())} \n ')

print(f'Length of the data frame after Gene Name filtering = {len(pdb_info_df)}')

In [None]:
# row = pdb_info_df[pdb_info_df['Gene_Name'] == 'NaN'] 
# row

## Structures that also comprise another polymer entity (besides PKA)
Add a column named 'Peptidic_Inhibitor' to the pdb_info_df, <br> whose value is 'True' when the PDB entry comprises more than one polymer entity (e.g. a peptidic inhibitor in addition to PKA), else 'False'

In [None]:
# Flag every row whose PDB-ID occurs more than once in pdb_info_df,
# i.e. every entry for which another polymer entity than PKACA is present.
# duplicated(keep=False) marks *all* rows of a repeated PDB-ID in one vectorized pass
# (instead of looping over the groups and re-scanning the whole data frame for each PDB-ID).
pdb_info_df['Peptidic_Inhibitor'] = pdb_info_df['PDB_ID'].duplicated(keep=False)

# Write data frame to file
pdb_info_df.to_html(os.path.join(tmp_folder, 'PDB_PKA_pdb_info_after_genename_filtering.html'), header=True, index=False)

## Extract only the rows for the PKACA entity

In [None]:
# Restrict the data to the rows of the PKA protein itself (EC number 2.7.11.11);
# the rows of the other polymer entities are left out
pka_df = (
    pdb_info_df
    .loc[pdb_info_df['EC_Number'].eq('2.7.11.11')]
    .reset_index(drop=True)
)

print(len(pka_df))
pka_df.head(2)

### Cross-Check: Does any PDB ID still occur multiple times?

In [None]:
# Return a new DataFrame that only includes the rows where the PDB_ID column has a duplicate value 
# without altering the original pka_df DataFrame.
pka_df[pka_df['PDB_ID'].duplicated(keep=False)]

In [None]:
# Write data frame to file
pka_df.to_excel(os.path.join(tmp_folder, f'PDB_PKA_after_genename_filtering.xlsx'), header=True, index=False)

### PDB-IDs (after gene name filtering, only ligand-bound structures)

In [None]:
# Collect the unique PDB-IDs of pka_df (i.e. after the gene name filtering and the exclusion of
# structures not comprising any non-polymer entity) in the pdb_ids_pka list
pdb_ids_pka = pka_df['PDB_ID'].unique().tolist()
print(f'{len(pdb_ids_pka)} PDB-IDs after Gene Name and filtering exclusion of structure not comprising any non-polymer entity = \n {pdb_ids_pka} \n ')

# Persist the list, one PDB-ID per line
with open(os.path.join(tmp_folder, 'PDB_PKA_pdb_ids_after_genename_filtering.txt'), 'w') as f:
    f.write(''.join(f'{pdb_id}\n' for pdb_id in pdb_ids_pka))

In [None]:
# pka_df.head()

# Retrieve the Ligands

Now, get the ligand information.

In [None]:
def retrieve_pdb_ligand_info(pdb_id, timeout=60):
    """
    Fetch non-polymer (ligand) data for one PDB entry from the GraphQL API of rcsb.org.
    
    Parameters: 
        pdb_id (str):    The PDB ID of the structure to be queried.
        timeout (float): Seconds to wait for the server before giving up (default: 60).

    Returns: 
        info (dict):  A dictionary (parsed JSON response) containing information about the PDB structure.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout:   If the server does not answer within `timeout` seconds.
    """
    
    # Define the GraphQL query, requesting various information about the ligand(s), as a multiline string.
    query = (
       """{ entry(entry_id: "%s") {     
    rcsb_id
    nonpolymer_entities {
      rcsb_nonpolymer_entity_container_identifiers {nonpolymer_comp_id rcsb_id}
      nonpolymer_comp {
        chem_comp {formula formula_weight id name}
        rcsb_chem_comp_descriptor { SMILES SMILES_stereo }
        rcsb_chem_comp_related { resource_accession_code resource_name } }
      rcsb_nonpolymer_entity_annotation {type}
      rcsb_nonpolymer_entity_container_identifiers {entity_id rcsb_id}
      nonpolymer_entity_instances {
        rcsb_nonpolymer_entity_instance_container_identifiers {asym_id auth_asym_id auth_seq_id entity_id} 
        rcsb_nonpolymer_instance_validation_score {
                    RSCC
                    RSR
                    alt_id
                    completeness
                    intermolecular_clashes
                    is_best_instance
                    mogul_angle_outliers
                    mogul_angles_RMSZ
                    mogul_bond_outliers
                    mogul_bonds_RMSZ
                    ranking_model_fit
                    ranking_model_geometry
                    score_model_fit
                    score_model_geometry
                    stereo_outliers
                    average_occupancy
                    type
                    is_subject_of_investigation
                    is_subject_of_investigation_provenance}
                    } } } }    
        """
        % pdb_id
    )

    # Pass the query as a URL parameter, so that requests takes care of the URL-encoding
    # (the query text contains spaces, newlines, quotes and braces), and
    # set a timeout, so that an unresponsive server cannot block the notebook forever.
    response = requests.get("https://data.rcsb.org/graphql", params={"query": query}, timeout=timeout)
    
    # Raise an exception if the request failed.
    response.raise_for_status()
    
    # Parse the JSON response and return it.
    info = response.json()
    
    return info

In [None]:
# Iterate over the PDB IDs in the 'pdb_ids_pka' list 
# Call the 'retrieve_pdb_ligand_info' function to get further info on each structure 
# Exploiting the tqdm module to display a progress bar
lig_infos = [retrieve_pdb_ligand_info(pdb_id_pka) for pdb_id_pka in tqdm(pdb_ids_pka, desc="Retrieve PDB Ligand Infos")]

In [None]:
save_info_to_file(filename = os.path.join(tmp_folder, f"PDB_API_LigandMetadata.txt"), 
                  retrieve_info = retrieve_pdb_ligand_info, 
                  pdb_ids = pdb_ids_pka)

## Create dataframe

In [None]:
def create_lig_info_df(pdb_ids, lig_infos):
    
    """
    Assemble the ligand information retrieved via the PDB API into a data frame
    (one row per entry of lig_infos).

    Parameters:
        pdb_ids   (List[str]):  List of PDB IDs (not read by this function; the PDB ID is taken from each response).
        lig_infos (List[dict]): List of dictionaries containing the ligand information.

    Returns:
        lig_info_df (pd.DataFrame): Dataframe containing PDB information on the ligands.

    """
    # Output column -> key that is looked up in the API response via extract_from_dict
    column_to_key = {
        'Lig_ID':          "id",
        'Lig_Name':        "name",
        'Lig_Formula':     "formula",
        'Lig_SMILES':      "SMILES_stereo", # isomeric SMILES!
        'Lig_MW':          "formula_weight",
        'Lig_Database_ID': "rcsb_chem_comp_related",
        'Lig_Entity_ID':   "rcsb_nonpolymer_entity_container_identifiers",
        'Lig_Quality':     "nonpolymer_entity_instances",
    }

    records = []
    for lig_info in lig_infos:
        # The PDB ID is the first hit for "rcsb_id"; all other columns keep the full extraction result
        record = {'PDB_ID': extract_from_dict(lig_info, "rcsb_id")[0]}
        for column, key in column_to_key.items():
            record[column] = extract_from_dict(lig_info, key)
        records.append(record)

    return pd.DataFrame(records)

In [None]:
# Call the create_lig_info_df function to create a pandas.data frame containing the ligand information
lig_info_df = create_lig_info_df(pdb_ids_pka, lig_infos)
print(len(lig_info_df))
lig_info_df.head(1)

## Post-processing of the lig_info_df data frame

In [None]:
# Extract the Entity ID for the ligand 
lig_info_df['Lig_Entity_ID'] = extract_values_from_dict_in_column(lig_info_df, 'Lig_Entity_ID', 'rcsb_id')

In [None]:
# Create a list of columns that require explosion, i.e. transformation of the column lists into individual rows
explosion_list = ['Lig_ID', 'Lig_Name', 'Lig_Formula', 'Lig_SMILES', 'Lig_MW' , 'Lig_Database_ID', 'Lig_Entity_ID', 'Lig_Quality'] 

# Explode the data frame based on the explosion_list, i.e. transform each element of the column lists into a row
# Thereby, ignore the index and create a new one.
lig_info_df = lig_info_df.explode(explosion_list, ignore_index=True)
lig_info_df.head(2)

In [None]:
# Convert the dictionary to a list of cross-references to other databases
# by string joining the resource_name and resource_accession_code together, using the : separator 
lig_info_df['Lig_Database_ID'] = lig_info_df['Lig_Database_ID'].apply(
    lambda x: [f"{d['resource_name']}: {d['resource_accession_code']}" for d in x] if x is not None else [])

# Now we are able to extract the IDs from other databases, such as ChEMBL or drugbank, etc. to individual columns
def extract_database_ids(df, database_name):
    """
    Extract the accession codes of one external database into a new column.

    Reads the 'Lig_Database_ID' column, whose cells hold a list of
    '<resource_name>: <accession_code>' strings (or None / an empty list), and writes
    the codes belonging to `database_name` to the new column f'Lig_{database_name}_ID'.

    Parameters:
        df (pd.DataFrame):   Data frame with a 'Lig_Database_ID' column; it is modified in place.
        database_name (str): Database name, matched case-insensitively against the resource name.

    Returns:
        df (pd.DataFrame): The same data frame object with the new column added. Each cell of the
                           new column holds the string representation of the sorted, de-duplicated
                           list of accession codes, or the string 'NaN' if there is no
                           cross-reference to that database.
    """
    # Prefix that identifies entries of the requested database (computed once, not per cell)
    prefix = database_name.lower() + ':'

    def codes_for_database(cross_refs):
        # None or an empty list: no cross-references at all
        if not cross_refs:
            return 'NaN'
        # Split only at the first ': ', so that an accession code containing ': ' is not truncated
        codes = {ref.split(': ', 1)[1] for ref in cross_refs if ref.lower().startswith(prefix)}
        return str(sorted(codes)) if codes else 'NaN'

    # Create a new column in the dataframe with the extracted database IDs
    df[f'Lig_{database_name}_ID'] = df['Lig_Database_ID'].apply(codes_for_database)
    return df

# Loop through each database and extract the IDs
for database in ['ChEMBL', 'DrugBank']:
    lig_info_df = extract_database_ids(lig_info_df, database)

# Drop the helper column
lig_info_df = lig_info_df.drop(columns=['Lig_Database_ID'])

lig_info_df.head(3)

In [None]:
# Remove spaces from the strings in the Lig_Formula column
lig_info_df['Lig_Formula'] = lig_info_df['Lig_Formula'].str.replace(' ', '')

In [None]:
# lig_info_df[lig_info_df['PDB_ID'] == '5N3A'] 

In [None]:
lig_info_df.to_csv(os.path.join(tmp_folder, f'PDB_API_LigandMetadata.csv'), header=True, index=False)

## Group by "Ligands"

In [None]:
# Collapse lig_info_df to one row per ligand: for every 'Lig_ID', the PDB-IDs and the
# ligand entity IDs it occurs in are collected as lists; 'Lig_ID' becomes a regular column again
ligands_df = (
    lig_info_df
    .groupby('Lig_ID')
    .agg({'PDB_ID': list, 'Lig_Entity_ID': list})
    .reset_index()
)

print(f'{len(ligands_df)} different ligands have been identified initially based on Ligand ID.') 

# Re-attach the remaining ligand columns of the original data frame
# (the per-structure columns are removed first, then identical rows are collapsed)
ligands_df = ligands_df.merge(
    lig_info_df.drop(columns=['PDB_ID', 'Lig_Entity_ID', 'Lig_Quality']).drop_duplicates(),
    on='Lig_ID',
    how='left',
)

# Order the ligands by ascending 'Lig_MW' and renumber the rows
ligands_df = ligands_df.sort_values(by='Lig_MW').reset_index(drop=True)

# 'Occurence' = length of the PDB_ID list, i.e. how often was a particular ligand co-crystallized?
ligands_df['Occurence'] = ligands_df['PDB_ID'].map(len)

# Add molecule column 
PandasTools.AddMoleculeColumnToFrame(ligands_df, smilesCol="Lig_SMILES", molCol='Lig_Structure')

In [None]:
PandasTools.RenderImagesInAllDataFrames(True) # to overcome display error
ligands_df.head(1)

## Uncharge/Neutralize the Molecules

In [None]:
# Taken from https://github.com/chembl/ChEMBL_Structure_Pipeline/blob/master/chembl_structure_pipeline/standardizer.py#L291

def uncharge_mol(m):

    """
    Neutralize a molecule with the RDKit Uncharger, i.e. by adding and/or removing hydrogens.
        - For zwitter ions, hydrogens are moved to eliminate charges where possible.
        - By default, in cases where there is a positive charge that is not neutralizable, 
          an attempt is made to also preserve the corresponding negative charge.
        - The Uncharger's force option is not set here, i.e. it is left at its default.

    Parameters:
        m: The RDKit molecule to be uncharged.

    Returns:
        The uncharged molecule, as returned by the Uncharger.
    """
    # Build the RDKit Uncharger (canonical atom ordering enabled) and apply it in one go
    neutral_mol = rdMolStandardize.Uncharger(canonicalOrder=True).uncharge(m)

    # Refresh the property cache of the result (non-strict) before it is used for property calculations
    neutral_mol.UpdatePropertyCache(strict=False)

    return neutral_mol

In [None]:
# Apply the uncharge function
ligands_df['Lig_Structure_Uncharged'] = ligands_df['Lig_Structure'].apply(uncharge_mol)
# Convert the uncharged molecule to SMILES 
ligands_df['Lig_SMILES_Uncharged'] = ligands_df['Lig_Structure_Uncharged'].apply(Chem.MolToSmiles)
ligands_df.head(1)

In [None]:
# Write the data frame to a html file
ligands_df.to_html(os.path.join(tmp_folder, f"PDB_PKA_Ligands.html"), header=True)

## Create a subdata frame with molecules, that are "no proper ligands"

Exclude Molecules contained in the USAN Council’s list of pharmacological salts (Bento et al. 2020)

In [None]:
# Read the salts file into a pandas data frame and subsequently extract the SMILES strings into a list
salts_url = "https://raw.githubusercontent.com/chembl/ChEMBL_Structure_Pipeline/master/chembl_structure_pipeline/data/salts.smi"
salts_df = pd.read_csv(salts_url, sep="\t", names=["Name","SMILES"], header=None)

# Fix erroneous SMILES! 
salts_df.loc[salts_df['Name'] == 'Orotic acid', 'SMILES'] = 'C1=C(NC(=O)NC1=O)C(=O)O'
salts_df.loc[salts_df['Name'] == 'Nitrate', 'SMILES']     = '[N+](=O)([O-])[O-]'

PandasTools.AddMoleculeColumnToFrame(salts_df, smilesCol="SMILES", molCol='Structure')

# Uncharge the salts (the same way as done before for the PDB extracted ligands)
salts_df['Structure_Uncharged'] = salts_df['Structure'].apply(uncharge_mol)
salts_df['SMILES_Uncharged'] = salts_df['Structure_Uncharged'].apply(Chem.MolToSmiles)

# Compute the mass of the salts and sort the dataframe by the latter in ascending order
# (note: Descriptors.ExactMolWt is the monoisotopic mass; it is only used for ordering here)
salts_df['MW'] = salts_df['Structure'].apply(Descriptors.ExactMolWt)
salts_df = salts_df.sort_values(by=['MW'], ascending=True, na_position='first', ignore_index=True)

# Manually remove 'benzoate' from the salts_df, as it was found to be a ligand for PKA 
# (PDB-ID: 6SNN, published in https://doi.org/10.1002/anie.202011295 (Oebbeke et al 2020), hinge binder!)
salts_df = salts_df[salts_df.Name != 'Benzoate']

# Save to files 
salts_df.to_html(os.path.join(tmp_folder, "Salts.html"), header=True)
# The CSV file is written with to_csv (it was formerly filled with HTML markup by to_html);
# the RDKit molecule columns are left out, as they cannot be represented meaningfully as text.
salts_df.drop(columns=['Structure', 'Structure_Uncharged']).to_csv(
    os.path.join(tmp_folder, "Salts.csv"), header=True, index=False)
print(len(salts_df))

# Save SMILES to list
salts_smiles_uncharged = salts_df["SMILES_Uncharged"].tolist()

In [None]:
# Assemble ligands_kicked_df, the data frame of molecules that are "no proper ligands",
# based on the  USAN Council’s list of pharmacological salts and manually identified molecules (grouped by molecule type)

# Lig IDs of the molecules identified by manual inspection
manually_excluded_lig_ids = [
    # glycols / diols
    'EDO', 'BUD', 'PEG', 'MRD', 'MPD', '1PE', 'PGE', 'PG4', 'PG5', 'PG6',
    # sugars
    'RIP',
    # fatty acids, lipids, lipid-analogs and detergents
    'MYR', 'MG8', 'ZEU',
    # solvents and biochemical buffers
    'MOH', 'DMS', 'MES',
    # amino acids
    'GLY', 'SER', 'PRO', 'THR', 'DAR',
]

# All molecules that possess only a one or two letter molecular formula = ions
is_ion = ligands_df['Lig_Formula'].map(len) < 3
# Pharmacological salts as defined by the USAN Council
is_salt = ligands_df['Lig_SMILES_Uncharged'].isin(salts_smiles_uncharged)
# Molecules, identified by manual inspection
is_manually_excluded = ligands_df['Lig_ID'].isin(manually_excluded_lig_ids)

# Select the rows matching any criterion, sort them by molecular weight and drop duplicates
ligands_kicked_df = (
    ligands_df[is_ion | is_salt | is_manually_excluded]
    .sort_values(by=['Lig_MW'], ascending=True, na_position='first', ignore_index=True)
    .drop_duplicates(subset=['Lig_ID'], keep='first')
)

print(f"{len(ligands_kicked_df)} Molecules will be disregarded, as they are 'no proper ligands'.")

ligands_kicked_df.to_html(os.path.join(tmp_folder, 'PDB_PKA_Ligands_Kicked.html'), header=True)

ligands_kicked_df

## Filter the list of ligands, based on the ligand definition

In [None]:
# Keep only the ligands of ligands_df whose Lig_ID is not contained in ligands_kicked_df,
# retain the first row per Lig_ID and renumber the rows
ligands_df_filtered = (
    ligands_df
    .loc[~ligands_df['Lig_ID'].isin(ligands_kicked_df['Lig_ID'])]
    .drop_duplicates(subset=['Lig_ID'], keep='first')
    .reset_index(drop=True)
)

print(f'After filtering, {len(ligands_df_filtered)} different ligands have been identified.')

ligands_df_filtered.head(2)

In [None]:
# Write to files 
ligands_df_filtered.to_html(os.path.join(filepath, f'PDB_PKA_Ligands_Filtered.html'), header=True, index=False) # FINAL FILE
ligands_df_filtered.copy().drop(['Lig_Structure', 'Lig_Structure_Uncharged'], axis=1).to_excel(os.path.join(filepath, f'PDB_PKA_Ligands_Filtered.xlsx'), header=True, index=False) # FINAL FILE

### Cross-Checking: Check for structures, comprising more than one ligand

In [None]:
# Turn the per-ligand data into one row per PDB ID: the 'PDB_ID' and 'Lig_Entity_ID' lists
# are exploded together, then the rows are ordered by 'PDB_ID' and renumbered
ligands_df_filtered_exploded = (
    ligands_df_filtered
    .explode(column=['PDB_ID', 'Lig_Entity_ID'])
    .sort_values(by='PDB_ID')
    .reset_index(drop=True)
)

# How often does each PDB_ID occur in the data frame?
pdb_id_counts = ligands_df_filtered_exploded['PDB_ID'].value_counts()

# PDB_IDs occurring more than once, i.e. structures for which multiple 'ligands' are present
multilig_pdb_ids = pdb_id_counts[pdb_id_counts > 1].index

if len(multilig_pdb_ids) > 0:
    print('Please note that some structures comprise multiple ligands matching our ligand definition. \n',
          'You may wish to revise the "ligand definition" employed, before you continue.')
    # All rows of the structures for which multiple 'ligands' are present
    multilig_df = ligands_df_filtered_exploded[ligands_df_filtered_exploded['PDB_ID'].isin(multilig_pdb_ids)]

else:
    print('After filtering, no PDB structure comprises more than 1 ligand.')
    multilig_df = None

multilig_df

In [None]:
# Write the unique PDB IDs in the dataframe to a list (order of first appearance is kept).
# ligands_df_filtered_exploded holds one row per ligand and PDB entry, so a structure comprising
# several ligands occurs in several rows; without de-duplication such a structure would be listed
# and counted multiple times.
pdb_ids_pka_ligands = ligands_df_filtered_exploded['PDB_ID'].drop_duplicates().to_list()

# Write to file
with open(os.path.join(tmp_folder, f'PDB_PKA_pdb_ids_after_genename_and_ligand_filtering.txt'), 'w') as f: 
    f.writelines("%s\n" % pdb_id for pdb_id in pdb_ids_pka_ligands)

# PDB-IDs that were lost, because none of their non-polymer entities matches our ligand definition.
# The reported number is derived from this very list, so that it always agrees with the IDs printed.
pdb_ids_without_ligand = sorted(set(pdb_ids_pka).difference(pdb_ids_pka_ligands))

print(f"The following PDB-IDs are not considered, as they do not comprise a ligand matching our definition:")
print(pdb_ids_without_ligand)
print(f"({len(pdb_ids_without_ligand)} PDB structures)")

**Manual cross-checking (via PDB website) validates this decision.**
| PDB-ID  | "No proper Ligand(s)"                                                                | Comment |
|---------|--------------------------------------------------------------------------------------|---------|
| 1CMK    | iodide ion [IOD] and myristic   acid [MYR] = is myristoylated                        |
| 4AE6    | acetate ion [ACT]                                                                    |
| 4DFZ    | myristic acid [MYR] = is   myristoylated                                             |
| 4DG2    | myristic acid [MYR] = is   myristoylated                                             |
| 4NTS    | myristic acid  [MYR]                                                                 |
| 5IZF    | sulfate ion [SO4]                                                                    |
| 5M0U    | (4R)-2-methylpentane-2,4-diol [MRD] and methanol [MOH]                               |
| 5N3M   | dimethylsulfoxide [DMS],   (4S)-2-methylpentane-2,4-diol [MPD] and D-Arginine [DAR]  | Dissertation C. Siefker 2018 |
| 5NTJ  | beta-D-ribopyranose [RIP] and   (4S)-2-Methyl-2,4-pentanediol [MPD]                  | [Müller et al 2019](https://doi.org/10.1021/acsomega.8b02364) |
| 5OUA  | beta-D-ribopyranose [RIP]                                                            | [Müller et al 2019](https://doi.org/10.1021/acsomega.8b02364) |
| 5OUS  | (4S)-2-methylpentane-2,4-diol [MPD]                                                  | [Müller et al 2019](https://doi.org/10.1021/acsomega.8b02364) |
| 6ERV  | (4S)-2-methylpentane-2,4-diol [MPD]                                                  | [Müller et al 2019](https://doi.org/10.1021/acsomega.8b02364) |

Is D-Arginine in 5N3M a "proper ligand"? <br> 
According to the primary literature, the D-Arg binds in the same position as the Arg19 of a PKI, as the structure was obtained in a PKI-free form. Hence, we can ignore this structure safely.

In [None]:
print(len(pdb_ids_pka_ligands), 'PDB structures comprise a small molecule ligand matching our "ligand definition"')
print(f'(out of {len(pdb_ids)}   PDB-Structures that were initially identified for our Protein of Interest): \n ')

In [None]:
# Sort ascending by Lig_ID and descending by Occurence and reset the index
ligands_df_filtered_exploded = ligands_df_filtered_exploded.sort_values(by= ['Lig_ID', 'Occurence'], ascending=[True, False]).reset_index(drop=True)
ligands_df_filtered_exploded.head(1)

In [None]:
# Write the Lig IDs in the filtered data frame to a list
# NOTE(review): ligands_df_filtered_exploded appears to hold one row per ligand and PDB entry, so a
# Lig ID is listed once per structure it occurs in - confirm that these repeats are intended in the file.
lig_ids = ligands_df_filtered_exploded['Lig_ID'].to_list()
# Sort them alphabetically
lig_ids = sorted(lig_ids)
# Print all IDs on one line, separated by spaces
print(*lig_ids)

# Save list to file (one Lig ID per line)
with open(os.path.join(filepath, f'PDB_PKA_ligand_ids.txt'), 'w') as f: # FINAL FILE
    f.writelines("%s\n" % lig_id for lig_id in lig_ids)

# Quality of Ligand Data

In [None]:
# Extract Ligand Quality Data, that was already contained in the lig_info_df to a new dataframe.

# Data Structure of the Lig_Quality column before: 
    # List, with one entry for each different type of ligand. 
    # Each entry possesses two dictionaries, rcsb_nonpolymer_entity_instance_container_identifiers & rcsb_nonpolymer_instance_validation_score
    # The latter stores the actual ligand quality measures, again structures as a list of dictionaries, per ligand copy.

# Explosion to create an individual row per different non-polymer instance  
# Data Structure after: one row/dictionary for each different type of ligand
lig_validation_df = lig_info_df.explode(['Lig_Quality'], ignore_index=True)

# Pull out the actual ligand quality measures
# Data structure after: list of dictionaries, with a dictionary per ligand copy
lig_validation_df['Lig_Quality'] = extract_values_from_dict_in_column(
    lig_validation_df, 'Lig_Quality', 'rcsb_nonpolymer_instance_validation_score')

# Second explosion, to obtain an individual row per ligand copy
# Data structure after: Simple dictionary encoding the ligand quality measures
lig_validation_df = lig_validation_df.explode(['Lig_Quality'], ignore_index=True)

# New column -> keyword of the quality measure inside the 'Lig_Quality' dictionary
quality_measure_keys = {
    'Lig_Conformer':    'alt_id',
    'Lig_Of_Interest':  'is_subject_of_investigation',
    'Lig_Completeness': 'completeness',
    'Lig_Occupancy':    'average_occupancy',
    'Lig_RSCC':         'RSCC',
    'Lig_RSR':          'RSR',
    'Lig_RMSZ_Bonds':   'mogul_bonds_RMSZ',
    'Lig_RMSZ_Angles':  'mogul_angles_RMSZ',
}

# Extract each measure to its own column, using the extract_values_from_dict_in_column function
# Data structure after: one value per measure, one row per copy of a ligand present in the PDB structure
for quality_column, score_key in quality_measure_keys.items():
    lig_validation_df[quality_column] = extract_values_from_dict_in_column(lig_validation_df, 'Lig_Quality', score_key)

# The original column is no longer needed, once all measures live in individual columns
lig_validation_df = lig_validation_df.drop(columns='Lig_Quality')

# Regroup by 'Lig_Entity_ID' and collect every quality measure as a list
# Data structure after: List of values, for each ligand (list length = 1 if only one copy of that ligand bound)
lig_validation_df = (
    lig_validation_df
    .groupby('Lig_Entity_ID')
    .agg({
        'Lig_Entity_ID': 'first',
        **{quality_column: list for quality_column in quality_measure_keys},
    })
    .reset_index(drop=True)
)

# The PDB_ID is given by the first four characters of the Lig_Entity_ID
lig_validation_df['PDB_ID'] = lig_validation_df['Lig_Entity_ID'].str[:4]

# Attach the ligand validation data to the filtered ligand dataframe
# The inner join returns only the rows with matching values in both dataframes ("proper" ligands only)
ligands_df_filtered_exploded = ligands_df_filtered_exploded.merge(
    lig_validation_df, on=['Lig_Entity_ID', 'PDB_ID'], how='inner')

# Number of alternative conformations = number of non-'NaN' entries in the Lig_Conformer list
ligands_df_filtered_exploded['Lig_Alt_Conf'] = ligands_df_filtered_exploded['Lig_Conformer'].apply(
    lambda conformers: sum(1 for alt_id in conformers if alt_id != 'NaN'))

ligands_df_filtered_exploded.head(1)

In [None]:
# # Store the number of different binding sites in a new column
# # Thereby assume, that only one ligand copy can be bound per binding site and 
# # that the occupancy values of alternate conformations, binding at the site, should sum up to 1 (or less, if not fully occupied)
# # Note, that this is just a rough estimate, that should be confirmed by inspecting the structure!
# # Sum up the occupancy values for each different ligand type, and round up to the next integer value
# # If x is not a list of numeric values, return 'NaN'

# ligands_df_filtered_exploded['Lig_Sites'] = ligands_df_filtered_exploded['Lig_Occupancy'].apply(
#     lambda x: math.ceil(sum(x)) if all(isinstance(i, (int, float)) for i in x) else 'NaN')

In [None]:
# Export the filtered ligand table (one row per ligand entity) as HTML and as Excel sheet
ligands_html_path = os.path.join(filepath, 'PDB_PKA_Ligands_Filtered_Exploded.html')
ligands_xlsx_path = os.path.join(filepath, 'PDB_PKA_Ligands_Filtered_Exploded.xlsx')

ligands_df_filtered_exploded.to_html(ligands_html_path, header=True)   # FINAL FILE
ligands_df_filtered_exploded.to_excel(ligands_xlsx_path, header=True)  # FINAL FILE

In [None]:
# ligands_df_filtered_exploded[ligands_df_filtered_exploded['PDB_ID'] == '3VQH'] 

In [None]:
# lig_validation_df[lig_validation_df['PDB_ID'] == '3VQH'] 

## Cross Check: Compare our Ligand Definition with the PDB's Ligand-Of-Interest (LOI) annotation

In [None]:
# Number of ligand entities for which at least one copy carries the 'Y' flag
n_loi_annotations = lig_validation_df['Lig_Of_Interest'].apply(lambda flags: 'Y' in flags).sum()
print(f"{n_loi_annotations} 'Ligands Of Interest' Annotations are given in the PDB")

# Number of distinct PDB entries present in the validation table
n_ligand_bound_structures = lig_validation_df['PDB_ID'].nunique()
print(f"(for {n_ligand_bound_structures} ligand-bound PDB structures).")

Investigate whether our ligand definition excluded any ligand that carries an LOI annotation.

In [None]:
# Create a new dataframe, by merging the specified columns of the lig_info_df and lig_validation_df
loi_excluded = pd.merge(lig_info_df,
                        lig_validation_df[['Lig_Entity_ID', 'Lig_Of_Interest']],
                        on=['Lig_Entity_ID'], how='outer')

# Include only the rows/ligands that were kicked, as they do not match our ligand definition
loi_excluded = loi_excluded[loi_excluded['Lig_ID'].isin(ligands_kicked_df['Lig_ID'])]

# Keep only rows for which the LOI annotation equals YES.
# 'Lig_Of_Interest' holds one list of flags per ligand entity (one flag per bound copy), so a plain
# `== 'Y'` comparison never matches; test for membership instead. Rows without validation data are
# NaN after the outer merge and must not be treated as a list. A bare 'Y' string is accepted as well.
has_loi_flag = loi_excluded['Lig_Of_Interest'].apply(
    lambda flags: flags == 'Y' if isinstance(flags, str)
    else isinstance(flags, (list, tuple)) and 'Y' in flags
).astype(bool)  # guarantees a boolean mask, even if the frame is empty
loi_excluded = loi_excluded[has_loi_flag]

print("\n The ligands with 'Ligand Of Interest' annotation in the PDB, that we lost in the filtering steps are: ")
print(f"{loi_excluded['Lig_ID'].unique()} (from {loi_excluded['PDB_ID'].nunique()} different PDB structures)")

# loi_excluded = loi_excluded.groupby(['Lig_ID', 'Lig_Name', 'Lig_Of_Interest', 'Lig_SMILES', 'Lig_CHEMBL_ID']).agg({'PDB_ID': list}).reset_index()
# loi_excluded

Check whether our analysis includes any ligands that do not carry an LOI annotation in the PDB.

In [None]:
# Select the ligands for which at least one copy is flagged 'N' in the 'Lig_Of_Interest' list
lacks_loi_flag = ligands_df_filtered_exploded['Lig_Of_Interest'].apply(lambda flags: 'N' in flags)
ligands_without_annotation = ligands_df_filtered_exploded[lacks_loi_flag]

n_without_annotation = len(ligands_without_annotation)
print(f"{n_without_annotation} ligands are included in my definition, that do not possess the")

lig_ids_without_annotation = list(ligands_without_annotation['Lig_ID'])
print(f"PDB 'Ligand Of Interest' Annotation, namely {lig_ids_without_annotation}")

But is/are there any 'Ligand of Interest' Annotation(s) given in these PDB structures at all?

In [None]:
# Unpack the per-ligand lists again: explode every column except the two ID columns
id_cols = {'PDB_ID', 'Lig_Entity_ID'}
cols_to_explode = list(set(lig_validation_df.columns) - id_cols)
lig_validation_df_exploded = lig_validation_df.explode(cols_to_explode, ignore_index=True)

# Collect the PDB_IDs that fulfil both of the following conditions:
# 1. The PDB_ID occurs in ligands_without_annotation, i.e. we kept a ligand lacking the LOI flag
in_unannotated_structure = lig_validation_df_exploded['PDB_ID'].isin(ligands_without_annotation['PDB_ID'])
# 2. Some ligand copy in the validation table carries a PDB LOI annotation ('Y')
is_loi = lig_validation_df_exploded['Lig_Of_Interest'] == 'Y'
pdb_ids_with_other_loi = lig_validation_df_exploded['PDB_ID'][in_unannotated_structure & is_loi].unique()

# Show the ligands of our analysis that belong to these structures
ligands_df_filtered_exploded[ligands_df_filtered_exploded['PDB_ID'].isin(pdb_ids_with_other_loi)]

# Combine structure and ligand information 

In [None]:
# Build the "master data frame": attach the structure-level info (pka_df) to every ligand row.
# The left join keeps exactly the rows of ligands_df_filtered_exploded, so the result stays
# restricted to the ligands matching our ligand definition
df = pd.merge(ligands_df_filtered_exploded, pka_df, on='PDB_ID', how='left')

# Make sure the ligand IDs are stored as strings
df = df.assign(Lig_ID=df['Lig_ID'].astype(str))

print(df.shape[0])

# df.head(1)

In [None]:
# Create a new column 'Lig_Missing_Atoms' to indicate, per ligand copy, if ligand atoms are missing.
# A copy counts as incomplete when its Lig_Completeness value is < 1
# (the previous `> 1` test contradicted this rule and never flagged any ligand).
def _flag_missing_atoms(completeness_values):
    """Return 'Yes'/'No' per numeric completeness value, or ['NaN'] if none is numeric."""
    numeric_values = [value for value in completeness_values if isinstance(value, (int, float))]
    if not numeric_values:
        return ['NaN']
    return ['Yes' if value < 1 else 'No' for value in numeric_values]

df['Lig_Missing_Atoms'] = df['Lig_Completeness'].apply(_flag_missing_atoms)

# PDB-IDs of all rows in which at least one ligand copy is incomplete
incomplete_ligs = [pdb_id for pdb_id, lig_missing_atoms in zip(df['PDB_ID'], df['Lig_Missing_Atoms']) if 'Yes' in lig_missing_atoms]
# incomplete_ligs.sort()

print(f"According to the ligand quality measure 'Lig_Completeness', in {len(incomplete_ligs)} complex structures the ligand of interest is not fully resolved.")
print(*incomplete_ligs)

# NOTE(review): this count runs over all rows of df, not only the incomplete ones, and assumes
# that every REMARK_610 value supports the `in` operator (e.g. str or list) - confirm
print(f"For comparison, the REMARK 610 entry reports missing atoms for {sum(row['Lig_ID'] in row['REMARK_610'] for index, row in df.iterrows())} of them.")


# Add Binding Data from the PDBBind Database
(as obtained from the 'PDBbind/PDBbind_V2020.ipynb' JupyterNotebook)

In [None]:
# Load the cleaned-up PDBbind table ('|'-separated) from the PDBbind sub-folder
PDBbind_path = os.path.join(filepath, 'PDBbind')
pdbbind_csv_path = os.path.join(PDBbind_path, 'PDBbind_V2020.csv')
PDBbind_df = pd.read_csv(pdbbind_csv_path, sep="|")

# Harmonise the column names in a single pass: every column except 'PDB_ID'
# is converted to title case and receives the prefix 'PDBbind_'
PDBbind_df.columns = [
    name if name == 'PDB_ID' else 'PDBbind_' + name.title()
    for name in PDBbind_df.columns
]

# The PDBbind ligand name serves as ligand identifier, so store it as 'Lig_ID'
PDBbind_df = PDBbind_df.rename(columns={'PDBbind_Ligand_Name': 'Lig_ID'})

In [None]:
# Attach the PDBbind data to the "master dataframe" df.
# The left join keeps every row of df and adds binding data where PDB_ID and Lig_ID match
df = pd.merge(df, PDBbind_df, on=['PDB_ID', 'Lig_ID'], how='left')

# Rows for which PDBbind provides a standard value
has_binding_value = df['PDBbind_Standard_Value'].notna()
binding_data_df = df[has_binding_value]

n_with_affinity, n_total = len(binding_data_df), len(df)
print(f'For {n_with_affinity} out of {n_total} complex structures, a binding affinity was reported for in the PDBbind database')

# Save to file

In [None]:
# df.to_html(os.path.join(filepath, 'PDB_PKA_StructuralDetails_Ligands.html'), header=True, index=False) # FINAL FILE

# Write the master table to an Excel file, leaving out the two molecule-structure columns.
# `drop` returns a new frame, so df itself keeps these columns
structure_cols = ['Lig_Structure', 'Lig_Structure_Uncharged']
structural_details_xlsx = os.path.join(filepath, 'PDB_PKA_StructuralDetails_Ligands.xlsx')
df.drop(columns=structure_cols).to_excel(structural_details_xlsx, header=True, index=False)  # FINAL FILE

# Fetch pdb structures / files

The following code was taken from https://www.macinchem.org/reviews/Data/PDB/downloadpdb.php and adapted for our purposes
- Authored by Chris Swain
- Copyright CC-BY

In [None]:
# Report how many PDB-IDs are handled, followed by the IDs themselves
print(len(pdb_ids_pka_ligands))
print(*pdb_ids_pka_ligands)

# Write the IDs in alphabetical order to a text file, one ID per line
pdb_id_list_path = os.path.join(filepath, 'PDB_PKA_pdb_ids.txt')
with open(pdb_id_list_path, 'w') as id_file:  # FINAL FILE
    id_file.write(''.join(f'{pdb_id}\n' for pdb_id in sorted(pdb_ids_pka_ligands)))

In [None]:
# Download gzip-compressed files (True) or unpacked files (False)
compressed = False

# Target directory for the downloaded structure files
download_folder = os.path.join(filepath, 'pdb_files')

In [None]:
# Ensure the download folder exists.
# `exist_ok=True` only tolerates an already existing directory; other problems
# (e.g. missing permissions, or a file occupying the path) are raised instead of being
# silently ignored, so they surface here and not later during the download
os.makedirs(download_folder, exist_ok=True)

In [None]:
# PDB codes for which the download failed are collected here
failed_download = []

for pdb_id in pdb_ids_pka_ligands:
    # Use the first four characters as file stem (drops a ':1'-style entity suffix) and add '.pdb'
    filename = f'{pdb_id[:4]}.pdb'
    # Add .gz extension if compressed
    if compressed:
        filename += '.gz'

    destination_file = os.path.join(download_folder, filename)

    # Skip files that are already present from an earlier run
    if os.path.isfile(destination_file):
        continue

    url = f'https://files.rcsb.org/download/{filename}'
    try:
        urlretrieve(url, destination_file)
    except OSError:
        # e.g. because there is no corresponding file (.pdb or .pdb.gz) deposited in the PDB
        failed_download.append(f'{pdb_id}')
        # An interrupted transfer can leave a truncated file behind. Remove it, otherwise the
        # existence check above would treat it as a finished download on the next run
        if os.path.isfile(destination_file):
            os.remove(destination_file)

print(f'Download of {failed_download} failed.')

# Statistics

In [None]:
# Number of rows per experimental method
exp_method_counts = df['Exp_Method'].value_counts()
exp_method_counts

In [None]:
def analyze_distribution(df, metric, unit):
    """
    Summarise and plot the distribution of a numerical column.

    Prints the value range, the median and the PDB-ID(s) carrying the lowest and the
    highest value, then draws a horizontal raincloud plot of the values.

    Parameters:
        df (pd.DataFrame): Table holding at least the columns `metric` and 'PDB_ID'.
        metric (str):      Name of the column to analyse.
        unit (str):        Unit label appended to the metric name in the output, e.g. '[A]'.

    Returns:
        None
    """

    # Save numerical, non-nan values to list and sort in ascending order
    values = sorted([i for i in df[metric].tolist() if isinstance(i, (float, int)) and not math.isnan(i)])

    # Without any numerical value there is nothing to summarise or plot
    if not values:
        print(f'\nNo numerical {metric} values available; skipping the distribution analysis.')
        return

    minimum, maximum = values[0], values[-1]

    # Print Range
    print(f'\n{metric} {unit} ranges from {minimum} to {maximum} ')

    # Print Median
    print(f'{metric} median {unit} = {np.median(values)}')

    # Print PDB-ID(s) associated with lowest value (first element of the ascending list)
    lowest = df.loc[df[metric] == minimum, 'PDB_ID'].values
    print(f'The lowest {metric} value is reported for PDB-ID(s): {lowest}')

    # Print PDB-ID(s) associated with highest value (last element of the ascending list)
    highest = df.loc[df[metric] == maximum, 'PDB_ID'].values
    print(f'The highest {metric} value is reported for PDB-ID(s): {highest}')

    # Plot the distribution, using a Raincloud Plot
    f, ax = plt.subplots(figsize=(7, 2))
    pt.RainCloud(y = values, data = df, palette = "Set2", bw = .2, width_viol = .6, ax = ax, orient = "h")
    plt.xlabel(f'{metric} {unit}')

In [None]:
# Summarise and plot the resolution values of all complex structures
analyze_distribution(df, 'Resolution', '[A]')

### Binding Site
Binding sites were identified manually, using PyMOL. <br>
- The **Same_Number**, **Same_Binding_Site** and **Same_Orientation** columns compare all structures and chains with the same ligand. Consequently the boolean value is always the same for a given ligand.
    - Same_Number is True, when the same number of ligands is present in each of them
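    - Same_Binding_Site is True, when the ligand occupies the same binding site in each of them (TODO: confirm definition)
    - Same_Orientation is True, when the ligand adopts the same orientation in each of them (TODO: confirm definition)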
- The **Multisite** column records, if more than one different binding site is occupied by the given ligand, either in the same crystal structure/chain or in different ones. Consequently the boolean value is always the same for a given ligand.
- The **Orthosteric** and **Allosteric** columns store, how many ligands of this type are present in the given PDB structure. The **Lig_Order_in_.pdb** records, in which order these are present in the .pdb file. 
- The **Occurence** column stores, how many crystal structures are present in the PDB, with the given ligand. The **PDB_IDs** column lists all of them for the given ligand, i.e. is always the same for a given ligand. 

In [None]:
# Note, that the Boolean type column values are always the same for a given ligand!

# Specify columns, by which the DataFrame should be grouped
group_cols = ['Lig_ID', 'Lig_SMILES',
              #'Lig_Name', 'Lig_Formula', 'Lig_MW', 'Lig_CHEMBL_ID', 'Lig_SMILES_Uncharged',
              'Multisite', 'Same_Number', 'Same_Binding_Site', 'Same_Orientation']

# All columns, not in the group_cols list, should be aggregated to lists
def agg_list(x):
    """Collect the values of one group into a plain Python list."""
    return list(x)
agg_cols = {col: agg_list for col in df.columns if col not in group_cols}

# Conduct the actual grouping and aggregating
binding_site_df = df.groupby(group_cols).agg(agg_cols).reset_index()

# Flatten the aggregated columns
for col in agg_cols:
    # If there is only one value in the list, extract the value, else return the original entry.
    # The type is checked first, so that entries without a length cannot raise here
    binding_site_df[col] = [x[0] if isinstance(x, list) and len(x) == 1 else x for x in binding_site_df[col]]

# Add the molecular structure again, this time to the regrouped frame: the aggregation turned
# 'Lig_Structure' into lists, whereas 'Lig_SMILES' (a grouping column) still holds one SMILES per row
PandasTools.AddMoleculeColumnToFrame(binding_site_df, smilesCol="Lig_SMILES", molCol='Lig_Structure')

# Cross-check that we did not lose any ligand
print(f"Number of unique Lig-IDs before regrouping data = {df['Lig_ID'].nunique()}")
print(f"Number of unique Lig-IDs after regrouping data = {binding_site_df['Lig_ID'].nunique()}")

In [None]:
# Helper for adding a boolean type column to the dataframe, which is
# True, if the number or the sum of the list of numbers is greater than zero, and False otherwise.

def add_binding_type_boolean(x):
    """
    Tell whether a binding-site count (or a list of counts) is greater than zero.

    Parameters:
        x (int, float, list or tuple): A single count, or the counts of several structures.
                                       Non-numeric entries and NaN values are ignored.

    Returns:
        (bool): True if the sum of all valid counts is greater than zero, else False.
                Always a real boolean (never None), so the resulting column can be used
                directly as a boolean mask.
    """
    numeric_types = (int, float, np.integer, np.floating)
    counts = x if isinstance(x, (list, tuple)) else [x]
    # Skip NaN explicitly: a single NaN would otherwise turn the whole sum into NaN (-> False)
    total = sum(i for i in counts if isinstance(i, numeric_types) and not math.isnan(i))
    return bool(total > 0)

# Derive one boolean column per binding-site type from the respective count column
for site_type in ('Orthosteric', 'Allosteric'):
    binding_site_df[f'{site_type}_Boolean'] = binding_site_df[site_type].apply(add_binding_type_boolean)

# Save the table without the molecule-structure column to the temporary folder
binding_site_xlsx = os.path.join(tmp_folder, 'Binding_Site_Analysis.xlsx')
binding_site_df.drop(columns=['Lig_Structure']).to_excel(binding_site_xlsx, header=True, index=False)

In [None]:
# Overall number of ligands and how many of them bind at which type of site
n_ligands = binding_site_df['Lig_ID'].nunique()
print(f"Out of {n_ligands} ligands in total,")
n_orthosteric = binding_site_df['Orthosteric_Boolean'].sum()
print(f"{n_orthosteric} bind in the ATP (orthosteric) site and")
n_allosteric = binding_site_df['Allosteric_Boolean'].sum()
print(f"{n_allosteric} bind at minimum one allosteric site(s).")

# Ligands flagged as binding at more than one site
multisite_ligands = binding_site_df[binding_site_df['Multisite'] == True]
print(f"\n{len(multisite_ligands)} ligands bind at multiple sites (either at the ATP site plus minimum one allosteric site, or at multiple allosteric sites).")

# IDs of all ligands with at least one allosteric binding event
allosteric_lig_ids = binding_site_df.loc[binding_site_df['Allosteric_Boolean'], 'Lig_ID'].tolist()
print(f"\nAllosteric Ligands are: {allosteric_lig_ids}")