<a href="https://colab.research.google.com/github/glevans/PDBe_Notebooks/blob/main/Structure_summary_for_variant_course.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Setting-up

#@markdown <br>
#@markdown This section installs libraries and sets up Python code to extract information from PDBe-KB's APIs.
#@markdown <br>
#@markdown After running this cell, this Notebook will be able to connect to PDBe-KB's APIs and summarize some information about experimental structures that can be found on PDBe-KB pages.
#@markdown <br>

#@title Mount Drive to download files
######## LIBRARIES
import requests
import json
from pprint import pprint
import pandas as pd


######## FUNCTIONS
#defining functions for search and download
def get_structure_details(UniProt_id, timeout=30):
    """Fetch the PDBe graph-API "unipdb" record for a UniProt accession.

    Args:
        UniProt_id: UniProt accession; interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the notebook cell.

    Returns:
        The decoded JSON payload on success. On any failure a human-readable
        error string is returned instead of raising; callers in this notebook
        tell the two apart with ``isinstance(result, dict)``.
    """
    requestURL = f"https://www.ebi.ac.uk/pdbe/graph-api/uniprot/unipdb/{UniProt_id}"
    try:
        response = requests.get(requestURL, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
        return json.loads(response.text)
    except requests.exceptions.HTTPError as errh:
        return f"HTTP Error: {errh}"
    except requests.exceptions.ConnectionError as errc:
        return f"Error Connecting: {errc}"
    except requests.exceptions.Timeout as errt:
        return f"Timeout Error: {errt}"
    except requests.exceptions.RequestException as err:
        return f"Oops: Something Else: {err}"
    except ValueError as errj:
        # json.loads raises a ValueError subclass when the body is not valid JSON.
        return f"Invalid JSON in response: {errj}"

def parse_structure_data(structure_data, UniProt_id):
    """Summarise modified and mutated residues for every entry of a UniProt record.

    Args:
        structure_data: Decoded API payload; ``structure_data[UniProt_id]['data']``
            must be a list of items carrying 'accession' and 'residues' keys.
        UniProt_id: Key under which the record is stored in ``structure_data``.

    Returns:
        A dict mapping each item's 'accession' value to a list of rows
        (modification rows first, then mutation rows). Entries with no rows
        map to the placeholder ``[None, None, None, None]``.
    """
    structure_details = structure_data[UniProt_id]['data']
    UniProt_summary = {}

    for accession_item in structure_details:
        residues = accession_item['residues']
        # Both helpers return either a non-empty list or None; normalise to lists.
        modified_residues = modified_list(residues) or []
        mutated_residues = mutated_list(residues) or []

        # Concatenate rather than zip(): a zip object is never a list, so the
        # previous type check discarded every entry that had both kinds of rows.
        combined_rows = modified_residues + mutated_residues
        if not combined_rows:
            combined_rows = [None, None, None, None]

        UniProt_summary[accession_item['accession']] = combined_rows
    return UniProt_summary

def modified_list(residues):
    """Collect one summary row per residue entry that has a 'modification' key.

    Args:
        residues: Iterable of residue dicts.

    Returns:
        A non-empty list of rows produced by ``modification_info``, or None
        when no row could be built.
    """
    rows = []  # distinct name so the function itself is not shadowed
    for residue_info in residues:
        if 'modification' not in residue_info:
            continue
        row = modification_info(residue_info)
        # modification_info returns None when startIndex and endIndex differ;
        # a None entry would break callers that unpack each row.
        if row is not None:
            rows.append(row)
    return rows or None

def mutated_list(residues):
    """Collect one summary row per residue entry that has a 'mutation' key.

    Args:
        residues: Iterable of residue dicts.

    Returns:
        A non-empty list of rows produced by ``mutation_info``, or None when
        no row could be built.
    """
    rows = []  # distinct name so the function itself is not shadowed
    for residue_info in residues:
        if 'mutation' not in residue_info:
            continue
        row = mutation_info(residue_info)
        # mutation_info returns None when startIndex and endIndex differ;
        # a None entry would break callers that unpack each row.
        if row is not None:
            rows.append(row)
    return rows or None

def mutation_info(residue_info):
    """Turn a single-position residue entry into a mutation summary row.

    Returns ``[mutationType, startIndex, startCode, pdbCode]`` when the entry's
    startIndex equals its endIndex, and None otherwise.
    """
    # Entries covering more than one position are not summarised.
    if not (residue_info['startIndex'] == residue_info['endIndex']):
        return None

    position, uniprot_aa, structure_aa, kind = (
        residue_info[key]
        for key in ('startIndex', 'startCode', 'pdbCode', 'mutationType')
    )
    return [kind, position, uniprot_aa, structure_aa]

def modification_info(residue_info):
    """Turn a single-position residue entry into a modification summary row.

    Returns ``["Modification", int(startIndex), startCode, pdbCode]`` when the
    entry's startIndex equals its endIndex, and None otherwise.
    """
    # Entries covering more than one position are not summarised.
    if not (residue_info['startIndex'] == residue_info['endIndex']):
        return None

    position = int(residue_info['startIndex'])
    uniprot_aa, structure_aa = residue_info['startCode'], residue_info['pdbCode']
    return ["Modification", position, uniprot_aa, structure_aa]

def summary(UniProt_id):
    """Fetch a UniProt record and return its per-entry residue summary.

    Returns the dict built by ``parse_structure_data``. If the fetch did not
    yield a dict, the returned value is printed and None is returned.
    """
    api_result = get_structure_details(UniProt_id)
    if not isinstance(api_result, dict):
        # A non-dict result is the error text from the fetch helper.
        print(api_result)
        return None
    return parse_structure_data(api_result, UniProt_id)

def parse_general_structure_data(structure_data, UniProt_id):
    """Extract the experimental method and resolution for every entry of a record.

    Args:
        structure_data: Decoded API payload; ``structure_data[UniProt_id]['data']``
            must be a list of items carrying 'accession' and 'additionalData'.
        UniProt_id: Key under which the record is stored in ``structure_data``.

    Returns:
        A dict mapping each item's 'accession' value to
        ``[experiment, resolution]``. The resolution is rounded to three
        decimals, replaced by 'Not applicable' for 'Solution NMR' entries, and
        left as None when the payload carries no resolution.
    """
    structure_details = structure_data[UniProt_id]['data']

    UniProt_summary = {}
    for accession_item in structure_details:
        additionalData = accession_item['additionalData']
        experiment = additionalData['experiment']
        resolution = additionalData.get('resolution')

        if experiment == 'Solution NMR':
            resolution = 'Not applicable'
        elif resolution is not None:
            # Round only real values: round(None, 3) raises TypeError, which
            # previously aborted the whole summary before any branch ran.
            resolution = round(resolution, 3)

        UniProt_summary[accession_item['accession']] = [experiment, resolution]
    return UniProt_summary

def general_summary(UniProt_id):
    """Fetch a UniProt record and return its method/resolution summary.

    Returns the dict built by ``parse_general_structure_data``. If the fetch
    did not yield a dict, the returned value is printed and None is returned.
    """
    api_result = get_structure_details(UniProt_id)
    if not isinstance(api_result, dict):
        # A non-dict result is the error text from the fetch helper.
        print(api_result)
        return None
    return parse_general_structure_data(api_result, UniProt_id)

# 1.  Summaries for experimental structures

This section reports the PDB IDs of the experimentally determined structures in which at least one protein chain is associated with a UniProt ID (from PDBe-KB).

In [2]:
#@title 1.1.&nbsp;  Get all experimentally determined structures associated with a UniProt ID
#@markdown  In this sub-section you will be able to retrieve the experimental structures that, based on sequence and reported taxonomy, are associated with a UniProt accession ID.

# Run this code to display the widget
Uniprot_ID = "P68871" #@param {type:"string"}

# Run this code to perform an API call
data1 = general_summary(Uniprot_ID)

# Create a DataFrame from the dictionary.
# general_summary() prints the error and returns None when the request fails;
# fall back to an empty mapping so an empty table is shown instead of a traceback.
df1 = pd.DataFrame.from_dict(data1 or {}, orient="index", columns=['Experimental Method','Resolution (in Angstroms)'])

# Display the first rows of the DataFrame
df1.head(10)



Unnamed: 0,Experimental Method,Resolution (in Angstroms)
7dy3,X-ray diffraction,1.4
6ka9,X-ray diffraction,1.4
6kao,X-ray diffraction,1.4
6l5v,X-ray diffraction,1.45
6l5w,X-ray diffraction,1.5
6kae,X-ray diffraction,1.45
6lcx,X-ray diffraction,1.4
6kap,X-ray diffraction,1.45
6lcw,X-ray diffraction,1.4
6kaq,X-ray diffraction,1.5


In [3]:
#@title 1.2.&nbsp;  Get summary of sequence discrepancy (e.g. variants) compared to the UniProt sequence
#@markdown In this sub-section the variants, engineered mutations, modified residues, etc. for experimental structures will be reported.
#@markdown <br>
#@markdown These have been identified as positions that are in conflict with the UniProt sequence.

# Run this code to display the widget
Uniprot_ID = "P68871" #@param {type:"string"}

data2 = summary(Uniprot_ID)

print()
# Flatten the data and create a list of dictionaries for each row.
# summary() returns None when the request fails, hence the empty-dict fallback.
rows_list = []
for pdb_id, mutations in (data2 or {}).items():
  for mutation in mutations or []:
    # Entries without discrepancies hold only None placeholders; skip those,
    # and any individual row that could not be built, instead of unpacking them.
    if mutation is None:
      continue
    mutation_type, position, from_aa, to_aa = mutation
    rows_list.append({'PDB ID': pdb_id, 'Type': mutation_type, 'Position': position, 'From': from_aa, 'To': to_aa})

# Convert the list of dictionaries to a DataFrame; naming the columns keeps
# the table header even when no rows were found.
df2 = pd.DataFrame(rows_list, columns=['PDB ID', 'Type', 'Position', 'From', 'To'])

# Display the DataFrame
num_rows = df2.shape[0]
df2.head(num_rows)




Unnamed: 0,PDB ID,Type,Position,From,To
0,6kao,Variant,7,GLU,LYS
1,6l5v,Variant,7,GLU,LYS
2,6l5w,Variant,7,GLU,LYS
3,6kap,Variant,7,GLU,LYS
4,6kaq,Variant,7,GLU,LYS
...,...,...,...,...,...
185,1yen,Engineered mutation,2,VAL,MET
186,1cmy,Conflict,100,ASP,TYR
187,1hbs,Conflict,7,GLU,VAL
188,1ye1,Engineered mutation,36,TYR,ALA
