In [None]:
import numpy as np
import pandas as pd
import requests, sys
from pydantic import BaseModel, Field

# Fetch the entry for accession P08183 from the EBI Proteins API as JSON.
requestURL = "https://www.ebi.ac.uk/proteins/api/proteins/P08183"

# A timeout keeps a stalled connection from hanging the notebook indefinitely.
r = requests.get(requestURL, headers={"Accept": "application/json"}, timeout=30)

# raise_for_status() raises requests.HTTPError on any 4xx/5xx response and does
# nothing otherwise, so a separate `r.ok` check is redundant (and the
# sys.exit() that used to follow it could never be reached).
r.raise_for_status()

responseBody = r.text      # raw response text
uniprot_dict = r.json()    # same payload decoded into Python objects

In [None]:
import yaml
# Write the decoded entry to disk; the context manager guarantees the file is
# flushed and closed even if serialisation fails.
with open('P08183.yaml', 'w') as yaml_file:
    yaml.safe_dump(uniprot_dict, yaml_file)

In [None]:
# Quick check: True if the raw response text contains the (case-sensitive) substring 'pdb'.
'pdb' in r.text

In [None]:
def request_alphafold(uniprot_id, timeout=30):
    """
    A function to request the structure summary for a UniProt accession from the AlphaFold DB API

    Parameters
    ----------
    uniprot_id : str
        Accession interpolated into the summary URL.
    timeout : float, optional
        Seconds to wait for the server (default 30) so that a stalled
        connection cannot hang the caller forever.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    requestURL = f"https://alphafold.ebi.ac.uk/api/uniprot/summary/{uniprot_id}.json"
    r = requests.get(requestURL, timeout=timeout)
    # raise_for_status() does nothing for a successful response and raises for
    # an unsuccessful one, so no explicit `r.ok` test or sys.exit() is needed.
    r.raise_for_status()
    return r.json()

In [None]:
def request_uniprot(uniprot_id, timeout=30):
    """
    A function to request a protein entry from the UniProt API

    Parameters
    ----------
    uniprot_id : str
        Accession interpolated into the EBI Proteins API URL.
    timeout : float, optional
        Seconds to wait for the server (default 30) so that a stalled
        connection cannot hang the caller forever.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    requestURL = f"https://www.ebi.ac.uk/proteins/api/proteins/{uniprot_id}"
    r = requests.get(requestURL, headers={"Accept": "application/json"}, timeout=timeout)
    # raise_for_status() does nothing for a successful response and raises for
    # an unsuccessful one, so no explicit `r.ok` test or sys.exit() is needed.
    r.raise_for_status()
    return r.json()

In [None]:
def parse_resolved_chains(chain_str):
    """
    A function to parse resolved chains for a particular uniprot id from a Uniprot string

    The string is one or more comma-separated segments of the form
    ``IDS=start-end`` where ``IDS`` is a ``/``-separated list of chain
    letters, e.g. ``"A=1-1280"``, ``"A/B=33-1279"`` or
    ``"A/C=1-100, B=50-200"``. One ResolvedChain is produced per chain letter
    per segment, in the order they appear.

    Parameters
    ----------
    chain_str : str
        The chains description to parse.

    Returns
    -------
    list
        ResolvedChain objects; empty if the string holds no segments.

    Raises
    ------
    ValueError
        If a segment has no ``=`` or its range is not ``start-end`` with
        integer bounds.
    """
    resolved_chains = []
    # Several chain groups covering different ranges are listed comma-separated;
    # splitting on '=' alone would mis-parse everything after the first group.
    for segment in chain_str.split(','):
        segment = segment.strip()
        if not segment:
            continue
        chain_part, separator, range_part = segment.partition('=')
        if not separator:
            raise ValueError(f"Cannot parse chain segment {segment!r}: expected 'IDS=start-end'")
        # Unpacking enforces exactly two bounds; int() rejects non-numeric ones.
        start, end = (int(bound) for bound in range_part.split('-'))
        resolved_chains.extend(
            ResolvedChain(chain_id=chain.strip(), start=start, end=end)
            for chain in chain_part.split('/')
        )
    return resolved_chains

In [None]:
class StructureEntry(BaseModel):
    """
    A class to represent a structure entry

    Base model holding the single field shared by all structure records.
    ExperimentalStructure and PredictedStructure in this file derive from it.
    """
    # Required: `...` as the default marks the field as having no default value.
    uniprot_id: str = Field(..., title="The UniProt ID of the protein")

In [None]:
class ResolvedChain(BaseModel):
    """
    A class to represent a resolved chain

    `start` and `end` are the first and last residue positions of the
    resolved range and are both part of it (a range written ``1-1280`` spans
    residues 1 through 1280).
    """
    chain_id: str = Field(..., title="The chain ID")
    start: int = Field(..., title="The start position of the chain")
    end: int = Field(..., title="The end position of the chain")

    @property
    def length(self) -> int:
        """Number of residues in the resolved range, counting both endpoints."""
        # +1 because the range is inclusive: 1-1280 covers 1280 residues, not 1279.
        return self.end - self.start + 1

In [None]:
class ExperimentalStructure(StructureEntry):
    """
    An experimentally determined structure entry.

    Adds the PDB identifier, the determination method, the resolution (kept
    as a string) and the list of resolved chains to the fields inherited
    from StructureEntry. Every field is required.
    """
    pdb_id: str = Field(..., title="The PDB ID of the structure")
    method: str = Field(..., title="The method used to determine the structure")
    resolution: str = Field(..., title="The resolution (å) of the structure")
    resolved_chains: list[ResolvedChain] = Field(..., title="A list of resolved chains")

    @property
    def sequence_coverage(self):
        """Mean of `length` over all resolved chains (a residue count, not a fraction)."""
        chain_lengths = [chain.length for chain in self.resolved_chains]
        return np.mean(chain_lengths)

In [None]:
class PredictedStructure(StructureEntry):
    """
    A class to represent a predicted structure entry

    Extends StructureEntry with an AlphaFold DB identifier, a start/end
    position pair and a confidence value. All fields are required. In this
    file the values are filled in by parse_uniprot_accession from the
    AlphaFold summary response (`model_identifier`, `uniprot_start`,
    `uniprot_end`, `confidence_avg_local_score`).
    """
    af_id: str = Field(..., title="The AlphaFold DB ID of the structure")
    uniprot_start: int = Field(..., title="The start position of the structure")
    uniprot_end: int = Field(..., title="The end position of the structure")
    confidence: float = Field(..., title="The confidence of the structure")

In [None]:
class Target(BaseModel):
    """
    A class to represent a protein target

    Holds the protein's name, UniProt ID and sequence together with a mixed
    list of structure entries, and exposes summary statistics over the
    experimental and predicted subsets of that list.
    """
    name: str = Field(..., title="The name of the protein")
    uniprot_id: str = Field(..., title="The UniProt ID of the protein")
    sequence: str = Field(..., title="The protein sequence")
    structures: list[StructureEntry] = Field(..., title="A list of structure entries")

    @property
    def sequence_length(self) -> int:
        """Number of characters in `sequence`."""
        return len(self.sequence)

    @property
    def experimental_structures(self):
        """The entries of `structures` that are ExperimentalStructure instances."""
        return [s for s in self.structures if isinstance(s, ExperimentalStructure)]

    @property
    def predicted_structures(self):
        """The entries of `structures` that are PredictedStructure instances."""
        return [s for s in self.structures if isinstance(s, PredictedStructure)]

    @property
    def average_coverage(self):
        """
        Mean over experimental structures of `sequence_coverage / sequence_length`.

        Returns NaN when the target has no experimental structures.
        """
        coverages = [s.sequence_coverage for s in self.experimental_structures]
        if not coverages:
            # np.mean of an empty array also yields NaN, but emits a
            # RuntimeWarning for every structure-less target; be explicit.
            return np.nan
        return np.mean(np.array(coverages) / self.sequence_length)

    @property
    def average_confidence(self):
        """
        Mean `confidence` over predicted structures.

        Returns NaN when the target has no predicted structures.
        """
        confidences = [s.confidence for s in self.predicted_structures]
        if not confidences:
            return np.nan
        return np.mean(np.array(confidences))

    @property
    def n_experimental_structures(self):
        """Number of experimental structures."""
        # Reuse the filtering property so both always agree on the definition.
        return len(self.experimental_structures)

    @property
    def n_predicted_structures(self):
        """Number of predicted structures."""
        return len(self.predicted_structures)

In [None]:
def parse_uniprot_accession(target_name:str, uniprot_id: str, uniprot_dict:dict) -> "Target":
    """
    A function to build a Target from a UniProt API response

    The entry's `dbReferences` are scanned for cross-references of type
    ``PDB`` (turned into ExperimentalStructure objects) and ``AlphaFoldDB``
    (turned into PredictedStructure objects using the AlphaFold summary
    fetched via request_alphafold).

    Parameters
    ----------
    target_name : str
        Name stored on the returned Target.
    uniprot_id : str
        Accession stored on the Target and on every structure, and used for
        the AlphaFold request.
    uniprot_dict : dict
        Decoded UniProt entry; must contain ``['sequence']['sequence']`` and
        may contain ``'dbReferences'``.

    Returns
    -------
    Target
        The target with experimental structures first, then predicted ones.
    """
    # An entry without any cross-references simply has no structures.
    db_references = uniprot_dict.get('dbReferences', [])
    pdb_ids = [ref for ref in db_references if ref['type'] == 'PDB']
    af_ids = [ref for ref in db_references if ref['type'] == 'AlphaFoldDB']

    refs = []
    for ref in pdb_ids:
        properties = ref['properties']
        # Not every PDB cross-reference carries a resolution (e.g. methods
        # that do not define one); fall back to a '-' placeholder instead of
        # raising KeyError. When present, keep only the number before the unit.
        resolution = properties.get('resolution', '-').split(' ')[0]
        refs.append(ExperimentalStructure(
            uniprot_id=uniprot_id,
            pdb_id=ref['id'],
            method=properties['method'],
            resolution=resolution,
            resolved_chains=parse_resolved_chains(properties['chains'])
        ))
    if af_ids:
        # The summary depends only on uniprot_id, so fetch it once rather
        # than once per cross-reference. Only the first listed structure is used.
        summary_data = request_alphafold(uniprot_id)['structures'][0]['summary']
        for _ in af_ids:
            refs.append(PredictedStructure(
                uniprot_id=uniprot_id,
                af_id=summary_data['model_identifier'],
                uniprot_start=summary_data['uniprot_start'],
                uniprot_end=summary_data['uniprot_end'],
                confidence=summary_data['confidence_avg_local_score']
            ))
    return Target(
        name=target_name,
        uniprot_id=uniprot_id,
        sequence=uniprot_dict['sequence']['sequence'],
        structures=refs
    )

# Load the UniProt data

In [None]:
import yaml
from pathlib import Path

In [None]:
# Read the target list; the cells below expect an iterable of mappings with
# 'name' and 'uniprot' keys. The context manager closes the file handle.
with Path('transporter_names.yml').open() as names_file:
    uniprot_data = yaml.safe_load(names_file)

In [None]:
# Display the parsed YAML content for inspection.
uniprot_data

In [None]:
# Build one Target per listed protein: fetch its UniProt entry, then parse it.
targets = []
for info in uniprot_data:
    name, uniprot_id = info['name'], info['uniprot']
    uniprot_entry = request_uniprot(uniprot_id)
    targets.append(parse_uniprot_accession(info['name'], uniprot_id, uniprot_entry))

In [None]:
# One summary line per target. The second count is labelled explicitly as
# "predicted structures" so it cannot be misread as a count of experimental ones.
for target in targets:
    print(
        f"{target.name} ({target.uniprot_id}) has {target.n_experimental_structures} "
        f"experimental structures with an average sequence coverage of {target.average_coverage:.2f} "
        f"and {target.n_predicted_structures} predicted structures "
        f"with an average confidence of {target.average_confidence:.2f}"
    )

In [None]:
import pandas as pd
# Column heading -> Target attribute that supplies its values, in display order.
column_attrs = {
    "Name": "name",
    "UniProt ID": "uniprot_id",
    "Sequence Length": "sequence_length",
    "Number of Experimental Structures": "n_experimental_structures",
    "Average Sequence Coverage": "average_coverage",
    "Number of Predicted Structures": "n_predicted_structures",
    "Average Confidence": "average_confidence",
}
# One row per target, one column per entry above.
df = pd.DataFrame({
    heading: [getattr(t, attr) for t in targets]
    for heading, attr in column_attrs.items()
})

In [None]:
import plotly.express as px

In [None]:
# Draw one bar chart per summary metric and save each as "<column name>.png".
metric_columns = (
    "Number of Experimental Structures",
    "Average Sequence Coverage",
    "Average Confidence",
)
for y in metric_columns:
    fig = px.bar(data_frame=df, x="Name", y=y, title=y, template="simple_white")
    fig.write_image(f"{y}.png")