In [1]:
from pathlib import Path
import yaml
from tqdm import tqdm

# Load data

In [2]:
# Root folder for all input and output data, relative to the notebook location.
data_dir = Path('..') / 'data'

In [3]:
# Read the list of target records (one mapping per protein) from YAML.
adme_names_path = data_dir / 'adme_names.yml'
with adme_names_path.open() as f:
    adme_names = yaml.safe_load(f)

In [4]:
# Directory that holds one YAML file per UniProt entry.
uniprot_dir = data_dir / 'uniprot_downloads'
# Later cells write into this directory, so make sure it exists first.
uniprot_dir.mkdir(exist_ok=True)

## Examine data

In [5]:
# Peek at the first record to see which fields each entry carries.
adme_names[0]

{'name': 'CYP1A2',
 'alternatives': 'CP1A2_HUMAN, Cytochrome P450 1A2',
 'uniprot': 'P05177'}

# Test a single target 

## Get structures associated with a particular UniProt ID

In [6]:
from avoidome.uniprot import UniprotEntry

In [7]:
# Use the first record as the test case and build its UniProt entry.
first_target = adme_names[0]
uniprot_id, name = first_target['uniprot'], first_target['name']
ue = UniprotEntry.from_uniprot_id(uniprot_id)

In [8]:
import yaml
# Persist the entry as plain YAML. safe_dump matches the other cells that write
# these files and only emits standard tags, so the file can be read back with
# yaml.safe_load.
with open(uniprot_dir / f"{uniprot_id}.yml", 'w') as f:
    yaml.safe_dump(ue.dict(), f)

In [9]:
# Show the sequence string stored in the entry's raw data.
ue.data['sequence']['sequence']

'MALSQSVPFSATELLLASAIFCLVFWVLKGLRPRVPKGLKSPPEPWGWPLLGHVLTLGKNPHLALSRMSQRYGDVLQIRIGSTPVLVLSRLDTIRQALVRQGDDFKGRPDLYTSTLITDGQSLTFSTDSGPVWAARRRLAQNALNTFSIASDPASSSSCYLEEHVSKEAKALISRLQELMAGPGHFDPYNQVVVSVANVIGAMCFGQHFPESSDEMLSLVKNTHEFVETASSGNPLDFFPILRYLPNPALQRFKAFNQRFLWFLQKTVQEHYQDFDKNSVRDITGALFKHSKKGPRASGNLIPQEKIVNLVNDIFGAGFDTVTTAISWSLMYLVTKPEIQRKIQKELDTVIGRERRPRLSDRPQLPYLEAFILETFRHSSFLPFTIPHSTTRDTTLNGFYIPKKCCVFVNQWQVNHDPELWEDPSEFRPERFLTADGTAINKPLSEKMMLFGMGKRRCIGEVLAKWEIFLFLAILLQQLEFSVPPGVKVDLTPIYGLTMKHARCEHVQARLRFSIN'

In [10]:
# List the experimental structures associated with this entry.
ue.get_experimental_structures()

[ExperimentalStructure(components=[ProteinEntity(name='CP1A2_HUMAN', uniprot_id='P05177', sequence='MALSQSVPFSATELLLASAIFCLVFWVLKGLRPRVPKGLKSPPEPWGWPLLGHVLTLGKNPHLALSRMSQRYGDVLQIRIGSTPVLVLSRLDTIRQALVRQGDDFKGRPDLYTSTLITDGQSLTFSTDSGPVWAARRRLAQNALNTFSIASDPASSSSCYLEEHVSKEAKALISRLQELMAGPGHFDPYNQVVVSVANVIGAMCFGQHFPESSDEMLSLVKNTHEFVETASSGNPLDFFPILRYLPNPALQRFKAFNQRFLWFLQKTVQEHYQDFDKNSVRDITGALFKHSKKGPRASGNLIPQEKIVNLVNDIFGAGFDTVTTAISWSLMYLVTKPEIQRKIQKELDTVIGRERRPRLSDRPQLPYLEAFILETFRHSSFLPFTIPHSTTRDTTLNGFYIPKKCCVFVNQWQVNHDPELWEDPSEFRPERFLTADGTAINKPLSEKMMLFGMGKRRCIGEVLAKWEIFLFLAILLQQLEFSVPPGVKVDLTPIYGLTMKHARCEHVQARLRFSIN', start=27, end=516)], pdb_id='2HI4', method='X-ray', resolution=1.95)]

In [12]:
# The entry's `id` field; later cells use it as the YAML file stem.
ue.data['id']

'CP1A2_HUMAN'

In [13]:
# Look up the experimental structures of every target, keeping track of which
# entries succeeded and which raised so the failures can be inspected later.
exp_structure_dict = {}
failed = []
success = []
for protein in tqdm(adme_names):
    uniprot_id = protein['uniprot']
    ue = UniprotEntry.from_uniprot_id(uniprot_id)
    try:
        exp_structure_dict[ue.name] = ue.get_experimental_structures()
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts
        # still stop the loop, and report why the entry failed.
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"{uniprot_id}: {err}")
        failed.append(ue)
    else:
        success.append(ue)

100%|██████████| 51/51 [00:39<00:00,  1.28it/s]


In [14]:
# Number of entries whose structure lookup raised.
len(failed)

0

In [15]:
# Number of entries whose structure lookup succeeded.
len(success)

51

In [16]:
# Save each failed entry as <id>_failed.yml so it can be reloaded on its own.
for ue in failed:
    # The context manager closes (and flushes) the file even if dumping raises.
    with open(uniprot_dir / f"{ue.data['id']}_failed.yml", 'w') as f:
        yaml.safe_dump(ue.dict(), f)

In [17]:
# Save each successfully processed entry as <id>.yml.
for ue in success:
    # The context manager closes (and flushes) the file even if dumping raises.
    with open(uniprot_dir / f"{ue.data['id']}.yml", 'w') as f:
        yaml.safe_dump(ue.dict(), f)

# Load just the failed ones

In [18]:
# Paths of the entries saved with the `_failed` suffix.
# NOTE: this rebinds `failed` from entry objects to file paths.
failed = [*uniprot_dir.glob('*_failed.yml')]

In [19]:
# Rebuild each previously failed entry from its YAML file and retry the
# structure lookup, printing the entry and the error for any that still fail.
for file in failed:
    # Read through a context manager so the file handle is closed right away.
    with open(file, 'r') as f:
        ue = UniprotEntry.from_dict(yaml.safe_load(f))
    try:
        exp_structure_dict[ue.name] = ue.get_experimental_structures()
    except Exception as e:
        print(ue.name, ue.uniprot_id)
        print(e)

## Looks like the new fix worked: none of the reloaded entries raised an error

# Load all the entries

In [20]:
from avoidome import uniprot
from importlib import reload
# Re-execute the module, presumably to pick up edits made to avoidome.uniprot
# without restarting the kernel.
# NOTE(review): names imported earlier with `from avoidome.uniprot import ...`
# are not rebound by this reload; only access through `uniprot.` sees the
# reloaded code.
reload(uniprot)
# Same expression as the earlier `uniprot_dir` assignment.
uniprot_dir = data_dir / 'uniprot_downloads'

In [21]:
# Only files whose name ends in 'HUMAN.yml' are picked up here.
entries = [entry_path for entry_path in uniprot_dir.glob('*HUMAN.yml')]

In [22]:
# Rebuild one UniprotEntry per saved YAML file.
# Path.read_text opens and closes the file itself, so no handles are left open.
# NOTE: `ue` is a list of entries from here on, not a single entry.
ue = [
    uniprot.UniprotEntry.from_dict(yaml.safe_load(file.read_text()))
    for file in tqdm(entries)
]

100%|██████████| 49/49 [00:17<00:00,  2.74it/s]


In [23]:
# Collect the result of get_alphafold_structures() for every loaded entry,
# in the same order as `ue`.
# NOTE: `u` stays bound to the last entry after the loop finishes; avoid
# relying on that in later cells.
af_structures = []
for u in tqdm(ue):
    af_structures.append(u.get_alphafold_structures())

100%|██████████| 49/49 [00:07<00:00,  6.81it/s]


In [24]:
# AlphaFold id of the first structure returned for each entry.
af_ids = [structures[0].af_id for structures in af_structures]

In [25]:
# Directory the AlphaFold model files are downloaded into; created if missing.
af_dir = data_dir / 'alphafold_downloads'
af_dir.mkdir(exist_ok=True)

In [26]:
# Fetch the raw AlphaFold response for the last loaded entry.
# Index `ue` explicitly instead of reusing a loop variable left over from an
# earlier cell, so this cell does not depend on hidden kernel state.
r = uniprot.request_alphafold(ue[-1].uniprot_id)

In [27]:
# Model URL from the first structure summary in the raw response.
r["structures"][0]["summary"]['model_url']

'https://alphafold.ebi.ac.uk/files/AF-P27338-F1-model_v4.cif'

In [28]:
# Model URL of the first structure for the first loaded entry.
af_structures[0][0].model_url

'https://alphafold.ebi.ac.uk/files/AF-O75469-F1-model_v4.cif'

## Download the actual models to get the pLDDT per residue

In [30]:
from asapdiscovery.data.utils import download_file

In [42]:
# Keep only the first AlphaFold structure of each entry, giving a flat list.
ps = [structures[0] for structures in af_structures]

In [43]:
# Download every model file into af_dir, named after its AlphaFold id.
for p in ps:
    cif_path = af_dir / f"{p.af_id}.cif"
    download_file(p.model_url, cif_path)

In [44]:
from asapdiscovery.data.openeye import load_openeye_cif, oechem

In [45]:
import numpy as np

## Calculate the fraction of residues with very high pLDDT (> 90)

In [46]:
def calculate_high_confidence(af_id, threshold=90):
    """
    Count the residues of a downloaded AlphaFold model that exceed a pLDDT threshold.

    The model is loaded with OpenEye from ``af_dir / f"{af_id}.cif"`` and the
    B-factor of each residue is read as its pLDDT score.

    :param af_id: AlphaFold identifier; also the stem of the downloaded ``.cif`` file.
    :param threshold: pLDDT value a residue must strictly exceed to be counted.
        Defaults to 90.
    :return: Number of residues whose B-factor is greater than ``threshold``.
    """
    mol = load_openeye_cif(af_dir / f"{af_id}.cif")
    # Residue records are looked up per atom; the set is meant to collapse them
    # to one record per residue.
    # NOTE(review): this relies on OEResidue equality/hashing merging the records
    # of atoms in the same residue. If it does not, atoms are counted instead of
    # residues. TODO confirm.
    residues = {oechem.OEAtomGetResidue(atom) for atom in mol.GetAtoms() if oechem.OEHasResidue(atom)}
    bfactors = np.array([res.GetBFactor() for res in residues])
    return int(np.count_nonzero(bfactors > threshold))

In [47]:
# Fraction of each protein's length that is modeled with high confidence,
# keyed by the name of the structure's first component.
confidence_dict = {
    structure.components[0].name: (
        calculate_high_confidence(structure.af_id) / structure.components[0].length
    )
    for structure in ps
}

In [48]:
import plotly.express as px

In [49]:
# Bar chart with one bar per protein, ordered alphabetically by protein name.
protein_names = list(confidence_dict.keys())
high_confidence_fractions = list(confidence_dict.values())
fig = px.bar(
    x=protein_names,
    y=high_confidence_fractions,
    labels={'x': 'Protein', 'y': 'Fraction'},
    title='Fraction of Sequence Modeled with Very High Confidence (pLDDT > 90)',
    template='simple_white',
    width=1200,
    height=600,
    category_orders={'x': sorted(protein_names)},
)

In [50]:
# Render the bar chart.
fig.show()

In [51]:
# Save the chart as a PNG next to the other figures.
figure_path = Path('..') / 'figures' / 'alphafold_confidence.png'
# Create the output folder if needed so the export does not fail on a fresh checkout.
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(figure_path)

In [52]:
# Number of proteins that have a confidence value.
len(confidence_dict)

49