In [1]:
from pathlib import Path
import yaml
from tqdm import tqdm
import avoidome.uniprot as uniprot
import avoidome.schema as schema
from importlib import reload

# Download

In [2]:
# Project directories, all relative to the notebook's working directory.
data_dir = Path('..') / 'data'
fig_dir = Path('..') / 'figures'
# Download caches live underneath the data directory.
uniprot_dir = data_dir.joinpath('uniprot_downloads')
af_dir = data_dir.joinpath('alphafold_downloads')

## Use the curated data this time

In [18]:
# Parse the curated ADMET target list from its YAML file.
curated_path = data_dir / 'admet_names_curated.yml'
adme_names = yaml.safe_load(curated_path.read_text())

In [8]:
# Fetch every curated protein from UniProt and collect its experimental structures.
exp_structure_dict = {}  # keyed by ue.name -> result of get_experimental_structures()
failed = []   # entries whose structure lookup raised
success = []  # entries whose structure lookup succeeded
for protein in tqdm(adme_names):
    uniprot_id = protein['uniprot']
    ue = uniprot.UniprotEntry.from_uniprot_id(uniprot_id)
    try:
        exp_structure_dict[ue.name] = ue.get_experimental_structures()
    except Exception:
        # Catch ordinary errors only: a bare `except:` would also trap
        # KeyboardInterrupt/SystemExit and make this slow loop impossible to stop.
        failed.append(ue)
    else:
        success.append(ue)

100%|██████████| 57/57 [00:44<00:00,  1.28it/s]


In [9]:
len(failed)

0

In [10]:
len(success)

57

In [15]:
# Cache each successfully fetched entry as <uniprot_dir>/<id>.yml.
uniprot_dir.mkdir(parents=True, exist_ok=True)  # make sure the cache directory exists
for ue in tqdm(success):
    # Use a context manager so every file is flushed and closed before moving on;
    # passing a bare open(...) leaves the handle dangling until garbage collection.
    with open(f"{uniprot_dir / ue.data['id']}.yml", 'w') as f:
        yaml.safe_dump(ue.dict(), f)


  0%|          | 0/57 [00:00<?, ?it/s][A
  2%|▏         | 1/57 [00:00<00:19,  2.83it/s][A
  4%|▎         | 2/57 [00:00<00:16,  3.25it/s][A
  5%|▌         | 3/57 [00:01<00:22,  2.42it/s][A
  7%|▋         | 4/57 [00:01<00:18,  2.90it/s][A
  9%|▉         | 5/57 [00:01<00:17,  3.06it/s][A
 11%|█         | 6/57 [00:02<00:25,  2.03it/s][A
 12%|█▏        | 7/57 [00:02<00:20,  2.43it/s][A
 14%|█▍        | 8/57 [00:02<00:16,  2.92it/s][A
 16%|█▌        | 9/57 [00:03<00:13,  3.47it/s][A
 18%|█▊        | 10/57 [00:03<00:12,  3.86it/s][A
 19%|█▉        | 11/57 [00:03<00:12,  3.73it/s][A
 21%|██        | 12/57 [00:03<00:09,  4.54it/s][A
 23%|██▎       | 13/57 [00:03<00:09,  4.46it/s][A
 26%|██▋       | 15/57 [00:04<00:06,  6.49it/s][A
 28%|██▊       | 16/57 [00:04<00:06,  6.42it/s][A
 30%|██▉       | 17/57 [00:04<00:05,  6.92it/s][A
 32%|███▏      | 18/57 [00:04<00:05,  6.81it/s][A
 33%|███▎      | 19/57 [00:04<00:05,  6.70it/s][A
 35%|███▌      | 20/57 [00:04<00:05,  6.93it/s]

# Reload

In [3]:
# Rebuild the UniProt entries from the YAML cache written by the download step.
# glob() yields files in arbitrary order, so sort for a reproducible entry order.
entries = sorted(uniprot_dir.glob('*.yml'))
uniprot_entries = []
for file in tqdm(entries):
    # Close each file as soon as it has been parsed instead of leaking the handle.
    with open(file, 'r') as f:
        uniprot_entries.append(uniprot.UniprotEntry.from_dict(yaml.safe_load(f)))

100%|██████████| 57/57 [00:22<00:00,  2.53it/s]


# Get AlphaFold Structures

In [4]:
# get_alphafold_structures() returns a list, but only a single model is ever
# expected per entry, so keep just the first element.
af_structures = []
for u in tqdm(uniprot_entries):
    af_structures.append(u.get_alphafold_structures()[0])

100%|██████████| 57/57 [00:07<00:00,  7.33it/s]


## Download the structures

In [5]:
from asapdiscovery.data.utils import download_file

In [6]:
# Fetch each AlphaFold model as a .cif file, skipping files that are already cached.
for p in af_structures:
    cif_path = af_dir / f"{p.af_id}.cif"
    if cif_path.exists():
        continue
    download_file(p.model_url, cif_path)

# Calculate the pLDDT

In [7]:
from asapdiscovery.data.openeye import load_openeye_cif, oechem
import numpy as np

In [8]:
def calculate_high_confidence(af_id, threshold=90):
    """
    Count the residues of a downloaded AlphaFold model with a high pLDDT score.

    Load the downloaded model with openeye, get the residues.
    Get the bfactor of each residue (which is the pLDDT score for this case)
    and count how many of them exceed ``threshold``.

    :param af_id: AlphaFold identifier; the model is read from ``af_dir / f"{af_id}.cif"``.
    :param threshold: pLDDT cutoff; residues scoring strictly above it are counted.
        Defaults to 90.
    :return: Number of residues whose pLDDT is above ``threshold``, as an ``int``.
    """
    mol = load_openeye_cif(af_dir / f"{af_id}.cif")
    # The set comprehension collapses the per-atom residue records.
    # NOTE(review): this relies on OEResidue hashing/equality to deduplicate;
    # confirm the count is per residue rather than per atom.
    residues = {oechem.OEAtomGetResidue(atom) for atom in mol.GetAtoms() if oechem.OEHasResidue(atom)}
    bfactors = np.array([res.GetBFactor() for res in residues])
    # count_nonzero is vectorised, unlike builtin sum() over a NumPy bool array.
    return int(np.count_nonzero(bfactors > threshold))

In [9]:
# Fraction of each protein's length that is modelled with high confidence,
# keyed by the name of the model's first component.
confidence_dict = {}
for p in tqdm(af_structures):
    component = p.components[0]
    confidence_dict[component.name] = calculate_high_confidence(p.af_id) / component.length

100%|██████████| 57/57 [00:04<00:00, 12.40it/s]


# Plot

## Construct the DataFrame for plotly

In [19]:
# Map each UniProt ID in the curated list to its ADMET category.
category_dict = dict((entry['uniprot'], entry['admet_category']) for entry in adme_names)

In [21]:
import pandas as pd

In [22]:
confidence_dict.keys()

dict_keys(['NR1I2_HUMAN', 'ACM3_HUMAN', 'ARK73_HUMAN', 'CP2CJ_HUMAN', 'KCNH2_HUMAN', 'DHI1_HUMAN', 'FMO1_HUMAN', 'A1AG1_HUMAN', 'AOFA_HUMAN', 'ASM3A_HUMAN', 'SCN5A_HUMAN', 'AOXA_HUMAN', 'ADA2A_HUMAN', 'NR1I3_HUMAN', 'SO1B1_HUMAN', 'AL1A1_HUMAN', 'ACHA7_HUMAN', 'GSTA1_HUMAN', 'ACM2_HUMAN', 'CNR1_HUMAN', 'CACB1_HUMAN', 'OXA1L_HUMAN', 'CP2D6_HUMAN', 'S22A8_HUMAN', 'CP3A4_HUMAN', '5HT2B_HUMAN', 'ADRB2_HUMAN', 'ST1A1_HUMAN', 'CP1A2_HUMAN', 'ACM1_HUMAN', 'MDR1_HUMAN', 'SC6A3_HUMAN', 'ABCG2_HUMAN', 'CP2C9_HUMAN', 'ACHA9_HUMAN', 'XDH_HUMAN', 'CNR2_HUMAN', 'CAC1C_HUMAN', 'GBRA1_HUMAN', 'CP2B6_HUMAN', 'ADRB1_HUMAN', 'S22A6_HUMAN', 'ADH1A_HUMAN', 'SO1B3_HUMAN', 'ACHA5_HUMAN', 'S47A1_HUMAN', 'SC6A2_HUMAN', 'ADA1A_HUMAN', 'MRP1_HUMAN', 'ACH10_HUMAN', 'SC6A4_HUMAN', 'ABCBB_HUMAN', 'S15A1_HUMAN', 'HRH1_HUMAN', 'ACHA3_HUMAN', 'AHR_HUMAN', 'AOFB_HUMAN'])

In [30]:
# Two-way lookup between a model's first-component name and its UniProt ID.
name_to_uniprot = {}
for af in af_structures:
    first_component = af.components[0]
    name_to_uniprot[first_component.name] = first_component.uniprot_id
uniprot_to_name = {uid: name for name, uid in name_to_uniprot.items()}

In [37]:
# One row per protein: name, high-confidence fraction, and ADMET category.
protein_names = list(confidence_dict)
confidence_df = pd.DataFrame({
    'Protein Name': protein_names,
    'Fraction': [confidence_dict[name] for name in protein_names],
    'Category': [category_dict[name_to_uniprot[name]] for name in protein_names],
})

In [56]:
# Order the rows alphabetically by protein name (rebinding instead of mutating in place).
confidence_df = confidence_df.sort_values('Protein Name')

In [57]:
import plotly.express as px

In [61]:
# Discrete colours handed to px.bar via color_discrete_sequence.
color_sequence = [
    "#00bf7d",
    "#00b4c5",
    "#c44601",
    "#2546f0",
    "#5928ed",
]

In [62]:
# Bar chart of the high-confidence fraction per protein, coloured by ADMET category.
fig = px.bar(confidence_df,
             x='Protein Name',
             y='Fraction',
             # `labels` is keyed by DataFrame column name, not by axis letter;
             # the previous {'x': ..., 'y': ...} keys matched no column and were ignored.
             labels={'Protein Name': 'Protein', 'Fraction': 'Fraction'},
             title='Fraction of Sequence Modeled with Very High Confidence (pLDDT > 90)',
             template='simple_white',
             width=1200,
             height=600,
             # Fixes the legend/trace order of the categories.
             category_orders={'Category': ['metabolism_redox', 'metabolism', 'absorption', 'drug_transporter', 'toxicity']},
             color="Category",
             color_discrete_sequence=color_sequence,
             )





In [63]:
fig.show()

In [64]:
# Reuse the figure directory configured at the top of the notebook rather than
# rebuilding the same path by hand, and make sure it exists before writing.
fig_dir.mkdir(parents=True, exist_ok=True)
fig.write_image(fig_dir / 'alphafold_confidence.png')

In [65]:
len(confidence_dict)

57

# Combine info

In [66]:
# Index the AlphaFold structures by the name of their first component.
data_dict = {}
for af in af_structures:
    data_dict[af.components[0].name] = af

In [67]:
len(data_dict)

57

# Analyze Experimental Structures