In [4]:
from pathlib import Path
import yaml
from tqdm import tqdm
import avoidome.uniprot as uniprot
import avoidome.schema as schema
from importlib import reload

# Download

In [61]:
# Project-relative locations for input data, cached downloads and figures.
data_dir = Path('..', 'data')
fig_dir = Path('..', 'figures')
# Cached UniProt entries (one YAML file per entry).
uniprot_dir = data_dir.joinpath('uniprot_downloads')
# Downloaded AlphaFold model files (.cif).
af_dir = data_dir.joinpath('alphafold_downloads')

## use curated data this time

In [6]:
# Read the curated target list; later cells read the 'uniprot' and
# 'admet_category' keys of each entry.
curated_names_path = data_dir / 'admet_names_curated.yml'
with curated_names_path.open() as curated_file:
    adme_names = yaml.safe_load(curated_file)

In [8]:
# Fetch every curated protein from UniProt and look up its experimental structures.
# exp_structure_dict: entry name -> result of get_experimental_structures()
# success / failed:   UniprotEntry objects, split by whether that lookup raised
exp_structure_dict = {}
failed = []
success = []
for protein in tqdm(adme_names):
    uniprot_id = protein['uniprot']
    ue = uniprot.UniprotEntry.from_uniprot_id(uniprot_id)
    try:
        exp_structure_dict[ue.name] = ue.get_experimental_structures()
    except Exception as exc:
        # Best-effort: record the failure and keep going. Catching Exception
        # (not a bare except) lets Ctrl-C / SystemExit still stop the loop, and
        # the message keeps the cause visible instead of silently dropping it.
        tqdm.write(f"Failed to get experimental structures for {uniprot_id}: {exc!r}")
        failed.append(ue)
    else:
        success.append(ue)

100%|██████████| 57/57 [00:44<00:00,  1.28it/s]


In [9]:
# Number of entries whose get_experimental_structures() call raised in the download loop.
len(failed)

0

In [10]:
# Number of entries whose experimental structures were retrieved without an exception.
len(success)

57

In [15]:
# Cache each successfully fetched entry as <uniprot_dir>/<id>.yml so later
# sessions can reload from disk instead of querying UniProt again.
uniprot_dir.mkdir(parents=True, exist_ok=True)  # make sure the cache directory exists
for ue in tqdm(success):
    # Context manager guarantees the file is flushed and closed after each dump.
    with open(f"{uniprot_dir / ue.data['id']}.yml", 'w') as yaml_file:
        yaml.safe_dump(ue.dict(), yaml_file)


  0%|          | 0/57 [00:00<?, ?it/s][A
  2%|▏         | 1/57 [00:00<00:19,  2.83it/s][A
  4%|▎         | 2/57 [00:00<00:16,  3.25it/s][A
  5%|▌         | 3/57 [00:01<00:22,  2.42it/s][A
  7%|▋         | 4/57 [00:01<00:18,  2.90it/s][A
  9%|▉         | 5/57 [00:01<00:17,  3.06it/s][A
 11%|█         | 6/57 [00:02<00:25,  2.03it/s][A
 12%|█▏        | 7/57 [00:02<00:20,  2.43it/s][A
 14%|█▍        | 8/57 [00:02<00:16,  2.92it/s][A
 16%|█▌        | 9/57 [00:03<00:13,  3.47it/s][A
 18%|█▊        | 10/57 [00:03<00:12,  3.86it/s][A
 19%|█▉        | 11/57 [00:03<00:12,  3.73it/s][A
 21%|██        | 12/57 [00:03<00:09,  4.54it/s][A
 23%|██▎       | 13/57 [00:03<00:09,  4.46it/s][A
 26%|██▋       | 15/57 [00:04<00:06,  6.49it/s][A
 28%|██▊       | 16/57 [00:04<00:06,  6.42it/s][A
 30%|██▉       | 17/57 [00:04<00:05,  6.92it/s][A
 32%|███▏      | 18/57 [00:04<00:05,  6.81it/s][A
 33%|███▎      | 19/57 [00:04<00:05,  6.70it/s][A
 35%|███▌      | 20/57 [00:04<00:05,  6.93it/s]

# Reload

In [7]:
# Rebuild UniprotEntry objects from the YAML files cached in uniprot_dir.
entries = list(uniprot_dir.glob('*.yml'))
uniprot_entries = []
for entry_path in tqdm(entries):
    # Open each file in a context manager so the handle is closed promptly
    # rather than leaking one open file per entry.
    with open(entry_path, 'r') as yaml_file:
        uniprot_entries.append(uniprot.UniprotEntry.from_dict(yaml.safe_load(yaml_file)))

100%|██████████| 57/57 [00:22<00:00,  2.57it/s]


# Get AlphaFold Structures

In [4]:
# get_alphafold_structures() returns a list, but only a single model is
# expected per entry, so keep just the first element.
af_structures = []
for entry in tqdm(uniprot_entries):
    af_structures.append(entry.get_alphafold_structures()[0])

100%|██████████| 57/57 [00:07<00:00,  7.33it/s]


## Download the structures

In [5]:
from asapdiscovery.data.utils import download_file

In [6]:
# Download each AlphaFold model to af_dir/<af_id>.cif, skipping files that
# are already present.
for p in af_structures:
    cif_path = af_dir / f"{p.af_id}.cif"
    if cif_path.exists():
        continue
    download_file(p.model_url, cif_path)

# Calculate the pLDDT

In [7]:
from asapdiscovery.data.openeye import load_openeye_cif, oechem
import numpy as np

In [8]:
def calculate_high_confidence(af_id, threshold=90):
    """
    Count the residues of a downloaded AlphaFold model with pLDDT above ``threshold``.

    Loads ``af_dir / f"{af_id}.cif"`` with OpenEye and collects the residues of
    all atoms that carry residue information. The B-factor of each residue is
    used as its confidence (for these models the B-factor holds the pLDDT score).

    :param af_id: AlphaFold model identifier; the file stem of the downloaded ``.cif`` file.
    :param threshold: Exclusive pLDDT cutoff. Defaults to 90 (very high confidence).
    :return: Number of residues whose B-factor is strictly greater than ``threshold``, as an ``int``.
    """
    mol = load_openeye_cif(af_dir / f"{af_id}.cif")
    # A set is used to collapse the per-atom residue records to one per residue.
    # NOTE(review): this relies on OEResidue hashing/comparing by value — confirm,
    # otherwise the count is per atom rather than per residue.
    residues = {oechem.OEAtomGetResidue(atom) for atom in mol.GetAtoms() if oechem.OEHasResidue(atom)}
    bfactors = np.array([res.GetBFactor() for res in residues])
    # count_nonzero is the vectorised count of True entries; int() returns a
    # plain Python integer (also for an empty model, where the count is 0).
    return int(np.count_nonzero(bfactors > threshold))

In [9]:
# Component name -> fraction of the sequence with very high confidence
# (high-confidence residue count divided by the component length).
confidence_dict = {}
for p in tqdm(af_structures):
    component = p.components[0]
    confidence_dict[component.name] = calculate_high_confidence(p.af_id) / component.length

100%|██████████| 57/57 [00:04<00:00, 12.40it/s]


# Plot

## construct plotly df

In [62]:
# UniProt accession -> ADMET category, taken from the curated list.
category_dict = dict((entry['uniprot'], entry['admet_category']) for entry in adme_names)

In [63]:
import pandas as pd

In [64]:
# Inspect the names used as keys of confidence_dict.
# NOTE(review): the saved output of this cell is a NameError, presumably because it was
# last run in a kernel where confidence_dict had not been built yet — re-run top to bottom.
confidence_dict.keys()

NameError: name 'confidence_dict' is not defined

In [30]:
# Two-way lookup between the first component's name and its UniProt accession.
name_to_uniprot = dict(
    (component.name, component.uniprot_id)
    for component in (af.components[0] for af in af_structures)
)
uniprot_to_name = {accession: name for name, accession in name_to_uniprot.items()}

In [37]:
# One row per protein: name, high-confidence fraction, ADMET category
# (category looked up through the name -> UniProt accession map).
confidence_df = pd.DataFrame(
    list(confidence_dict.items()),
    columns=['Protein Name', 'Fraction'],
).assign(
    Category=lambda frame: [category_dict[name_to_uniprot[name]] for name in frame['Protein Name']]
)

In [56]:
# Order bars alphabetically by protein name. Rebinding to the sorted copy
# (instead of inplace=True) avoids mutating a frame shared between cells.
confidence_df = confidence_df.sort_values('Protein Name')

In [41]:
import plotly.express as px

In [61]:
# Discrete bar colors, one entry per category in the bar chart.
color_sequence = [
    "#00bf7d",
    "#00b4c5",
    "#c44601",
    "#2546f0",
    "#5928ed",
]

In [62]:
# Bar chart of the high-confidence fraction per protein, colored by ADMET category.
# Plotly Express `labels` is keyed by DataFrame column name (not by 'x'/'y'),
# so the axis titles are overridden through the column names.
fig = px.bar(
    confidence_df,
    x='Protein Name',
    y='Fraction',
    labels={'Protein Name': 'Protein', 'Fraction': 'Fraction'},
    title='Fraction of Sequence Modeled with Very High Confidence (pLDDT > 90)',
    template='simple_white',
    width=1200,
    height=600,
    # Fixes the category order, which also fixes which color each category gets.
    category_orders={'Category': ['metabolism_redox', 'metabolism', 'absorption', 'drug_transporter', 'toxicity']},
    color="Category",
    color_discrete_sequence=color_sequence,
)





In [63]:
# Render the current figure inline.
fig.show()

In [64]:
# Save the confidence bar chart. Uses the shared fig_dir (Path('../figures'))
# rather than rebuilding the same path, matching the other figure exports.
fig.write_image(fig_dir / 'alphafold_confidence.png')

In [65]:
# Number of proteins with a computed high-confidence fraction.
len(confidence_dict)

57

# Combine info

In [66]:
# First-component name -> AlphaFold structure object.
data_dict = dict((af.components[0].name, af) for af in af_structures)

In [67]:
# Number of distinct component names; fewer than len(af_structures) would mean
# two structures share a name and one was overwritten in the dict.
len(data_dict)

57

# Analyze Experimental Structures

In [47]:
import avoidome.target as target
# Re-import the module so edits made to its source since the first import
# take effect in the running kernel.
reload(target)

<module 'avoidome.target' from '/Users/alexpayne/Scientific_Projects/avoidome-analysis/avoidome/target.py'>

In [48]:
# Build one TargetStructureData per reloaded UniProt entry.
tsds = list(map(target.TargetStructureData.from_uniprot_entry, tqdm(uniprot_entries)))

100%|██████████| 57/57 [00:07<00:00,  7.15it/s]


In [49]:
# Number of targets built.
len(tsds)

57

In [50]:
# Take the first target as a worked example for the spot checks in the following cells.
tsd = tsds[0]

In [51]:
# Display the example target's name.
tsd.target_name

'NR1I2_HUMAN'

In [52]:
# Display the example target's experimental-structure count.
tsd.n_experimental_structures

53

In [53]:
# Display the example target's predicted-structure count.
tsd.n_predicted_structures

1

In [54]:
# Display the example target's average coverage.
# presumably the mean of the per-structure sequence_coverage values — TODO confirm in avoidome.target
tsd.average_coverage

0.6972437179375708

In [55]:
# Display the example target's average confidence.
# presumably the mean pLDDT of the predicted model — TODO confirm in avoidome.target
tsd.average_confidence

85.43

In [56]:
# Per-structure sequence coverage of the example target's experimental structures.
coverages = []
for structure in tsd.experimental_structures:
    coverages.append(structure.sequence_coverage)

In [57]:
# Quick look at the distribution of the per-structure coverage values collected above.
px.histogram(coverages)

In [58]:
# Display the example target's sequence length.
tsd.sequence_length

434

# Plot Experimental Structures

1) violin plot with dots only
2) color by category
3) size is the % sequence coverage

## make tidy dataframe

In [144]:
import avoidome.structures as structures
import pandas as pd

# Value stored in the "Resolution (A)" column for the extra AlphaFold row of
# each protein. It is a placeholder, not a measured resolution.
ALPHAFOLD_PLACEHOLDER_RESOLUTION = 2


def _structure_rows(tsd):
    """
    Build the tidy per-structure frame for one target.

    Produces one row per experimental structure plus one trailing 'AlphaFold'
    row. The row count is driven only by the per-structure lists, so every
    column always has the same length; per-target values are scalars that
    pandas broadcasts to each row.

    :param tsd: A TargetStructureData object.
    :return: DataFrame with the columns 'Protein Name', 'Category',
        'Average Sequence Coverage', 'Number of Experimental Structures',
        'Sequence Length', 'Resolution (A)' and 'Method'.
    """
    exp_structures = list(tsd.experimental_structures)
    return pd.DataFrame({
        'Protein Name': f'{tsd.target_name.split("_")[0]} ({tsd.uniprot_id})',
        'Category': category_dict[tsd.uniprot_id],
        'Average Sequence Coverage': tsd.average_coverage,
        'Number of Experimental Structures': tsd.n_experimental_structures,
        "Sequence Length": tsd.sequence_length,
        "Resolution (A)": [exp_struc.resolution for exp_struc in exp_structures]
                          + [ALPHAFOLD_PLACEHOLDER_RESOLUTION],
        "Method": [exp_struc.method for exp_struc in exp_structures] + ['AlphaFold'],
    })


# Per-target frames are concatenated as-is, so each target's rows keep their
# own 0..n-1 index (index values repeat across targets).
df = pd.concat([_structure_rows(tsd) for tsd in tsds])


Mean of empty slice.


invalid value encountered in double_scalars



In [247]:
# Display label ("<NAME> (<UNIPROT ID>)", same format as df['Protein Name']) -> target object.
tsd_dict = {f'{tsd.target_name.split("_")[0]} ({tsd.uniprot_id})': tsd for tsd in tsds}
# Marker symbol per structure-determination method.
symbols = {'X-ray': 'diamond-tall-open', 'EM': 'circle', 'AlphaFold': 'asterisk'}
# Marker color per ADMET category.
colors = {'metabolism_redox': '#d07c09', 'metabolism': '#ffb418', 'absorption': '#11efb7', 'drug_transporter': '#9553ff', 'toxicity': '#5e2bcb'}
category_list = sorted(colors)

# Map Protein Name to numerical values
# Rebind to a sorted copy rather than sorting in place, so this cell does not
# mutate the frame object produced by an earlier cell.
df = df.sort_values(['Category', 'Protein Name'])
protein_names = df['Protein Name'].unique()
protein_name_mapping = {name: i for i, name in enumerate(protein_names)}
# Half-width of the uniform horizontal jitter, in x-axis position units.
jitter_amount = 0.2

In [249]:
import plotly.graph_objects as go
import numpy as np

# Seeded generator so the horizontal jitter, and therefore the exported
# figure, is identical on every run.
JITTER_SEED = 0
jitter_rng = np.random.default_rng(JITTER_SEED)

# One scatter trace per (category, method) pair. Marker size is fixed; the
# average sequence coverage is shown through the annotations built below.
traces = []

for category in category_list:
    for method in df['Method'].unique():
        df_subset = df[(df['Method'] == method) & (df['Category'] == category)]
        is_alphafold = method == 'AlphaFold'

        # Integer x position of each row's protein.
        x_values = [protein_name_mapping[name] for name in df_subset['Protein Name']]
        if not is_alphafold:
            # Add random jitter to the x-axis values so overlapping structures
            # of the same protein are spread out.
            jitter = jitter_rng.uniform(low=-jitter_amount, high=jitter_amount, size=len(x_values))
            x_values = [x + offset for x, offset in zip(x_values, jitter)]

        trace = go.Scatter(
            x=x_values,
            y=df_subset['Resolution (A)'],
            mode='markers',
            name=f"{category} ({method})",
            marker=dict(size=10, color=colors[category], symbol=symbols[method], line_width=2, line_color='black'),
            # AlphaFold rows are drawn fully transparent and kept out of the legend.
            showlegend=not is_alphafold,
            opacity=0 if is_alphafold else 1,
        )
        traces.append(trace)

# Add text annotations for average sequence coverage
annotations = []
n_annotated = 0
for protein_name in protein_names:
    tsd = tsd_dict[protein_name]
    # Skip proteins whose average coverage is NaN.
    if np.isnan(tsd.average_coverage):
        continue
    annotations.append(
        dict(
            # protein_name is already in the display-label format used as mapping key.
            x=protein_name_mapping[protein_name],
            y=-0.5,
            text=f"{tsd.average_coverage:.2%}",
            showarrow=True,
            arrowhead=7,
            ax=20,
            # Cycle through five vertical offsets so neighbouring labels do not overlap.
            ay=-20 * (n_annotated % 5) - 10,
        )
    )
    n_annotated += 1

# Create layout
layout = go.Layout(
    title="Protein Analysis",
    xaxis=dict(title='Protein Name',
               # NOTE(review): positions run 0..n-1, so the upper bound leaves one
               # extra unit of empty space on the right — confirm this is intended.
               range=[-0.5, len(protein_name_mapping) + 0.5],
               tickvals=list(protein_name_mapping.values()),
               ticktext=protein_names),
    yaxis=dict(title='Resolution(A)'),
    template='simple_white',
    width=1600,
    height=600,
    annotations=annotations
)

# Create figure
fig = go.Figure(data=traces, layout=layout)

# Show plot
fig.show()


Mean of empty slice.


invalid value encountered in double_scalars



In [163]:
# Export the current figure to fig_dir in both raster (PNG) and vector (SVG) form.
for figure_filename in ('experimental_structure_analysis.png', 'experimental_structure_analysis.svg'):
    fig.write_image(fig_dir / figure_filename)

In [189]:
# Spot-check the rows of a single protein.
df.loc[df["Protein Name"].eq("ABCBB (O95342)")]

Unnamed: 0,Protein Name,Category,Average Sequence Coverage,Number of Experimental Structures,Sequence Length,Resolution (A),Method
3,ABCBB (O95342),drug_transporter,0.986374,3,1321,2.0,AlphaFold
2,ABCBB (O95342),drug_transporter,0.986374,3,1321,3.66,EM
1,ABCBB (O95342),drug_transporter,0.986374,3,1321,3.7,EM
0,ABCBB (O95342),drug_transporter,0.986374,3,1321,3.5,EM
