In [1]:
from pathlib import Path
import yaml
from tqdm import tqdm
import avoidome.uniprot as uniprot
import avoidome.schema as schema
from importlib import reload

# Download

In [2]:
# Project-relative locations for input data, cached downloads and figures.
data_dir = Path('../data')
fig_dir = Path('../figures')
uniprot_dir = data_dir / 'uniprot_downloads'
af_dir = data_dir / 'alphafold_downloads'

# Later cells write YAML caches, downloaded models and images into these
# directories; create them up front so a fresh checkout survives
# "Restart & Run All" instead of failing on the first write.
for output_dir in (fig_dir, uniprot_dir, af_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

## use curated data this time

In [3]:
# Read the curated ADMET target list (one record per protein) from YAML.
curated_names_path = data_dir / 'admet_names_curated.yml'
with curated_names_path.open() as handle:
    adme_names = yaml.safe_load(handle)

In [4]:
# Fetch every curated UniProt entry and look up its experimental structures.
# `success` / `failed` collect the UniprotEntry objects so later cells can
# report counts and persist the successful entries; `failure_reasons` keeps
# the exception for each failed accession so it can be inspected afterwards.
exp_structure_dict = {}
failed = []
success = []
failure_reasons = {}
for protein in tqdm(adme_names):
    uniprot_id = protein['uniprot']
    ue = uniprot.UniprotEntry.from_uniprot_id(uniprot_id)
    try:
        exp_structure_dict[ue.name] = ue.get_experimental_structures()
    except Exception as exc:
        # Still best-effort (the loop keeps going), but only real errors are
        # caught: a bare `except:` would also swallow KeyboardInterrupt and
        # make the loop impossible to cancel.
        failed.append(ue)
        failure_reasons[uniprot_id] = exc
    else:
        success.append(ue)

100%|██████████| 57/57 [00:39<00:00,  1.45it/s]


In [5]:
# Number of entries whose experimental-structure lookup raised.
len(failed)

0

In [6]:
# Number of entries whose experimental-structure lookup completed.
len(success)

57

In [7]:
# Cache each successfully fetched entry as <uniprot_dir>/<id>.yml.
for ue in tqdm(success):
    # A context manager guarantees the file is flushed and closed before the
    # next iteration, instead of leaving an anonymous handle to the garbage
    # collector.
    with open(f"{uniprot_dir / ue.data['id']}.yml", 'w') as f:
        yaml.safe_dump(ue.dict(), f)

100%|██████████| 57/57 [00:14<00:00,  3.97it/s]


# Reload

In [8]:
# Rebuild the UniprotEntry objects from the YAML files cached in uniprot_dir.
entries = list(uniprot_dir.glob('*.yml'))
uniprot_entries = []
for file in tqdm(entries):
    # Open through a context manager so each file handle is closed as soon as
    # it has been parsed (the previous inline open() was never closed).
    with open(file, 'r') as f:
        uniprot_entries.append(uniprot.UniprotEntry.from_dict(yaml.safe_load(f)))

100%|██████████| 57/57 [00:26<00:00,  2.18it/s]


# Get AlphaFold Structures

In [9]:
# get_alphafold_structures() returns a list; only its first element is kept
# because a single model per entry is expected.
af_structures = []
for u in tqdm(uniprot_entries):
    af_structures.append(u.get_alphafold_structures()[0])

100%|██████████| 57/57 [00:08<00:00,  6.92it/s]


## Download the structures

In [10]:
from asapdiscovery.data.utils import download_file

In [11]:
# Fetch each AlphaFold model to <af_dir>/<af_id>.cif, skipping files that are
# already on disk so re-running the cell does not download them again.
for p in af_structures:
    cif_path = af_dir / f"{p.af_id}.cif"
    if cif_path.exists():
        continue
    download_file(p.model_url, cif_path)

# Calculate the pLDDT

In [12]:
from asapdiscovery.data.openeye import load_openeye_cif, oechem
import numpy as np

In [13]:
def calculate_high_confidence(af_id, af_dir, cutoff=90):
    """
    Count the residues of a downloaded AlphaFold model whose B-factor exceeds ``cutoff``.

    The model ``<af_dir>/<af_id>.cif`` is loaded with OpenEye, the residue record
    of every atom that has one is collected into a set, and the B-factor of each
    collected residue is compared against ``cutoff``. For these models the
    B-factor holds the pLDDT score.

    :param af_id: AlphaFold identifier; the model is read from ``<af_id>.cif``.
    :param af_dir: directory containing the downloaded ``.cif`` files
        (must support the ``/`` path operator, e.g. ``pathlib.Path``).
    :param cutoff: B-factor (pLDDT) threshold, compared with a strict ``>``.
        Defaults to 90.
    :return: number of collected residues with a B-factor above ``cutoff``,
        as a plain ``int``.
    """
    mol = load_openeye_cif(af_dir / f"{af_id}.cif")
    # NOTE(review): the set only collapses the atoms of one residue into a
    # single entry if OEResidue objects hash/compare by value -- confirm.
    residues = {oechem.OEAtomGetResidue(atom) for atom in mol.GetAtoms() if oechem.OEHasResidue(atom)}
    bfactors = np.array([res.GetBFactor() for res in residues])
    # Count the True entries with the array's own sum (no Python-level loop)
    # and hand back a built-in int rather than a numpy scalar.
    return int((bfactors > cutoff).sum())

In [14]:
# Fraction of each protein's length that is modelled above the cutoff of 90,
# keyed by the name of the model's first component.
confidence_dict = {}
for p in tqdm(af_structures):
    component = p.components[0]
    confidence_dict[component.name] = calculate_high_confidence(p.af_id, af_dir, 90) / component.length

100%|██████████| 57/57 [00:05<00:00, 10.95it/s]


# Plot

## construct plotly df

In [15]:
# Look-up table: UniProt accession -> ADMET category from the curated YAML.
category_dict = dict((entry['uniprot'], entry['admet_category']) for entry in adme_names)

In [16]:
import pandas as pd

In [17]:
# Inspect which protein names received a confidence value.
confidence_dict.keys()

dict_keys(['NR1I2_HUMAN', 'ACM3_HUMAN', 'ARK73_HUMAN', 'CP2CJ_HUMAN', 'KCNH2_HUMAN', 'DHI1_HUMAN', 'FMO1_HUMAN', 'A1AG1_HUMAN', 'AOFA_HUMAN', 'ASM3A_HUMAN', 'SCN5A_HUMAN', 'AOXA_HUMAN', 'ADA2A_HUMAN', 'NR1I3_HUMAN', 'SO1B1_HUMAN', 'AL1A1_HUMAN', 'ACHA7_HUMAN', 'GSTA1_HUMAN', 'ACM2_HUMAN', 'CNR1_HUMAN', 'CACB1_HUMAN', 'CP2D6_HUMAN', 'S22A8_HUMAN', 'CP3A4_HUMAN', '5HT2B_HUMAN', 'ADRB2_HUMAN', 'ST1A1_HUMAN', 'CP1A2_HUMAN', 'ACM1_HUMAN', 'MDR1_HUMAN', 'SC6A3_HUMAN', 'ABCG2_HUMAN', 'CP2C9_HUMAN', 'ACHA9_HUMAN', 'ALBU_HUMAN', 'XDH_HUMAN', 'CNR2_HUMAN', 'CAC1C_HUMAN', 'GBRA1_HUMAN', 'CP2B6_HUMAN', 'ADRB1_HUMAN', 'S22A6_HUMAN', 'ADH1A_HUMAN', 'SO1B3_HUMAN', 'ACHA5_HUMAN', 'S47A1_HUMAN', 'SC6A2_HUMAN', 'ADA1A_HUMAN', 'MRP1_HUMAN', 'ACH10_HUMAN', 'SC6A4_HUMAN', 'ABCBB_HUMAN', 'S15A1_HUMAN', 'HRH1_HUMAN', 'ACHA3_HUMAN', 'AHR_HUMAN', 'AOFB_HUMAN'])

In [18]:
# Two-way lookup between the first component's name and its UniProt ID.
name_to_uniprot = dict(
    (af.components[0].name, af.components[0].uniprot_id) for af in af_structures
)
# Inverse mapping, built from the finished forward mapping.
uniprot_to_name = dict(zip(name_to_uniprot.values(), name_to_uniprot.keys()))

In [19]:
# Plotting table: one row per protein with its high-confidence fraction and
# its ADMET category (the category is looked up through the UniProt ID).
af_protein_names = list(confidence_dict)
confidence_df = pd.DataFrame({
    'Protein Name': af_protein_names,
    'Fraction': [confidence_dict[name] for name in af_protein_names],
    'Category': [category_dict[name_to_uniprot[name]] for name in af_protein_names],
})

In [20]:
# Order the rows alphabetically by protein name.
confidence_df = confidence_df.sort_values('Protein Name')

In [21]:
import plotly.express as px

In [22]:
# Discrete colour palette (hex codes) passed to plotly for the Category colours.
color_sequence = [
    "#00bf7d",
    "#00b4c5",
    "#c44601",
    "#2546f0",
    "#5928ed",
]

In [52]:
# Bar chart of the per-protein fraction of residues with pLDDT > 90,
# coloured by ADMET category.
fig = px.bar(
    confidence_df,
    x='Protein Name',
    y='Fraction',
    # plotly express matches the keys of `labels` against column names, so the
    # former 'x' / 'y' keys were silently ignored. Key by the plotted columns
    # so the intended axis titles are actually applied.
    labels={'Protein Name': 'Protein', 'Fraction': 'Fraction'},
    title='Fraction of Sequence Modeled with Very High Confidence (pLDDT > 90)',
    template='simple_white',
    width=1200,
    height=600,
    category_orders={'Category': ['metabolism', 'distribution', 'transporters', 'toxicity']},
    color="Category",
    color_discrete_sequence=color_sequence,
)





In [54]:
# Render the bar chart inline.
fig.show()

In [55]:
# Save the bar chart through the shared `fig_dir` defined with the other paths
# (the same '../figures' location that was previously spelled out by hand), so
# all figure exports in this notebook go through one setting.
fig.write_image(fig_dir / 'alphafold_confidence.png')

In [56]:
# Sanity check: number of proteins with a confidence value.
len(confidence_dict)

57

# Combine info

In [57]:
# Index the AlphaFold structure objects by the name of their first component.
data_dict = dict((af.components[0].name, af) for af in af_structures)

In [58]:
# Sanity check: the count drops below len(af_structures) if two models share a name.
len(data_dict)

57

# Analyze Experimental Structures

In [59]:
import avoidome.target as target
# Re-import the module so edits made to it during this session are picked up
# without restarting the kernel.
reload(target)

<module 'avoidome.target' from '/Users/alexpayne/Scientific_Projects/avoidome-analysis/avoidome/target.py'>

In [60]:
# Build one TargetStructureData object per reloaded UniProt entry.
tsds = []
for entry in tqdm(uniprot_entries):
    tsds.append(target.TargetStructureData.from_uniprot_entry(entry))

100%|██████████| 57/57 [00:07<00:00,  7.35it/s]


In [61]:
# Sanity check: one target object per UniProt entry.
len(tsds)

57

In [62]:
# Take the first target as a worked example for the inspection cells below.
tsd = tsds[0]

In [63]:
# Name of the example target.
tsd.target_name

'NR1I2_HUMAN'

In [64]:
# Experimental-structure count reported by the example target object.
tsd.n_experimental_structures

53

In [65]:
# Predicted-structure count reported by the example target object.
tsd.n_predicted_structures

1

In [66]:
# Average coverage reported by the example target object
# (presumably averaged over its experimental structures -- TODO confirm).
tsd.average_coverage

0.6972437179375708

In [67]:
# Average confidence reported by the example target object
# (presumably the mean pLDDT of the predicted model -- TODO confirm).
tsd.average_confidence

85.43

In [68]:
# Collect the sequence_coverage value of every experimental structure of the example target.
coverages = list(map(lambda exp_struc: exp_struc.sequence_coverage, tsd.experimental_structures))

In [69]:
# Quick look at the distribution of the per-structure coverage values.
px.histogram(coverages)

In [70]:
# Sequence length of the example target (used below to normalise coverage).
tsd.sequence_length

434

# Plot Experimental Structures

1) violin plot with dots only
2) color by category
3) size is the % sequence coverage

## make tidy dataframe

In [71]:
import avoidome.structures as structures
import pandas as pd

# Placeholder resolution given to the extra 'AlphaFold' row of every target.
# That row exists so each protein has at least one entry; its markers are
# drawn with opacity 0 in the resolution/coverage scatter plots.
ALPHAFOLD_PLACEHOLDER_RESOLUTION = 2


def _target_rows(target_data):
    """Return the tidy per-structure rows of one target as a DataFrame.

    One row is produced per experimental structure, followed by one final
    'AlphaFold' row. Per-target values (IDs, category, averages, length) are
    repeated on every row. The row count is taken from
    ``target_data.structures``, which is expected to contain one more item
    than ``target_data.experimental_structures``; pandas raises ``ValueError``
    if the column lengths disagree.

    :param target_data: a target object from ``tsds``.
    :return: ``pandas.DataFrame`` with one row per structure.
    """
    experimental = target_data.experimental_structures
    n_rows = sum(1 for _ in target_data.structures)
    display_name = f'{target_data.target_name.split("_")[0]} ({target_data.uniprot_id})'
    return pd.DataFrame({
        'Uniprot ID': target_data.uniprot_id,
        'Protein Name': [display_name] * n_rows,
        'Category': [category_dict[target_data.uniprot_id]] * n_rows,
        'Average Sequence Coverage': target_data.average_coverage,
        # Experimental rows: per-structure coverage divided by the sequence
        # length. AlphaFold row: the target's average coverage.
        "Sequence Coverage": [exp_struc.sequence_coverage / target_data.sequence_length
                              for exp_struc in experimental] + [target_data.average_coverage],
        'Number of Experimental Structures': [target_data.n_experimental_structures] * n_rows,
        "Sequence Length": [target_data.sequence_length] * n_rows,
        "Resolution (A)": [exp_struc.resolution for exp_struc in experimental]
                          + [ALPHAFOLD_PLACEHOLDER_RESOLUTION],
        "Method": [exp_struc.method for exp_struc in experimental] + ['AlphaFold'],
    })


# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating every target's own 0..k labels.
df = pd.concat([_target_rows(tsd) for tsd in tsds], ignore_index=True)


Mean of empty slice.


invalid value encountered in double_scalars



In [88]:
# Lookup from the "<NAME> (<UNIPROT ID>)" display label to its target object.
tsd_dict = dict(
    (f'{tsd.target_name.split("_")[0]} ({tsd.uniprot_id})', tsd) for tsd in tsds
)

# Marker symbol per structure-determination method.
symbols = {
    'X-ray': 'diamond-tall',
    'EM': 'circle',
    'AlphaFold': 'circle-open',
}

# Marker colour per ADMET category.
# Earlier palette, kept for reference:
# colors = {'metabolism_redox': '#d07c09', 'metabolism': '#ffb418', 'absorption': '#11efb7', 'drug_transporter': '#9553ff', 'toxicity': '#5e2bcb'}
colors = {
    'metabolism': '#ffb418',
    'distribution': '#11efb7',
    'transporters': '#d07c09',
    'toxicity': '#5e2bcb',
}
category_list = sorted(colors)

# Give every protein label an integer x position, ordered by category and
# then by label.
df = df.sort_values(['Category', 'Protein Name'])
protein_names = df['Protein Name'].unique()
protein_name_mapping = dict(zip(protein_names, range(len(protein_names))))

# Half-width of the random horizontal offset applied to experimental markers.
jitter_amount = 0.2

In [89]:
import plotly.graph_objects as go
import numpy as np

# Seeded generator: the horizontal jitter, and therefore the exported figure,
# is identical on every run instead of changing with each execution.
jitter_rng = np.random.default_rng(0)

# One scatter trace per (category, method) combination.
resolution_traces = []

for category in category_list:
    for method in df['Method'].unique():
        df_subset = df[(df['Method'] == method) & (df['Category'] == category)]
        is_alphafold = method == 'AlphaFold'

        # marker_size = df_subset["Average Sequence Coverage"].apply(lambda x: x * 16 if not np.isnan(x) else 10)

        # Integer axis position of each row's protein.
        x_values = np.array([protein_name_mapping[name] for name in df_subset['Protein Name']], dtype=float)
        if not is_alphafold:
            # Spread experimental markers sideways so they do not overlap;
            # AlphaFold rows stay centred on their tick.
            x_values = x_values + jitter_rng.uniform(low=-jitter_amount, high=jitter_amount, size=len(df_subset))

        trace = go.Scatter(
            x=x_values,
            y=df_subset['Resolution (A)'],
            mode='markers',
            name=f"{category} ({method})",
            marker=dict(size=10, color=colors[category], symbol=symbols[method], line_width=2 if method == "EM" else 1, line_color='black'),
            # AlphaFold rows carry a placeholder resolution: keep them out of
            # the legend and draw them fully transparent.
            showlegend=not is_alphafold,
            opacity=0 if is_alphafold else 1,
        )
        resolution_traces.append(trace)

# Create layout
layout = go.Layout(
    title="Analysis of Experimental Structure Quality for ADMET Targets",
    xaxis=dict(title='Protein Name (Uniprot ID)',
               range=[-0.5, len(protein_name_mapping) + 0.5],
               tickvals=list(protein_name_mapping.values()),
               ticktext=protein_names),
    yaxis=dict(title='Resolution(A)'),
    template='simple_white',
    width=1600,
    height=600
)

# Create figure
fig = go.Figure(data=resolution_traces, layout=layout)

# Show plot
fig.show()

In [90]:
# Export the figure currently bound to `fig` as both PNG and SVG.
for image_name in ('experimental_structure_analysis.png', 'experimental_structure_analysis.svg'):
    fig.write_image(fig_dir / image_name)

# Make similar plot for coverage

In [91]:
# Seeded generator: the horizontal jitter, and therefore the rendered figure,
# is identical on every run instead of changing with each execution.
jitter_rng = np.random.default_rng(0)

# One scatter trace per (category, method) combination.
coverage_traces = []

for category in category_list:
    for method in df['Method'].unique():
        df_subset = df[(df['Method'] == method) & (df['Category'] == category)]
        is_alphafold = method == 'AlphaFold'

        # marker_size = df_subset["Average Sequence Coverage"].apply(lambda x: x * 16 if not np.isnan(x) else 10)

        # Integer axis position of each row's protein.
        x_values = np.array([protein_name_mapping[name] for name in df_subset['Protein Name']], dtype=float)
        if not is_alphafold:
            # Spread experimental markers sideways so they do not overlap;
            # AlphaFold rows stay centred on their tick.
            x_values = x_values + jitter_rng.uniform(low=-jitter_amount, high=jitter_amount, size=len(df_subset))

        trace = go.Scatter(
            x=x_values,
            y=df_subset['Sequence Coverage'],
            mode='markers',
            name=f"{category} ({method})",
            marker=dict(size=10, color=colors[category], symbol=symbols[method], line_width=2 if method == "EM" else 1, line_color='black'),
            # AlphaFold rows are placeholders here: keep them out of the
            # legend and draw them fully transparent.
            showlegend=not is_alphafold,
            opacity=0 if is_alphafold else 1,
        )
        coverage_traces.append(trace)

# Create layout
layout = go.Layout(
    title="Analysis of Experimental Structure Quality for ADMET Targets",
    xaxis=dict(title='Protein Name (Uniprot ID)',
               range=[-0.5, len(protein_name_mapping) + 0.5],
               tickvals=list(protein_name_mapping.values()),
               ticktext=protein_names),
    yaxis=dict(title='Sequence Coverage'),
    template='simple_white',
    width=1600,
    height=600
)

# Create figure
fig = go.Figure(data=coverage_traces, layout=layout)

# Show plot
fig.show()

# Make Combined Figure

In [92]:
# Seeded generator: the horizontal jitter, and therefore the exported figure,
# is identical on every run instead of changing with each execution.
jitter_rng = np.random.default_rng(0)

# Traces for the three stacked panels: resolution and coverage (experimental
# structures) and high-confidence fraction (AlphaFold models).
coverage_traces = []
resolution_traces = []
confidence_trace = []
for category in category_list:
    for method in df['Method'].unique():
        df_subset = df[(df['Method'] == method) & (df['Category'] == category)]

        # marker_size = df_subset["Average Sequence Coverage"].apply(lambda x: x * 16 if not np.isnan(x) else 10)

        # Integer axis position of each row's protein.
        x_values = np.array([protein_name_mapping[name] for name in df_subset['Protein Name']], dtype=float)
        marker_dict = dict(size=10, color=colors[category], symbol=symbols[method], line_width=2 if method == "EM" else 1, line_color='black')

        if method == 'AlphaFold':
            # AlphaFold rows stay centred on their tick and are plotted by the
            # fraction stored in confidence_dict (looked up via the UniProt ID).
            af_confidence = [confidence_dict[uniprot_to_name[uniprot_id]] for uniprot_id in df_subset['Uniprot ID']]
            confidence_trace.append(go.Scatter(
                x=x_values,
                y=af_confidence,
                mode='markers',
                name=method,
                marker=marker_dict,
                # Only the last category contributes this method's legend entry.
                showlegend=category == category_list[-1],
                legendgroup=method,
            ))
            continue

        # Experimental structures: spread markers sideways so they do not
        # overlap. The same x positions are used in both panels.
        x_values = x_values + jitter_rng.uniform(low=-jitter_amount, high=jitter_amount, size=len(df_subset))

        # Resolution panel: legend entries explain the colour (one per
        # category, taken from the X-ray trace).
        resolution_traces.append(go.Scatter(
            x=x_values,
            y=df_subset['Resolution (A)'],
            mode='markers',
            name=category,
            marker=marker_dict,
            showlegend=method == 'X-ray',
            opacity=1,
            legendgroup=category,
            legendgrouptitle=dict(text="ADMET Category (Color)") if category == category_list[0] else None,
        ))

        # Coverage panel: legend entries explain the symbol (one per method,
        # taken from the last category).
        coverage_traces.append(go.Scatter(
            x=x_values,
            y=df_subset['Sequence Coverage'],
            mode='markers',
            name=method,
            marker=marker_dict,
            showlegend=category == category_list[-1],
            opacity=1,
            legendgroup=method,
            legendgrouptitle=dict(text="Method (Symbol)") if method == 'X-ray' else None,
        ))
        

In [93]:
from plotly.subplots import make_subplots

In [94]:
# Create subplot figure with three rows and one column; the rows share one
# x axis and the first row is given the largest share of the height.
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.01, row_heights=[0.4, 0.3, 0.3])

In [95]:
# Put each group of traces into its own row of the single-column grid:
# resolution on top, coverage in the middle, AlphaFold confidence at the bottom.
panel_traces = ((1, resolution_traces), (2, coverage_traces), (3, confidence_trace))
for row_number, traces in panel_traces:
    for trace in traces:
        fig.add_trace(trace, row=row_number, col=1)

# get correct protein names

In [96]:
# UniProt accession -> display name ('grant_name') from the curated YAML.
uniprot_to_grant_name = {ref['uniprot']: ref['grant_name'] for ref in adme_names}

In [97]:
# Tick positions come from protein_name_mapping; tick labels are the grant
# names of the UniProt IDs in df's current row order.
# NOTE(review): labels and positions only line up while df keeps the
# Category / Protein Name sort order used to build protein_name_mapping -- confirm.
tick_positions = list(protein_name_mapping.values())
tick_labels = [uniprot_to_grant_name[uniprot_id] for uniprot_id in df['Uniprot ID'].unique()]
x_limits = [-0.5, len(protein_name_mapping) + 0.5]

# Only the bottom panel's x axis (xaxis3) carries the title and tick labels.
bottom_xaxis = dict(
    title='Protein Name (Uniprot ID)',
    range=x_limits,
    tickvals=tick_positions,
    ticktext=tick_labels,
)

layout = go.Layout(
    title="Analysis of Structural Data for ADMET Targets",
    xaxis3=bottom_xaxis,
    yaxis=dict(title='Resolution (A)'),
    yaxis2=dict(title='Fraction of Sequence <br> Covered by Structure'),
    yaxis3=dict(title='Fraction Modeled with <br> Very High Confidence <br> (pLDDT > 90)'),
    template='simple_white',
    width=1600,
    height=800,
)
fig.update_layout(layout)
fig.show()

In [98]:
# Export the combined three-panel figure as both PNG and SVG.
for image_name in ('combined_structure_analysis.png', 'combined_structure_analysis.svg'):
    fig.write_image(fig_dir / image_name)