In [None]:
import sys

import pandas as pd

# The parent directory must be on sys.path *before* any project-local import,
# otherwise a fresh kernel may fail to resolve the modules below.
# NOTE(review): presumably both `manuscript_notebooks` and `plots_interpretation`
# live under "../" — confirm against the repository layout.
if "../" not in sys.path:  # keep re-runs of this cell from appending duplicates
    sys.path.append("../")

from manuscript_notebooks.DMS_utils import (
    export_colored_pdb,
    plot_dms_heatmap,
    plot_positions_hm,
    plot_risk_hist,
    prepare_full_df,
)
from plots_interpretation import (
    create_plot_grid,
    plot_probs_along_seq,
    visualize_structure_continuous_color,
)

# Setup

Follow the steps below, then run the full notebook:
- Step 1: download 'full_dataset.csv' from OSF and place it in the 'inputs' folder.
- Step 2: download your protein's PDB structure and place it in 'inputs/structure_files', saved as '<protein_id>.pdb'.
- Step 3: edit the cell below with your organism and gene name (matching the full dataset's format where possible), protein_id, WT sequence, and the file path to your output predictions CSV.

In [None]:
# User settings: replace each placeholder before running the rest of the notebook.
selected_org = "your_organism"
selected_gene = "your_gene"
# Must equal the PDB file name without the ".pdb" extension: later cells load
# f"{pdb_folder}{protein_id}.pdb" and filter rows on uniprot_id == protein_id.
protein_id = "your_id"
wt = "your_wt_aa_sequence"  # wild-type amino-acid sequence
preds_path = "inputs/dms_predict.csv"  # CSV holding your output predictions

In [None]:
---Run---

In [None]:
# Fixed input locations (relative paths).
# Keep the trailing slash on pdb_folder: file names are appended to it directly.
pdb_folder = "inputs/structure_files/"
original_data_path = "inputs/full_dataset.csv"

In [None]:
# Build the working table from the prediction CSV, the reference dataset and
# the user settings defined above.
full_data = prepare_full_df(
    preds_path,
    original_data_path,
    wt,
    selected_org,
    selected_gene,
    protein_id,
)

In [None]:
# Preview the first rows of the frame returned by prepare_full_df.
full_data.head(n=5)

In [None]:
# `binary_predictions` along the sequence, using the helper's default mode.
plot_probs_along_seq(
    full_data,
    protein_id,
    feature="binary_predictions",
    fill_missing=True,
)

In [None]:
# Same feature, drawn with mode='points'.
plot_probs_along_seq(
    full_data,
    protein_id,
    feature="binary_predictions",
    fill_missing=True,
    mode="points",
)

In [None]:
# Same feature, drawn with mode='lines'.
plot_probs_along_seq(
    full_data,
    protein_id,
    feature="binary_predictions",
    fill_missing=True,
    mode="lines",
)

In [None]:
# No `feature` given, so the helper's default feature is plotted.
plot_probs_along_seq(
    full_data,
    protein_id,
    fill_missing=True,
)

In [None]:
# Heatmap of the `binary_predictions` column; the returned frame is kept for later use.
heatmap_df = plot_dms_heatmap(
    full_data,
    "binary_predictions",
    selected_org,
    selected_gene,
    figsize=(50, 16),
)

In [None]:
# Heatmap of the `probabilities` column; the returned frame feeds the
# per-position plots further down.
heatmap_df_probs = plot_dms_heatmap(
    full_data,
    "probabilities",
    selected_org,
    selected_gene,
    figsize=(50, 16),
)

In [None]:
# Colour chain A of the structure by the `probabilities` column of this protein's rows.
# NOTE(review): `view` is only assigned here; if the helper does not render the
# viewer itself, end the cell with `view` to display it — confirm.
view = visualize_structure_continuous_color(
    pdb_file=f"{pdb_folder}{protein_id}.pdb",
    value_df=full_data.loc[full_data["uniprot_id"] == protein_id],
    value_col="probabilities",
    position_col="position",
    chain="A",
    colormap="viridis",
)

In [None]:
# Save a PDB file carrying the prediction values, to load in PyMOL if needed.
export_colored_pdb(
    pdb_file=f"{pdb_folder}{protein_id}.pdb",
    value_df=full_data.loc[full_data["uniprot_id"] == protein_id],
    value_col="probabilities",
    position_col="position",
    chain="A",
    out_pdb=f"colored_{selected_org}_{selected_gene}.pdb",
)

In [None]:
# Histogram of the `probabilities` column.
# NOTE(review): gene is passed before organism here, unlike the heatmap helpers —
# order kept as in the original call; verify against plot_risk_hist's signature.
plot_risk_hist(
    full_data["probabilities"],
    selected_gene,
    selected_org,
    figsize=(12, 6), cmap="viridis", alpha=0.8,
)

In [None]:
# Share of all rows (in percent) falling into each risk label.
full_data["risk"].value_counts() / len(full_data) * 100

In [None]:
# Absolute number of rows per risk label.
full_data["risk"].value_counts()

In [None]:
def plot_random_position_heatmap(risk_group, full_data, heatmap_df, wt, selected_org, selected_gene):
    """Plot the heatmap column of the top-scoring position in a risk group.

    Despite the name, the position is not random: among the rows of
    ``full_data`` whose ``tag`` is not ``'original'`` and whose ``risk`` equals
    ``risk_group``, the row with the highest ``probabilities`` value decides
    which position is shown.

    Args:
        risk_group: Value compared against ``full_data['risk']``.
        full_data: DataFrame with ``tag``, ``risk``, ``probabilities`` and
            ``position`` columns.
        heatmap_df: DataFrame with one column per position. Row labels are
            presumably residue letters (TODO confirm against plot_dms_heatmap).
        wt: Wild-type sequence; the position is used as a 1-based index into it.
        selected_org: Forwarded to ``plot_positions_hm``.
        selected_gene: Forwarded to ``plot_positions_hm``.

    Returns:
        None. The plot is drawn by ``plot_positions_hm`` and a short summary is
        printed. If no row matches ``risk_group``, a notice is printed and
        nothing is plotted.
    """
    candidates = full_data[(full_data['tag'] != 'original') & (full_data['risk'] == risk_group)]
    if candidates.empty:
        # Without this guard the lookup below raised a bare IndexError, which
        # aborted the notebook whenever a risk label was absent from the data.
        print(f'No non-original entries with risk = {risk_group!r}; nothing to plot.')
        return

    position = candidates.sort_values(by='probabilities', ascending=False)['position'].iloc[0]
    position_df = pd.DataFrame(heatmap_df[position])
    plot_positions_hm(position_df, risk_group, selected_org, selected_gene)
    print(f'Most likely resistant = {position_df.idxmax().item()}')
    # The single column label is the position; subtract 1 to index into `wt`.
    wt_res = wt[position_df.columns.item() - 1]
    print(f'WT residue = {wt_res}')
    # Exclude the wild-type row so the minimum refers to an actual substitution.
    wt_drop = position_df.drop(index=wt_res)
    print(f'Least likely resistant (not wt) = {wt_drop.idxmin().item()}')

In [None]:
# Top-probability position among the 'Very High' risk rows.
plot_random_position_heatmap(
    risk_group="Very High",
    full_data=full_data,
    heatmap_df=heatmap_df_probs,
    wt=wt,
    selected_org=selected_org,
    selected_gene=selected_gene,
)

In [None]:
# Top-probability position among the 'High' risk rows.
plot_random_position_heatmap(
    risk_group="High",
    full_data=full_data,
    heatmap_df=heatmap_df_probs,
    wt=wt,
    selected_org=selected_org,
    selected_gene=selected_gene,
)

In [None]:
# Top-probability position among the 'Medium' risk rows.
plot_random_position_heatmap(
    risk_group="Medium",
    full_data=full_data,
    heatmap_df=heatmap_df_probs,
    wt=wt,
    selected_org=selected_org,
    selected_gene=selected_gene,
)

In [None]:
# Top-probability position among the 'Low' risk rows.
plot_random_position_heatmap(
    risk_group="Low",
    full_data=full_data,
    heatmap_df=heatmap_df_probs,
    wt=wt,
    selected_org=selected_org,
    selected_gene=selected_gene,
)

In [None]:
# Top-probability position among the 'Very Low' risk rows.
plot_random_position_heatmap(
    risk_group="Very Low",
    full_data=full_data,
    heatmap_df=heatmap_df_probs,
    wt=wt,
    selected_org=selected_org,
    selected_gene=selected_gene,
)