In [4]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import plotly.express as px


In [5]:
# Define the path to the CSV file
csv_file_path = "../../Jazzer_surf/3d_predictions/chimeras/test/interactions_all_files.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Keep only the rows where the resn is MG and the interacting_resn is ATP or ADP.
# .copy() detaches the filtered rows from the frame they were sliced from, so adding
# the 'atom_order' column below does not raise pandas' SettingWithCopyWarning.
df = df[(df['resn'] == 'MG') & df['interacting_resn'].isin(['ADP', 'ATP'])].copy()

# Display order for the x-axis: the PA, PB and PG groups, each followed by its
# O1/O2/O3 atoms (the gamma group previously listed O2G before O1G, contradicting
# the intended order and the pattern of the other two groups).
atom_order = [
    'PA', 'O1A', 'O2A', 'O3A',
    'PB', 'O1B', 'O2B', 'O3B',
    'PG', 'O1G', 'O2G', 'O3G',
]
# Atoms that are not listed in atom_order become NaN in this categorical column
# and are placed last by sort_values.
df['atom_order'] = pd.Categorical(df['interacting_atom'], categories=atom_order, ordered=True)
df = df.sort_values(by='atom_order')

# Create an interactive strip plot using Plotly
fig = px.strip(df, x='interacting_atom', y='distance (angstroms)', color='file',
               category_orders={"interacting_atom": atom_order},
               title='Distance between MG and ATP/ADP atoms',
               labels={'interacting_atom': 'Interacting atom', 'distance (angstroms)': 'Distance (angstroms)'},
               hover_data=['file'])

# Fix the y-axis range and force every atom in atom_order onto the x-axis,
# in that order, even when an atom has no points.
fig.update_layout(
    legend=dict(
        title=dict(text='File'),
        itemsizing='constant'
    ),
    xaxis_title='Interacting atom',
    yaxis_title='Distance (angstroms)',
    yaxis=dict(range=[0, 5]),
    xaxis=dict(categoryorder='array', categoryarray=atom_order)
)

# Show the plot
fig.show()


In [6]:
# Location of the combined interactions table
csv_file_path = "../../Jazzer_surf/3d_predictions/chimeras/test/interactions_all_files.csv"

# Load the table, keep the closest contact for every
# (file, chain, resi, resn, interacting_chain, interacting_resn) combination,
# then append a 'residue' label built from "resi" + "residue_one_letter".
df = (
    pd.read_csv(csv_file_path)
    .pipe(
        lambda frame: frame.loc[
            frame.groupby(
                ['file', 'chain', 'resi', 'resn', 'interacting_chain', 'interacting_resn']
            )['distance (angstroms)'].idxmin()
        ]
    )
    .assign(residue=lambda frame: frame['resi'].astype(str) + frame['residue_one_letter'])
)

df.head()

Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,residue
8315,fold_k401_adp_model_0,A,18,ARG,NH2,N6,ADP,C,3.788445,R,Nitrogen eta 2,Nitrogen 6,18R
7153,fold_k401_adp_model_0,A,19,PHE,O,N6,ADP,C,4.456158,F,Oxygen,Nitrogen 6,19F
7693,fold_k401_adp_model_0,A,20,ARG,CB,N7,ADP,C,3.454982,R,Carbon beta,Nitrogen 7,20R
8091,fold_k401_adp_model_0,A,21,PRO,CD,N1,ADP,C,3.42373,P,Carbon delta,Nitrogen 1,21P
8566,fold_k401_adp_model_0,A,61,PRO,O,N6,ADP,C,4.473644,P,Oxygen,Nitrogen 6,61P


In [8]:
# Show only the rows that belong to the fold_k401_adp_model_0 prediction
df[df['file'].eq('fold_k401_adp_model_0')]

Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,residue
8315,fold_k401_adp_model_0,A,18,ARG,NH2,N6,ADP,C,3.788445,R,Nitrogen eta 2,Nitrogen 6,18R
7153,fold_k401_adp_model_0,A,19,PHE,O,N6,ADP,C,4.456158,F,Oxygen,Nitrogen 6,19F
7693,fold_k401_adp_model_0,A,20,ARG,CB,N7,ADP,C,3.454982,R,Carbon beta,Nitrogen 7,20R
8091,fold_k401_adp_model_0,A,21,PRO,CD,N1,ADP,C,3.423730,P,Carbon delta,Nitrogen 1,21P
8566,fold_k401_adp_model_0,A,61,PRO,O,N6,ADP,C,4.473644,P,Oxygen,Nitrogen 6,61P
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8547,fold_k401_adp_model_0,D,1,MG,MG,CE,LYS,A,4.960289,?,Magnesium,Carbon epsilon,1?
8163,fold_k401_adp_model_0,D,1,MG,MG,OG,SER,A,4.723587,?,Magnesium,Oxygen gamma,1?
7888,fold_k401_adp_model_0,D,1,MG,MG,OG1,THR,A,2.127725,?,Magnesium,Oxygen gamma 1,1?
7665,fold_k401_adp_model_0,D,1,MG,MG,O3B,ADP,C,2.196621,?,Magnesium,Oxygen 3 beta,1?


In [16]:
# Inspect the current contents of df (all files)
df

Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,residue
8315,fold_k401_adp_model_0,A,18,ARG,NH2,N6,ADP,C,3.788445,R,Nitrogen eta 2,Nitrogen 6,18R
7153,fold_k401_adp_model_0,A,19,PHE,O,N6,ADP,C,4.456158,F,Oxygen,Nitrogen 6,19F
7693,fold_k401_adp_model_0,A,20,ARG,CB,N7,ADP,C,3.454982,R,Carbon beta,Nitrogen 7,20R
8091,fold_k401_adp_model_0,A,21,PRO,CD,N1,ADP,C,3.423730,P,Carbon delta,Nitrogen 1,21P
8566,fold_k401_adp_model_0,A,61,PRO,O,N6,ADP,C,4.473644,P,Oxygen,Nitrogen 6,61P
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,fold_k401_atp_tub_model_0,K,1,MG,MG,OD2,ASP,D,4.218962,?,Magnesium,Oxygen delta 2,1?
1726,fold_k401_atp_tub_model_0,K,1,MG,MG,OE1,GLN,D,2.730962,?,Magnesium,Oxygen epsilon 1,1?
3904,fold_k401_atp_tub_model_0,K,1,MG,MG,MG,MG,K,0.000000,?,Magnesium,Magnesium,1?
2338,fold_k401_atp_tub_model_0,L,1,MG,MG,O2B,ADP,F,2.311070,?,Magnesium,Oxygen 2 beta,1?


In [4]:
# Restrict df to chain A residues that contact chain C in the 'a_atp_mg' file
df = df.loc[
    df['chain'].eq('A')
    & df['interacting_chain'].eq('C')
    & df['file'].eq('a_atp_mg')
]

# Residue labels of the remaining contacts
df['residue'].values

array(['18R', '19F', '20R', '21P', '61P', '93Q', '94T', '95S', '96S',
       '97G', '98K', '99T', '100H', '101T', '205N', '208S', '209S',
       '238D', '239L', '240A', '241G'], dtype=object)

In [29]:
import pandas as pd
import plotly.express as px

# Define the path to the CSV file
csv_file_path = "../../Jazzer_surf/3d_predictions/chimeras/test/interactions_all_files.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# For every (chain A residue index, interacting chain) pair keep the row with the shortest distance.
# NOTE(review): the grouping ignores the 'file' column, so the minimum is taken across
# every file in the CSV - confirm this pooling is intended.
df = df.loc[df[df['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]

# Exclude interactions with Mg and chain letters A, G, H.
# .copy() detaches the filtered rows so the column assignments below do not raise
# pandas' SettingWithCopyWarning.
df = df[(df['interacting_resn'] != 'MG') & (~df['interacting_chain'].isin(['A', 'G', 'H']))].copy()

# Map the three-letter residue name (resn) to its single-letter code for better readability
aa_dict = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
    'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
    'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}
df['residue_one_letter'] = df['resn'].map(aa_dict)
df['residue'] = df['resi'].astype(str) + df['residue_one_letter']

# Replace chain letters with descriptive names
chain_map = {
    'B': 'kinesin B',
    'C': 'alpha-tubulin',
    'D': 'beta-tubulin',
    'E': 'ATP',
    'F': 'ATP'
}
# Series.map yields NaN for letters missing from chain_map; fall back to the raw
# chain letter so those rows keep an identifiable label.
df['interacting_chain'] = df['interacting_chain'].map(chain_map).fillna(df['interacting_chain'])

# Create the interactive plot.
# The title describes what is plotted: MG rows were removed above, so the earlier
# "Distance between MG and ATP/ADP atoms" wording did not match the data.
fig = px.scatter(df, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                 labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                 title="Shortest distance from chain A residues to interacting chains by amino acid position")

# Update layout for better visualization
fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
fig.update_layout(
    legend_title_text='Interacting Chain',
    xaxis_range=[0, 401],  # Set a fixed range for the x-axis
    yaxis_range=[0, 5.25]  # Set a fixed range for the y-axis
)

# Show the plot
fig.show()


In [30]:
import pandas as pd
import plotly.express as px

# Path to the combined interactions CSV (relative path)
csv_file_path = "../../Jazzer_surf/3d_predictions/chimeras/test/interactions_all_files.csv"

# Re-read the full table so this cell starts from the unfiltered data,
# independent of how earlier cells reassigned df
df = pd.read_csv(csv_file_path)

# Function to create a plot for each file
def create_plots(df):
    """Build and show one chain A distance scatter plot per file.

    For each distinct value of ``df['file']`` the rows of chain A are reduced to
    the shortest-distance row per (resi, interacting_chain) pair, MG contacts and
    contacts with chains A, G and H are dropped, and the result is drawn with
    Plotly Express (x = residue index, y = distance, colour = interacting chain).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'file', 'chain', 'resi', 'resn',
        'interacting_chain', 'interacting_resn' and 'distance (angstroms)'.

    Returns
    -------
    list
        The Plotly figures, one per file, in order of first appearance in ``df``.
        Each figure is also displayed via ``fig.show()`` as it is created.
    """
    # Both lookup tables are loop-invariant, so they are built once.
    # Three-letter residue name -> single-letter code
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }
    # Chain letter -> descriptive name
    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    plots = []

    for file in df['file'].unique():
        df_file = df[df['file'] == file]

        # Shortest distance for each (chain A residue index, interacting chain) pair
        chain_a = df_file[df_file['chain'] == 'A']
        closest_idx = chain_a.groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()
        df_filtered = df_file.loc[closest_idx]

        # Exclude interactions with Mg and chain letters A, G, H.
        # .copy() avoids SettingWithCopyWarning on the column assignments below.
        keep = (df_filtered['interacting_resn'] != 'MG') & (~df_filtered['interacting_chain'].isin(['A', 'G', 'H']))
        df_filtered = df_filtered[keep].copy()

        # Single-letter code of the chain A residue (resn) for the point labels
        df_filtered['residue_one_letter'] = df_filtered['resn'].map(aa_dict)
        df_filtered['residue'] = df_filtered['resi'].astype(str) + df_filtered['residue_one_letter']

        # Series.map yields NaN for letters missing from chain_map; fall back to the
        # raw chain letter so those rows keep an identifiable label.
        df_filtered['interacting_chain'] = (
            df_filtered['interacting_chain'].map(chain_map).fillna(df_filtered['interacting_chain'])
        )

        # Create the interactive plot.
        # MG rows were removed above, so the title names what is actually plotted.
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Shortest distance from chain A residues to interacting chains for {file}")

        # Update layout for better visualization
        fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
        fig.update_layout(
            legend_title_text='Interacting Chain',
            xaxis_range=[0, 401],  # Set a fixed range for the x-axis
            yaxis_range=[0, 5.25]  # Set a fixed range for the y-axis
        )

        # Show the plot
        fig.show()

        plots.append(fig)

    return plots

# Build one scatter figure per file in df; create_plots shows each figure as it
# is created and returns them all as a list
plots = create_plots(df)


In [31]:
import pandas as pd
import plotly.express as px

# Path to the combined interactions CSV (relative path)
csv_file_path = "../../Jazzer_surf/3d_predictions/chimeras/test/interactions_all_files.csv"

# Re-read the full table so this cell starts from the unfiltered data,
# independent of how earlier cells reassigned df
df = pd.read_csv(csv_file_path)

# Function to create a plot for each file
def create_plots(df):
    """Build and show one chain A distance scatter plot per file.

    For each distinct value of ``df['file']`` the rows of chain A are reduced to
    the shortest-distance row per (resi, interacting_chain) pair, MG contacts and
    contacts with chains A, G and H are dropped, chain letters are replaced by
    descriptive names (a different table is used when the file name contains
    "tub", case-insensitively), and the result is drawn with Plotly Express.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'file', 'chain', 'resi', 'resn',
        'interacting_chain', 'interacting_resn' and 'distance (angstroms)'.

    Returns
    -------
    list
        The Plotly figures, one per file, in order of first appearance in ``df``.
        Each figure is also displayed via ``fig.show()`` as it is created.
    """
    # All lookup tables are loop-invariant, so they are built once.
    # Three-letter residue name -> single-letter code
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }
    # Chain naming used for files whose name contains "tub"
    tub_chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP/ADP of interest',
        'F': 'Non important',
        'G': 'Non important',
        'I': 'Magnesium ion',
        'J': 'Non important',
        'K': 'Non important',
        'L': 'Non important'
    }
    # Chain naming used for every other file
    default_chain_map = {
        'B': 'kinesin B',
        'C': 'ATP/ADP of interest',
        'D': 'Magnesium ion'
    }

    plots = []

    for file in df['file'].unique():
        df_file = df[df['file'] == file]

        # Shortest distance for each (chain A residue index, interacting chain) pair
        chain_a = df_file[df_file['chain'] == 'A']
        closest_idx = chain_a.groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()
        df_filtered = df_file.loc[closest_idx]

        # Exclude interactions with Mg and chain letters A, G, H.
        # .copy() avoids SettingWithCopyWarning on the column assignments below.
        keep = (df_filtered['interacting_resn'] != 'MG') & (~df_filtered['interacting_chain'].isin(['A', 'G', 'H']))
        df_filtered = df_filtered[keep].copy()

        # Single-letter code of the chain A residue (resn) for the point labels
        df_filtered['residue_one_letter'] = df_filtered['resn'].map(aa_dict)
        df_filtered['residue'] = df_filtered['resi'].astype(str) + df_filtered['residue_one_letter']

        # Determine which chain map to use based on the file name
        chain_map = tub_chain_map if 'tub' in file.lower() else default_chain_map

        # Series.map yields NaN for letters missing from chain_map; fall back to the
        # raw chain letter so those rows keep an identifiable label.
        df_filtered['interacting_chain'] = (
            df_filtered['interacting_chain'].map(chain_map).fillna(df_filtered['interacting_chain'])
        )

        # Create the interactive plot.
        # MG rows were removed above, so the title names what is actually plotted.
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Shortest distance from chain A residues to interacting chains for {file}")

        # Update layout for better visualization
        fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
        fig.update_layout(
            legend_title_text='Interacting Chain',
            xaxis_range=[0, 401],  # Set a fixed range for the x-axis
            yaxis_range=[0, 5.25]  # Set a fixed range for the y-axis
        )

        # Show the plot
        fig.show()

        plots.append(fig)

    return plots

# Build one scatter figure per file in df; create_plots shows each figure as it
# is created and returns them all as a list
plots = create_plots(df)


In [37]:
import pandas as pd
import plotly.express as px

# Path to the combined interactions CSV (relative path)
csv_file_path = "../../Jazzer_surf/3d_predictions/chimeras/test/interactions_all_files.csv"

# Re-read the full table so this cell starts from the unfiltered data,
# independent of how earlier cells reassigned df
df = pd.read_csv(csv_file_path)

# Function to create a plot for each file and extract sequences
def create_plots_and_extract_sequences(df):
    """Plot chain A contact distances per file and collect the contacting residues.

    For each distinct value of ``df['file']`` the rows of chain A are reduced to
    the shortest-distance row per (resi, interacting_chain) pair, MG contacts and
    contacts with chains A, G and H are dropped, and chain letters are replaced by
    descriptive names (a different table is used when the file name contains
    "tub", case-insensitively). For every interacting chain the single-letter codes
    of the contacting chain A residues are joined in ascending ``resi`` order; that
    string is printed, returned, and shown in the hover box of the scatter plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'file', 'chain', 'resi', 'resn',
        'interacting_chain', 'interacting_resn' and 'distance (angstroms)'.

    Returns
    -------
    tuple
        ``(plots, sequences)`` where ``plots`` is the list of Plotly figures (one
        per file, each also displayed via ``fig.show()``) and ``sequences`` maps
        file name -> {interacting chain label -> residue string}.
    """
    # All lookup tables are loop-invariant, so they are built once.
    # Three-letter residue name -> single-letter code
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }
    # Chain naming used for files whose name contains "tub"
    tub_chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP/ADP of interest',
        'F': 'Non important',
        'G': 'Non important',
        'I': 'Magnesium ion',
        'J': 'Non important',
        'K': 'Non important',
        'L': 'Non important'
    }
    # Chain naming used for every other file
    default_chain_map = {
        'B': 'kinesin B',
        'C': 'ATP/ADP of interest',
        'D': 'Magnesium ion'
    }

    plots = []
    sequences = {}

    for file in df['file'].unique():
        df_file = df[df['file'] == file]

        # Shortest distance for each (chain A residue index, interacting chain) pair
        chain_a = df_file[df_file['chain'] == 'A']
        closest_idx = chain_a.groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()
        df_filtered = df_file.loc[closest_idx]

        # Exclude interactions with Mg and chain letters A, G, H.
        # .copy() avoids SettingWithCopyWarning on the column assignments below.
        keep = (df_filtered['interacting_resn'] != 'MG') & (~df_filtered['interacting_chain'].isin(['A', 'G', 'H']))
        df_filtered = df_filtered[keep].copy()

        # Single-letter code of the chain A residue (resn). Names missing from aa_dict
        # would map to NaN and make ''.join() below raise TypeError, so they are
        # replaced by the '?' placeholder.
        df_filtered['residue_one_letter'] = df_filtered['resn'].map(aa_dict).fillna('?')
        df_filtered['residue'] = df_filtered['resi'].astype(str) + df_filtered['residue_one_letter']

        # Determine which chain map to use based on the file name
        chain_map = tub_chain_map if 'tub' in file.lower() else default_chain_map

        # Series.map yields NaN for letters missing from chain_map; fall back to the
        # raw chain letter so those rows keep a label and a non-empty sequence entry.
        df_filtered['interacting_chain'] = (
            df_filtered['interacting_chain'].map(chain_map).fillna(df_filtered['interacting_chain'])
        )

        # Extract sequences for each interacting chain (residues in ascending resi order)
        file_sequences = {}
        for chain in df_filtered['interacting_chain'].unique():
            chain_residues = df_filtered[df_filtered['interacting_chain'] == chain]
            sorted_residues = chain_residues.sort_values('resi')
            file_sequences[chain] = ''.join(sorted_residues['residue_one_letter'].tolist())

        sequences[file] = file_sequences

        # Per-row copy of the chain's sequence so it can be exposed to the hover box
        df_filtered['sequence'] = df_filtered['interacting_chain'].map(file_sequences)

        # Create the interactive plot.
        # custom_data supplies the values read by %{customdata[0]} and %{customdata[1]}
        # in the hovertemplate below; without it those fields are never populated.
        # MG rows were removed above, so the title names what is actually plotted.
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         custom_data=['interacting_chain', 'sequence'],
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Shortest distance from chain A residues to interacting chains for {file}")

        # Update layout for better visualization
        fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6),
                          hovertemplate='<b>Residue:</b> %{text}<br><b>Distance:</b> %{y:.2f} Å<br><b>Chain:</b> %{customdata[0]}<br><b>Sequence:</b> %{customdata[1]}')
        fig.update_layout(
            legend_title_text='Interacting Chain',
            xaxis_range=[0, 401],  # Set a fixed range for the x-axis
            yaxis_range=[0, 5.25]  # Set a fixed range for the y-axis
        )

        # Show the plot
        fig.show()

        # Print the sequences
        print(f"Sequences for {file}:")
        for chain, sequence in file_sequences.items():
            print(f"{chain}: {sequence}")
        print("\n")

        plots.append(fig)

    return plots, sequences

# Build and show one figure per file, print each file's per-chain residue strings,
# and keep both the figures and the nested {file: {chain: sequence}} dict
plots, sequences = create_plots_and_extract_sequences(df)

# Display the extracted sequences dict as the cell output
sequences


Sequences for fold_k401_atp_model_0:
ATP/ADP of interest: RRPQTSSGKTHS
kinesin B: KNLTAEWKRYEKNLKKVLEELWRQIMLS




Sequences for fold_k401_atp_tub_model_0:
ATP/ADP of interest: RRPQTSSGKTHNSSG
beta-tubulin: KKNVHEDKRNTHRDR
kinesin B: KNLAEWKRYEKNLKVLEELWRQIML
alpha-tubulin: SKVAELKNKSNSE




Sequences for fold_k401_adp_tub_model_0:
ATP/ADP of interest: RRPQTSSGKTHNH
beta-tubulin: KKNVHEKKTHRDR
kinesin B: KNLAEWKRYEKNLKVLEELWRQIMLAS
alpha-tubulin: SKVAEKNKSNS




Sequences for fold_k401_adp_model_0:
ATP/ADP of interest: RRPQTSSGKTH
kinesin B: KNLTAEWKRYEKNLKVLEELWRQIML




{'fold_k401_atp_model_0': {'ATP/ADP of interest': 'RRPQTSSGKTHS',
  'kinesin B': 'KNLTAEWKRYEKNLKKVLEELWRQIMLS'},
 'fold_k401_atp_tub_model_0': {'ATP/ADP of interest': 'RRPQTSSGKTHNSSG',
  'beta-tubulin': 'KKNVHEDKRNTHRDR',
  'kinesin B': 'KNLAEWKRYEKNLKVLEELWRQIML',
  'alpha-tubulin': 'SKVAELKNKSNSE'},
 'fold_k401_adp_tub_model_0': {'ATP/ADP of interest': 'RRPQTSSGKTHNH',
  'beta-tubulin': 'KKNVHEKKTHRDR',
  'kinesin B': 'KNLAEWKRYEKNLKVLEELWRQIMLAS',
  'alpha-tubulin': 'SKVAEKNKSNS'},
 'fold_k401_adp_model_0': {'ATP/ADP of interest': 'RRPQTSSGKTH',
  'kinesin B': 'KNLTAEWKRYEKNLKVLEELWRQIML'}}