In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

In [3]:
# Function to read and process the DataFrame
def process_dataframe(csv_file_path):
    df = pd.read_csv(csv_file_path)

    # Map interacting_resn to single-letter codes for better readability
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }

    # Amino acid classification
    aa_classification = {
        'A': 'nonpolar', 'R': 'positively charged', 'N': 'polar uncharged', 'D': 'negatively charged', 'C': 'polar uncharged', 
        'Q': 'polar uncharged', 'E': 'negatively charged', 'G': 'nonpolar', 'H': 'positively charged', 'I': 'nonpolar', 
        'L': 'nonpolar', 'K': 'positively charged', 'M': 'nonpolar', 'F': 'nonpolar', 'P': 'nonpolar', 'S': 'polar uncharged', 
        'T': 'polar uncharged', 'W': 'nonpolar', 'Y': 'polar uncharged', 'V': 'nonpolar'
    }

    # Replace chain letters with descriptive names
    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df['residue_one_letter'] = df['resn'].map(aa_dict)
    df['residue'] = df['resi'].astype(str) + df['residue_one_letter']
    df['interacting_chain'] = df['interacting_chain'].map(chain_map)
    df['aa_classification'] = df['residue_one_letter'].map(aa_classification)
    
    return df



# Function to generate plots
def create_plots(df, distance_threshold, xaxis_range=None, yaxis_range=None, interacting_chains=None, files_to_plot=None, chimeras=False):
    unique_files = sorted(df['file'].unique())  # Sort the unique files alphabetically
    plots = []

    # Define a fixed color mapping for interacting chains
    color_map = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }

    # Define a fixed order for the legend
    legend_order = ['kinesin B', 'alpha-tubulin', 'beta-tubulin', 'ATP']

    # Define marker shapes for amino acid classifications
    shape_map = {
        'nonpolar': 'circle',
        'polar uncharged': 'square',
        'positively charged': 'triangle-up',
        'negatively charged': 'triangle-down'
    }

    # Filter the files to plot if specified
    if files_to_plot:
        unique_files = [file for file in unique_files if any(file.startswith(prefix) for prefix in files_to_plot)]

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Filter to get the shortest distances for each interaction
        df_filtered = df_file.loc[df_file[df_file['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]

        # Filter by distance threshold
        df_filtered = df_filtered[df_filtered['distance (angstroms)'] <= distance_threshold]


        # Filter by interacting chains if specified
        if interacting_chains:
            df_filtered = df_filtered[df_filtered['interacting_chain'].isin(interacting_chains)]

        # Create the interactive plot with consistent color mapping and shape mapping
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         symbol='aa_classification', symbol_map=shape_map,
                         color_discrete_map=color_map,
                         category_orders={'interacting_chain': legend_order},
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Interactions for {file}",
                         hover_data=['interacting_resinumber', 'interactingresi_oneletter'])

        # Update layout for better visualization
        fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
        fig.update_layout(
            legend_title_text='Interacting Chain',
            yaxis_range=yaxis_range if yaxis_range else [0.1, distance_threshold + 0.5]  # Set y-axis range based on input or default
        )

        # Set x-axis range if specified
        if xaxis_range:
            fig.update_layout(xaxis_range=xaxis_range)

        # Add more ticks to the x-axis
        fig.update_xaxes(dtick=20)

        # Add background color shapes based on the first letter of the file name if chimeras is True
        if chimeras:
            first_letter = file[0]
            y_max = yaxis_range[1] if yaxis_range else distance_threshold + 0.5
            if first_letter == 'A':
                fig.add_shape(type="rect", x0=0, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'B':
                fig.add_shape(type="rect", x0=0, x1=174, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=174, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'C':
                fig.add_shape(type="rect", x0=0, x1=178, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=178, x1=333, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=333, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'D':
                fig.add_shape(type="rect", x0=0, x1=329, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=329, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'E':
                fig.add_shape(type="rect", x0=0, x1=333, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=333, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'F':
                fig.add_shape(type="rect", x0=0, x1=174, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=174, x1=329, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=329, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'G':
                fig.add_shape(type="rect", x0=0, x1=178, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=178, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'H':
                fig.add_shape(type="rect", x0=0, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)

        # Define plot size
        fig.update_layout(autosize=False, width=1600, height=600)

        # Show the plot
        fig.show()

        plots.append(fig)

    return plots

In [4]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/chimeras_5seeds/interactions_chimeras.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 4

# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 600]  # Example x-axis range
yaxis_range = [1.5, distance_threshold + 0.5]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

files_to_plot =  ['A', 'H']  # Example files to plot

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot, chimeras=True)


In [5]:
def process_and_save_interactions(input_csv_path, output_csv_path):
    df = pd.read_csv(input_csv_path)

    df = df.loc[df['chain'] == 'A']

    # Select the shortest distance interaction for each resi and interacting_resinumber
    df = df.loc[df.groupby(['file', 'resi', 'interacting_resinumber'])['distance (angstroms)'].idxmin()]

    df['seed'] = df['file'].str.extract(r'_seed(\d+)')[0].str.upper()
    df['file'] = df['file'].str.replace(r'_seed\d+_model_0', '', regex=True).str.strip()

    # Create a column with a tuple that includes resi and interacting_resinumber for each unique
    df['resi_interacting_resinumber'] = df[['resi', 'interacting_resinumber']].apply(lambda x: (x['resi'], x['interacting_resinumber']), axis=1)

    # Process each unique file separately
    unique_files = df['file'].unique()
    dfs = []

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Find the common interactions across all seeds of the current file
        common_interactions = df_file.groupby('seed')['resi_interacting_resinumber'].apply(set).reset_index()
        common_interactions = set.intersection(*common_interactions['resi_interacting_resinumber'])

        # Filter the DataFrame to keep only the common interactions
        df_file = df_file[df_file['resi_interacting_resinumber'].isin(common_interactions)]

        # Compute the average distance for each interacting pair across the seeds
        df_file = df_file.groupby(['file', 'chain', 'resi', 'resn', 'interacting_chain', 'interacting_resn', 'interacting_resinumber', 'residue_one_letter', 'interactingresi_oneletter'])['distance (angstroms)'].mean().reset_index()

        dfs.append(df_file)

    # Merge all the DataFrames
    df_final = pd.concat(dfs, ignore_index=True)

    # Save the final DataFrame
    df_final.to_csv(output_csv_path, index=False)

    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df_final['interacting_chain'] = df_final['interacting_chain'].map(chain_map)
    
    return df_final

  

# Example usage
csv_file_path = "../../data/3d_predictions/chimeras_5seeds/interactions_chimeras.csv"
output_csv_path = "../../data/3d_predictions/chimeras_5seeds/average_interactions_chimeras.csv"
df_average = process_and_save_interactions(csv_file_path, output_csv_path)
df_average



Unnamed: 0,file,chain,resi,resn,interacting_chain,interacting_resn,interacting_resinumber,residue_one_letter,interactingresi_oneletter,distance (angstroms)
0,A,A,18,ARG,ATP,ATP,1,R,?,3.804740
1,A,A,19,PHE,ATP,ATP,1,F,?,4.696334
2,A,A,20,ARG,ATP,ATP,1,R,?,3.641895
3,A,A,21,PRO,ATP,ATP,1,P,?,3.541921
4,A,A,23,ASN,ATP,ATP,1,N,?,5.803859
...,...,...,...,...,...,...,...,...,...,...
4732,H,A,579,VAL,kinesin B,ARG,566,V,R,4.842717
4733,H,A,579,VAL,kinesin B,LEU,563,V,L,4.625987
4734,H,A,580,PRO,kinesin B,ARG,566,P,R,4.409354
4735,H,A,580,PRO,kinesin B,GLN,559,P,Q,5.759448


In [9]:
unique_files = df_average['file'].unique()

color_map = {
    'ATP': 'red',
    'alpha-tubulin': 'orange',
    'beta-tubulin': 'green',
    'kinesin B': 'lightblue'
}

distance_threshold = 4.5

for file in unique_files:
    print(f"File: {file}")
    interacting_chains = df_average[df_average['file'] == file]['interacting_chain'].unique()
    
    if file == 'A':
        print("color blue, chain A")
        print("color blue, chain B")
    elif file == 'B':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 1-175")
    elif file == 'C':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 180-334")
    elif file == 'D':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 1-330")
    elif file == 'E':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 335-596")
    elif file == 'F':
        print("color yelloworange, chain A")
        print("color yelloworange, chain B")
        print("color blue, chain A and resi 176-330")
    elif file == 'G':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 180-596")
    elif file == 'H':
        print("color yelloworange, chain A")
        print("color yelloworange, chain B")
    
    print("color gray, chain C")
    print("color gray10, chain D")
    print("color magenta, resn ATP+ADP+GTP+GDP")
    print("color magenta, elem MG")
    for chain in interacting_chains:
        closest_interactions = df_average[(df_average['file'] == file) & (df_average['interacting_chain'] == chain) & (df_average['distance (angstroms)'] <= distance_threshold)]
        closest_interactions = closest_interactions.loc[closest_interactions.groupby('resi')['distance (angstroms)'].idxmin()]
        
        resi_positions = closest_interactions['resi'].unique()
        interacting_resi_positions = closest_interactions['interacting_resinumber'].unique()
        
        resi_positions_str = '+'.join(map(str, resi_positions))
        interacting_resi_positions_str = '+'.join(map(str, interacting_resi_positions))
        
        color = color_map.get(chain)
        if color and resi_positions_str:
            print(f"color {color}, chain A and resi {resi_positions_str}")
            print(f"show sticks, chain A and resi {resi_positions_str}")
            # print(f"show surface, chain A and resi {resi_positions_str}")
        
        if chain == 'alpha-tubulin' and interacting_resi_positions_str:
            print(f"show cartoon, chain C and resi {interacting_resi_positions_str}")
        elif chain == 'beta-tubulin' and interacting_resi_positions_str:
            print(f"show cartoon, chain D and resi {interacting_resi_positions_str}")

File: A
color blue, chain A
color blue, chain B
color gray, chain C
color gray10, chain D
color magenta, resn ATP+ADP+GTP+GDP
color magenta, elem MG
color red, chain A and resi 18+20+21+61+93+94+95+96+97+98+100+205+207+208+239+240+241
show sticks, chain A and resi 18+20+21+61+93+94+95+96+97+98+100+205+207+208+239+240+241
color orange, chain A and resi 242+243+244+245+250+251+255+259+262+263+266+270+318+319
show sticks, chain A and resi 242+243+244+245+250+251+255+259+262+263+266+270+318+319
show cartoon, chain C and resi 414+420+412+112+108+109+409+415+402
color green, chain A and resi 148+159+162+163+164+165+166+168+281+282+286+287+329
show sticks, chain A and resi 148+159+162+163+164+165+166+168+281+282+286+287+329
show cartoon, chain D and resi 157+410+413+406+412+420+424+417+262+436
color lightblue, chain A and resi 166+167+343+344+345+346+348+349+351+352+355+356+358+359+362+363+365+366+369+370+372+373+374+376+377+387+388+390+393
show sticks, chain A and resi 166+167+343+344+345+34

In [13]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/chimeras_5seeds/average_interactions_chimeras.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 4.5
# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 600]  # Example x-axis range
yaxis_range = [1.5, distance_threshold + 0.5]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

files_to_plot = None#['A', 'H']  # Example files to plot

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot, chimeras=True)


In [None]:
# def process_and_save_interactions(input_csv_path, output_csv_path):
#     df = pd.read_csv(input_csv_path)

#     df = df.loc[df['chain'] == 'A']

#     # Select the shortest distance interaction for each resi and interacting_resinumber
#     df = df.loc[df.groupby(['file', 'resi', 'interacting_resinumber'])['distance (angstroms)'].idxmin()]

#     df['seed'] = df['file'].str.extract(r'_seed(\d+)')[0].str.upper()
#     df['file'] = df['file'].str.replace(r'_seed\d+_model_0', '', regex=True).str.strip()

#     # Create a column with a tuple that includes resi and interacting_resinumber for each unique
#     df['resi_interacting_resinumber'] = df[['resi', 'interacting_resinumber']].apply(lambda x: (x['resi'], x['interacting_resinumber']), axis=1)

#     # Process each unique file separately
#     unique_files = df['file'].unique()
#     dfs = []

#     for file in unique_files:
#         df_file = df[df['file'] == file]

#         # Find the common interactions across all seeds of the current file
#         common_interactions = df_file.groupby('seed')['resi_interacting_resinumber'].apply(set).reset_index()
#         common_interactions = set.intersection(*common_interactions['resi_interacting_resinumber'])

#         # Filter the DataFrame to keep only the common interactions
#         df_file = df_file[df_file['resi_interacting_resinumber'].isin(common_interactions)]

#         # Compute the average distance for each interacting pair across the seeds
#         df_file = df_file.groupby(['file', 'chain', 'resi', 'resn', 'interacting_chain', 'interacting_resn', 'interacting_resinumber', 'residue_one_letter', 'interactingresi_oneletter'])['distance (angstroms)'].mean().reset_index()

#         dfs.append(df_file)

#     # Merge all the DataFrames
#     df_final = pd.concat(dfs, ignore_index=True)

#     # Now, we keep only those residues that interact with two or more unique chains
#     chain_counts = df_final.groupby('resi')['interacting_resinumber'].nunique()
#     dual_interactions = chain_counts[chain_counts >= 2].index

#     # Filter the DataFrame to keep only residues with dual interactions
#     df_final = df_final[df_final['resi'].isin(dual_interactions)]

#     # Additionally, ensure that the final df only includes interactions with two or more chains
#     df_final = df_final.groupby('resi').filter(lambda x: x['interacting_chain'].nunique() >= 2)

#     # Save the final DataFrame
#     df_final.to_csv(output_csv_path, index=False)

#     # Optional: Mapping chain names
#     chain_map = {
#         'B': 'kinesin B',
#         'C': 'alpha-tubulin',
#         'D': 'beta-tubulin',
#         'E': 'ATP',
#         'F': 'ATP'
#     }

#     df_final['interacting_chain'] = df_final['interacting_chain'].map(chain_map)
    
#     return df_final

# # Example usage
# csv_file_path = "../../data/3d_predictions/chimeras_5seeds/interactions_chimeras.csv"
# output_csv_path = "../../data/3d_predictions/chimeras_5seeds/average_dual_interactions_chimeras.csv"
# df_dual_interactions = process_and_save_interactions(csv_file_path, output_csv_path)
# df_dual_interactions


In [None]:
# # Define the path to the CSV file
# csv_file_path = "../../data/3d_predictions/chimeras_5seeds/average_dual_interactions_chimeras.csv"

# # Process the DataFrame
# df_processed = process_dataframe(csv_file_path)

# # Define the threshold value for distance filtering
# distance_threshold = 7
# # Create and display the plots with specified x-axis range, interacting chains, and files to plot
# xaxis_range = [0, 600]  # Example x-axis range
# yaxis_range = [1.5, distance_threshold + 0.5]
# interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

# files_to_plot = None #['A', 'C', 'H']  # Example files to plot

# plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot, chimeras=True)


In [1]:
import plotly.express as px
from ipywidgets import interact, FloatSlider, IntRangeSlider
import numpy as np


# Function to update the plot based on the distance threshold and resi range
def update_plot(distance_threshold, resi_range):
    

    # Define marker_symbols and color_map
    marker_symbols = {
        'ATP': 'circle',
        'beta-tubulin': 'square',
        'alpha-tubulin': 'diamond',
        'kinesin B': 'cross'
    }

    color_map = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }

    df_filtered = df_average[(df_average['distance (angstroms)'] <= distance_threshold) & 
                             (df_average['resi'] >= resi_range[0]) & 
                             (df_average['resi'] <= resi_range[1])]
    df_avg_distance = df_filtered.groupby(['file', 'interacting_chain']).agg({
        'distance (angstroms)': 'mean',
        'resi': 'count',
        'interactingresi_oneletter': lambda x: ', '.join(sorted(set(x)))
    }).reset_index()
    df_avg_distance = df_avg_distance.rename(columns={'resi': 'residues'})

    # Sort the DataFrame by 'file' column to ensure alphabetical order
    df_avg_distance = df_avg_distance.sort_values('file')

    # Add jitter to x coordinates only
    np.random.seed(42)  # Set a seed for consistency
    jitter_x = np.random.uniform(-0.1, 0.2, size=len(df_avg_distance))
    df_avg_distance['x_jittered'] = df_avg_distance['file'].astype('category').cat.codes + jitter_x

    fig = px.scatter(df_avg_distance, x='x_jittered', y='distance (angstroms)', 
                     color='interacting_chain', symbol='interacting_chain',
                     symbol_map=marker_symbols, color_discrete_map=color_map,
                     category_orders={"file": sorted(df_avg_distance['file'].unique())},
                     hover_data=['residues', 'file', 'distance (angstroms)', 'interactingresi_oneletter'],
                     text='residues')  # Add 'residues' as text

    fig.update_traces(marker=dict(size=10),
                      textposition='middle right')  # Position the text to the right of the markers
    fig.update_layout(title='Average Distance by File and Interacting Chain',
                      xaxis_title='File',
                      yaxis_title='Average Distance (Angstroms)',
                      height=800,  # Increased height to make the plot taller
                      xaxis=dict(tickmode='array', tickvals=list(range(len(df_avg_distance['file'].unique()))), ticktext=sorted(df_avg_distance['file'].unique())),
                    #   yaxis=dict(range=[3, 4])
                      )  
    return fig

# Create interactive sliders
interact(update_plot, 
         distance_threshold=FloatSlider(value=4.2, min=0, max=10, step=0.01, description='Distance Threshold'),
         resi_range=IntRangeSlider(value=[1, 600], min=1, max=600, step=1, description='Residue Range'))


interactive(children=(FloatSlider(value=4.2, description='Distance Threshold', max=10.0, step=0.01), IntRangeS…

<function __main__.update_plot(distance_threshold, resi_range)>

- from 318 to 600, B and F don't have interaction with alpha tubulin
- from 1-141, F is the first to show interaction with alpha tubulin
- from 329 to 401 you can see how A has the most interactions with itself, and is most flexible than B, C and D. Here E and G are farther away. HF and G are about the same as A but less interactions
- if set to largest distance (7A), you can see how B and F have the least interactions with beta tubulin, which makes them interact closer.

## Now let's take a look at species seeds

In [None]:
# Function to generate plots without background color shapes
def create_plots(df, distance_threshold, xaxis_range=None, yaxis_range=None, interacting_chains=None, files_to_plot=None):
    unique_files = sorted(df['file'].unique())  # Sort the unique files alphabetically
    plots = []

    # Define a fixed color mapping for interacting chains
    color_map = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }

    # Define a fixed order for the legend
    legend_order = ['kinesin B', 'alpha-tubulin', 'beta-tubulin', 'ATP']

    # Define marker shapes for amino acid classifications
    shape_map = {
        'nonpolar': 'circle',
        'polar uncharged': 'square',
        'positively charged': 'triangle-up',
        'negatively charged': 'triangle-down'
    }

    # Filter the files to plot if specified
    if files_to_plot:
        unique_files = [file for file in unique_files if any(file.startswith(prefix) for prefix in files_to_plot)]

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Filter to get the shortest distances for each interaction
        df_filtered = df_file.loc[df_file[df_file['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]

        # Filter by distance threshold
        df_filtered = df_filtered[df_filtered['distance (angstroms)'] <= distance_threshold]

        # Filter by interacting chains if specified
        if interacting_chains:
            df_filtered = df_filtered[df_filtered['interacting_chain'].isin(interacting_chains)]

        # Create the interactive plot with consistent color mapping and shape mapping
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         symbol='aa_classification', symbol_map=shape_map,
                         color_discrete_map=color_map,
                         category_orders={'interacting_chain': legend_order},
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Interactions for {file}")

        # Update layout for better visualization
        fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
        fig.update_layout(
            legend_title_text='Interacting Chain',
            yaxis_range=yaxis_range if yaxis_range else [0.1, distance_threshold + 0.5]  # Set y-axis range based on input or default
        )

        # Set x-axis range if specified
        if xaxis_range:
            fig.update_layout(xaxis_range=xaxis_range)

        # Add more ticks to the x-axis
        fig.update_xaxes(dtick=20)

        # Define plot size
        fig.update_layout(autosize=False, width=1600, height=600)

        # Show the plot
        fig.show()

        plots.append(fig)

    return plots

In [None]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/species_seeds/interactions_chimeras.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 4.2

# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 600]  # Example x-axis range
yaxis_range = [1.5, distance_threshold + 0.5]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

files_to_plot =  None#['A', 'H']  # Example files to plot

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot)


In [4]:
def process_and_save_interactions(input_csv_path, output_csv_path):
    df = pd.read_csv(input_csv_path)

    df = df.loc[df['chain'] == 'A']

    # Select the shortest distance interaction for each resi and interacting_resinumber
    df = df.loc[df.groupby(['file', 'resi', 'interacting_resinumber'])['distance (angstroms)'].idxmin()]

    df['seed'] = df['file'].str.extract(r'_seed(\d+)')[0].str.upper()
    df['file'] = df['file'].str.replace(r'_seed\d+_model_0', '', regex=True).str.strip()

    # Create a column with a tuple that includes resi and interacting_resinumber for each unique
    df['resi_interacting_resinumber'] = df[['resi', 'interacting_resinumber']].apply(lambda x: (x['resi'], x['interacting_resinumber']), axis=1)

    # Process each unique file separately
    unique_files = df['file'].unique()
    dfs = []

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Find the common interactions across all seeds of the current file
        common_interactions = df_file.groupby('seed')['resi_interacting_resinumber'].apply(set).reset_index()
        common_interactions = set.intersection(*common_interactions['resi_interacting_resinumber'])

        # Filter the DataFrame to keep only the common interactions
        df_file = df_file[df_file['resi_interacting_resinumber'].isin(common_interactions)]

        # Compute the average distance for each interacting pair across the seeds
        df_file = df_file.groupby(['file', 'chain', 'resi', 'resn', 'interacting_chain', 'interacting_resn', 'interacting_resinumber', 'residue_one_letter', 'interactingresi_oneletter'])['distance (angstroms)'].mean().reset_index()

        dfs.append(df_file)

    # Merge all the DataFrames
    df_final = pd.concat(dfs, ignore_index=True)

    # Save the final DataFrame
    df_final.to_csv(output_csv_path, index=False)

    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df_final['interacting_chain'] = df_final['interacting_chain'].map(chain_map)
    
    return df_final

  

# Example usage
csv_file_path = "../../data/3d_predictions/species_seeds/interactions_chimeras.csv"
output_csv_path = "../../data/3d_predictions/species_seeds/average_interactions_species.csv"
df_average = process_and_save_interactions(csv_file_path, output_csv_path)
df_average

Unnamed: 0,file,chain,resi,resn,interacting_chain,interacting_resn,interacting_resinumber,residue_one_letter,interactingresi_oneletter,distance (angstroms)
0,Acsu2,A,9,ARG,ATP,ATP,1,R,?,3.833171
1,Acsu2,A,10,PHE,ATP,ATP,1,F,?,4.914591
2,Acsu2,A,11,ARG,ATP,ATP,1,R,?,3.733138
3,Acsu2,A,12,PRO,ATP,ATP,1,P,?,3.663855
4,Acsu2,A,14,ASN,ATP,ATP,1,N,?,5.810327
...,...,...,...,...,...,...,...,...,...,...
6047,Tila,A,505,ILE,kinesin B,LEU,501,I,L,2.817382
6048,Tila,A,505,ILE,kinesin B,LYS,504,I,K,6.127687
6049,Tila,A,508,ASP,kinesin B,ILE,505,D,I,5.939484
6050,Tila,A,508,ASP,kinesin B,TYR,509,D,Y,3.978279


In [14]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/species_seeds/average_interactions_species.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 4.5

# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 600]  # Example x-axis range
yaxis_range = [1.5, distance_threshold + 0.5]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

files_to_plot = None#['Acsu2', 'Acsu', 'Heal'] # Example files to plot 

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot)
