In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [10]:
# Function to read and process the DataFrame
def process_dataframe(csv_file_path):
    df = pd.read_csv(csv_file_path)

    # Map interacting_resn to single-letter codes for better readability
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }

    # Amino acid classification
    aa_classification = {
        'A': 'nonpolar', 'R': 'positively charged', 'N': 'polar uncharged', 'D': 'negatively charged', 'C': 'polar uncharged', 
        'Q': 'polar uncharged', 'E': 'negatively charged', 'G': 'nonpolar', 'H': 'positively charged', 'I': 'nonpolar', 
        'L': 'nonpolar', 'K': 'positively charged', 'M': 'nonpolar', 'F': 'nonpolar', 'P': 'nonpolar', 'S': 'polar uncharged', 
        'T': 'polar uncharged', 'W': 'nonpolar', 'Y': 'polar uncharged', 'V': 'nonpolar'
    }

    # Replace chain letters with descriptive names
    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df['residue_one_letter'] = df['resn'].map(aa_dict)
    df['residue'] = df['resi'].astype(str) + df['residue_one_letter']
    df['interacting_chain'] = df['interacting_chain'].map(chain_map)
    df['aa_classification'] = df['residue_one_letter'].map(aa_classification)
    
    return df



# Function to generate plots
def create_plots(df, distance_threshold, xaxis_range=None, yaxis_range=None, interacting_chains=None, files_to_plot=None, chimeras=False):
    unique_files = sorted(df['file'].unique())  # Sort the unique files alphabetically
    plots = []

    # Define a fixed color mapping for interacting chains
    color_map = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }

    # Define a fixed order for the legend
    legend_order = ['kinesin B', 'alpha-tubulin', 'beta-tubulin', 'ATP']

    # Define marker shapes for amino acid classifications
    shape_map = {
        'nonpolar': 'circle',
        'polar uncharged': 'square',
        'positively charged': 'triangle-up',
        'negatively charged': 'triangle-down'
    }

    # Filter the files to plot if specified
    if files_to_plot:
        unique_files = [file for file in unique_files if any(file.startswith(prefix) for prefix in files_to_plot)]

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Filter to get the shortest distances for each interaction
        df_filtered = df_file.loc[df_file[df_file['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]

        # Filter by distance threshold
        df_filtered = df_filtered[df_filtered['distance (angstroms)'] <= distance_threshold]


        # Filter by interacting chains if specified
        if interacting_chains:
            df_filtered = df_filtered[df_filtered['interacting_chain'].isin(interacting_chains)]

        # Create the interactive plot with consistent color mapping and shape mapping
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         symbol='aa_classification', symbol_map=shape_map,
                         color_discrete_map=color_map,
                         category_orders={'interacting_chain': legend_order},
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Interactions for {file}",
                         hover_data=['interacting_resinumber', 'interactingresi_oneletter'])

        # Update layout for better visualization
        fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
        fig.update_layout(
            legend_title_text='Interacting Chain',
            yaxis_range=yaxis_range if yaxis_range else [0.1, distance_threshold + 0.5]  # Set y-axis range based on input or default
        )

        # Set x-axis range if specified
        if xaxis_range:
            fig.update_layout(xaxis_range=xaxis_range)

        # Add more ticks to the x-axis
        fig.update_xaxes(dtick=20)

        # Add background color shapes based on the first letter of the file name if chimeras is True
        if chimeras:
            first_letter = file[0]
            y_max = yaxis_range[1] if yaxis_range else distance_threshold + 0.5
            if first_letter == 'A':
                fig.add_shape(type="rect", x0=0, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'B':
                fig.add_shape(type="rect", x0=0, x1=174, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=174, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'C':
                fig.add_shape(type="rect", x0=0, x1=178, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=178, x1=333, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=333, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'D':
                fig.add_shape(type="rect", x0=0, x1=329, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=329, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'E':
                fig.add_shape(type="rect", x0=0, x1=333, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=333, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'F':
                fig.add_shape(type="rect", x0=0, x1=174, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=174, x1=329, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=329, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'G':
                fig.add_shape(type="rect", x0=0, x1=178, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=178, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'H':
                fig.add_shape(type="rect", x0=0, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)

        # Define plot size
        fig.update_layout(autosize=False, width=1600, height=600)

        # Show the plot
        fig.show()

        plots.append(fig)

    return plots

In [11]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/chimeras_5seeds/interactions_chimeras.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 7

# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 600]  # Example x-axis range
yaxis_range = [1.5, distance_threshold + 0.5]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

files_to_plot =  ['A', 'H']  # Example files to plot

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot, chimeras=True)


In [12]:
def process_and_save_interactions(input_csv_path, output_csv_path):
    df = pd.read_csv(input_csv_path)

    df = df.loc[df['chain'] == 'A']

    # Select the shortest distance interaction for each resi and interacting_resinumber
    df = df.loc[df.groupby(['file', 'resi', 'interacting_resinumber'])['distance (angstroms)'].idxmin()]

    df['seed'] = df['file'].str.extract(r'_seed(\d+)')[0].str.upper()
    df['file'] = df['file'].str.replace(r'_seed\d+_model_0', '', regex=True).str.strip()

    # Create a column with a tuple that includes resi and interacting_resinumber for each unique
    df['resi_interacting_resinumber'] = df[['resi', 'interacting_resinumber']].apply(lambda x: (x['resi'], x['interacting_resinumber']), axis=1)

    # Process each unique file separately
    unique_files = df['file'].unique()
    dfs = []

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Find the common interactions across all seeds of the current file
        common_interactions = df_file.groupby('seed')['resi_interacting_resinumber'].apply(set).reset_index()
        common_interactions = set.intersection(*common_interactions['resi_interacting_resinumber'])

        # Filter the DataFrame to keep only the common interactions
        df_file = df_file[df_file['resi_interacting_resinumber'].isin(common_interactions)]

        # Compute the average distance for each interacting pair across the seeds
        df_file = df_file.groupby(['file', 'chain', 'resi', 'resn', 'interacting_chain', 'interacting_resn', 'interacting_resinumber', 'residue_one_letter', 'interactingresi_oneletter'])['distance (angstroms)'].mean().reset_index()

        dfs.append(df_file)

    # Merge all the DataFrames
    df_final = pd.concat(dfs, ignore_index=True)

    # Save the final DataFrame
    df_final.to_csv(output_csv_path, index=False)

    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df_final['interacting_chain'] = df_final['interacting_chain'].map(chain_map)
    
    return df_final

  

# Example usage
csv_file_path = "../../data/3d_predictions/chimeras_5seeds/interactions_chimeras.csv"
output_csv_path = "../../data/3d_predictions/chimeras_5seeds/average_interactions_chimeras.csv"
df_average = process_and_save_interactions(csv_file_path, output_csv_path)
df_average



Unnamed: 0,file,chain,resi,resn,interacting_chain,interacting_resn,interacting_resinumber,residue_one_letter,interactingresi_oneletter,distance (angstroms)
0,A,A,18,ARG,ATP,ATP,1,R,?,3.804740
1,A,A,19,PHE,ATP,ATP,1,F,?,4.696334
2,A,A,20,ARG,ATP,ATP,1,R,?,3.641895
3,A,A,21,PRO,ATP,ATP,1,P,?,3.541921
4,A,A,23,ASN,ATP,ATP,1,N,?,5.803859
...,...,...,...,...,...,...,...,...,...,...
4732,H,A,579,VAL,kinesin B,ARG,566,V,R,4.842717
4733,H,A,579,VAL,kinesin B,LEU,563,V,L,4.625987
4734,H,A,580,PRO,kinesin B,ARG,566,P,R,4.409354
4735,H,A,580,PRO,kinesin B,GLN,559,P,Q,5.759448


In [13]:
unique_files = df_average['file'].unique()

color_map = {
    'ATP': 'red',
    'alpha-tubulin': 'orange',
    'beta-tubulin': 'green',
    'kinesin B': 'lightblue'
}

distance_threshold = 6.5

for file in unique_files:
    print(f"File: {file}")
    interacting_chains = df_average[df_average['file'] == file]['interacting_chain'].unique()
    
    if file == 'A':
        print("color blue, chain A")
        print("color blue, chain B")
    elif file == 'B':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 1-175")
    elif file == 'C':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 180-334")
    elif file == 'D':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 1-330")
    elif file == 'E':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 335-596")
    elif file == 'F':
        print("color yelloworange, chain A")
        print("color yelloworange, chain B")
        print("color blue, chain A and resi 176-330")
    elif file == 'G':
        print("color blue, chain A")
        print("color blue, chain B")
        print("color yelloworange, chain A and resi 180-596")
    elif file == 'H':
        print("color yelloworange, chain A")
        print("color yelloworange, chain B")
    
    print("color gray, chain C")
    print("color gray10, chain D")
    print("color magenta, resn ATP+ADP+GTP+GDP")
    print("color magenta, elem MG")
    for chain in interacting_chains:
        closest_interactions = df_average[(df_average['file'] == file) & (df_average['interacting_chain'] == chain) & (df_average['distance (angstroms)'] <= distance_threshold)]
        closest_interactions = closest_interactions.loc[closest_interactions.groupby('resi')['distance (angstroms)'].idxmin()]
        
        resi_positions = closest_interactions['resi'].unique()
        interacting_resi_positions = closest_interactions['interacting_resinumber'].unique()
        
        resi_positions_str = '+'.join(map(str, resi_positions))
        interacting_resi_positions_str = '+'.join(map(str, interacting_resi_positions))
        
        color = color_map.get(chain)
        if color and resi_positions_str:
            print(f"color {color}, chain A and resi {resi_positions_str}")
            print(f"show sticks, chain A and resi {resi_positions_str}")
            # print(f"show surface, chain A and resi {resi_positions_str}")
        
        if chain == 'alpha-tubulin' and interacting_resi_positions_str:
            print(f"show cartoon, chain C and resi {interacting_resi_positions_str}")
        elif chain == 'beta-tubulin' and interacting_resi_positions_str:
            print(f"show cartoon, chain D and resi {interacting_resi_positions_str}")



File: A
color blue, chain A
color blue, chain B
color gray, chain C
color gray10, chain D
color magenta, resn ATP+ADP+GTP+GDP
color magenta, elem MG
color red, chain A and resi 18+19+20+21+23+60+61+63+65+92+93+94+95+96+97+98+100+101+202+204+205+206+207+208+210+239+240+241+242+243+309
show sticks, chain A and resi 18+19+20+21+23+60+61+63+65+92+93+94+95+96+97+98+100+101+202+204+205+206+207+208+210+239+240+241+242+243+309
color orange, chain A and resi 91+242+243+244+245+246+247+249+250+251+255+256+258+259+260+261+262+263+264+265+266+267+269+270+273+318+319+321+322+329
show sticks, chain A and resi 91+242+243+244+245+246+247+249+250+251+255+256+258+259+260+261+262+263+264+265+266+267+269+270+273+318+319+321+322+329
show cartoon, chain C and resi 414+420+412+108+112+109+409+415+402+423+416
color green, chain A and resi 148+158+159+162+163+164+165+166+167+168+171+173+263+274+279+280+281+282+283+284+286+287+289+292+295+328+329
show sticks, chain A and resi 148+158+159+162+163+164+165+166+167

In [14]:
# Function to generate plots
def create_plots(df, distance_threshold, xaxis_range=None, yaxis_range=None, interacting_chains=None, files_to_plot=None, chimeras=False):
    unique_files = sorted(df['file'].unique())  # Sort the unique files alphabetically
    plots = []

    # Define a fixed color mapping for interacting chains
    color_map = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }

    # Define a fixed order for the legend
    legend_order = ['kinesin B', 'alpha-tubulin', 'beta-tubulin', 'ATP']

    # Define marker shapes for amino acid classifications
    shape_map = {
        'nonpolar': 'circle',
        'polar uncharged': 'square',
        'positively charged': 'triangle-up',
        'negatively charged': 'triangle-down'
    }

    # Filter the files to plot if specified
    if files_to_plot:
        unique_files = [file for file in unique_files if any(file.startswith(prefix) for prefix in files_to_plot)]

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Filter to get the shortest distances for each interaction
        df_filtered = df_file.loc[df_file[df_file['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]

        # Filter by distance threshold
        df_filtered = df_filtered[df_filtered['distance (angstroms)'] <= distance_threshold]

        # Filter by interacting chains if specified
        if interacting_chains:
            df_filtered = df_filtered[df_filtered['interacting_chain'].isin(interacting_chains)]

        # Create the interactive plot with consistent color mapping and shape mapping
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         symbol='aa_classification', symbol_map=shape_map,
                         color_discrete_map=color_map,
                         category_orders={'interacting_chain': legend_order},
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Interactions for {file}",
                         hover_data=['interacting_resinumber', 'interactingresi_oneletter'])

        # Update layout for better visualization
        fig.update_traces(textposition='middle left', marker=dict(size=15, opacity=0.6), textfont=dict(size=18))  # Increase text font size
        fig.update_layout(
            legend_title_text='Interacting Chain',
            yaxis_range=yaxis_range if yaxis_range else [0.1, distance_threshold + 0.5],  # Set y-axis range based on input or default
            xaxis=dict(tickmode='linear', tick0=0, dtick=50, title_font=dict(size=24), tickfont=dict(size=18)),  # Set x-axis ticks every 50 residues and increase label size
            yaxis=dict(title_font=dict(size=24), tickfont=dict(size=18))  # Increase y-axis label size and tick label size
        )

        # Set x-axis range if specified
        if xaxis_range:
            fig.update_layout(xaxis_range=xaxis_range)

        # Add background color shapes based on the first letter of the file name if chimeras is True
        if chimeras:
            first_letter = file[0]
            y_max = yaxis_range[1] if yaxis_range else distance_threshold + 0.5
            if first_letter == 'A':
                fig.add_shape(type="rect", x0=0, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'B':
                fig.add_shape(type="rect", x0=0, x1=174, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=174, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'C':
                fig.add_shape(type="rect", x0=0, x1=178, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=178, x1=333, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=333, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'D':
                fig.add_shape(type="rect", x0=0, x1=329, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=329, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'E':
                fig.add_shape(type="rect", x0=0, x1=333, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=333, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'F':
                fig.add_shape(type="rect", x0=0, x1=174, y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=174, x1=329, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=329, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'G':
                fig.add_shape(type="rect", x0=0, x1=178, y0=0, y1=y_max,
                              fillcolor="LightSkyBlue", opacity=0.3, layer="below", line_width=0)
                fig.add_shape(type="rect", x0=178, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)
            elif first_letter == 'H':
                fig.add_shape(type="rect", x0=0, x1=xaxis_range[1] if xaxis_range else df['resi'].max(), y0=0, y1=y_max,
                              fillcolor="LightSalmon", opacity=0.3, layer="below", line_width=0)

        # Define plot size
        fig.update_layout(autosize=False, width=1400, height=600)

        # Show the plot
        fig.show()

        plots.append(fig)

    return plots

In [15]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/chimeras_5seeds/average_interactions_chimeras.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 5.5
# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [-2, 405]  # Example x-axis range
yaxis_range = [2.25, distance_threshold + 0.25]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

files_to_plot = ['A',  'H']  # Example files to plot

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot, chimeras=True)


In [16]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/chimeras_5seeds/average_interactions_chimeras.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 5.5
# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 340]  # Example x-axis range
yaxis_range = [2.25, distance_threshold + 0.25]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', ]#'kinesin B']  # Example interacting chains to display

files_to_plot = None #['A', 'H']  # Example files to plot

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot, chimeras=True)


Let's explore the aminoacid differences for our chimeras. Let's do it by
interacting partner.

-ATP, 1/3
The ony relevant difference seems to be that in A there is an H at position 93,
which gets substituded by an F in H (positively charged to nonpolar). Also in A there is an S in position 95,
which get's replaced by a G in H (polar uncharged to nonpolar). The latter is
placed in the very p-loop domain.

-ATP, 2/3 
The only difference is that H in position 207 in A that gets replaced by an E in H (positively charged to negatively charged).

In [9]:
# def process_and_save_interactions(input_csv_path, output_csv_path):
#     df = pd.read_csv(input_csv_path)

#     df = df.loc[df['chain'] == 'A']

#     # Select the shortest distance interaction for each resi and interacting_resinumber
#     df = df.loc[df.groupby(['file', 'resi', 'interacting_resinumber'])['distance (angstroms)'].idxmin()]

#     df['seed'] = df['file'].str.extract(r'_seed(\d+)')[0].str.upper()
#     df['file'] = df['file'].str.replace(r'_seed\d+_model_0', '', regex=True).str.strip()

#     # Create a column with a tuple that includes resi and interacting_resinumber for each unique
#     df['resi_interacting_resinumber'] = df[['resi', 'interacting_resinumber']].apply(lambda x: (x['resi'], x['interacting_resinumber']), axis=1)

#     # Process each unique file separately
#     unique_files = df['file'].unique()
#     dfs = []

#     for file in unique_files:
#         df_file = df[df['file'] == file]

#         # Find the common interactions across all seeds of the current file
#         common_interactions = df_file.groupby('seed')['resi_interacting_resinumber'].apply(set).reset_index()
#         common_interactions = set.intersection(*common_interactions['resi_interacting_resinumber'])

#         # Filter the DataFrame to keep only the common interactions
#         df_file = df_file[df_file['resi_interacting_resinumber'].isin(common_interactions)]

#         # Compute the average distance for each interacting pair across the seeds
#         df_file = df_file.groupby(['file', 'chain', 'resi', 'resn', 'interacting_chain', 'interacting_resn', 'interacting_resinumber', 'residue_one_letter', 'interactingresi_oneletter'])['distance (angstroms)'].mean().reset_index()

#         dfs.append(df_file)

#     # Merge all the DataFrames
#     df_final = pd.concat(dfs, ignore_index=True)

#     # Now, we keep only those residues that interact with two or more unique chains
#     chain_counts = df_final.groupby('resi')['interacting_resinumber'].nunique()
#     dual_interactions = chain_counts[chain_counts >= 2].index

#     # Filter the DataFrame to keep only residues with dual interactions
#     df_final = df_final[df_final['resi'].isin(dual_interactions)]

#     # Additionally, ensure that the final df only includes interactions with two or more chains
#     df_final = df_final.groupby('resi').filter(lambda x: x['interacting_chain'].nunique() >= 2)

#     # Save the final DataFrame
#     df_final.to_csv(output_csv_path, index=False)

#     # Optional: Mapping chain names
#     chain_map = {
#         'B': 'kinesin B',
#         'C': 'alpha-tubulin',
#         'D': 'beta-tubulin',
#         'E': 'ATP',
#         'F': 'ATP'
#     }

#     df_final['interacting_chain'] = df_final['interacting_chain'].map(chain_map)
    
#     return df_final

# # Example usage
# csv_file_path = "../../data/3d_predictions/chimeras_5seeds/interactions_chimeras.csv"
# output_csv_path = "../../data/3d_predictions/chimeras_5seeds/average_dual_interactions_chimeras.csv"
# df_dual_interactions = process_and_save_interactions(csv_file_path, output_csv_path)
# df_dual_interactions


In [10]:
# # Define the path to the CSV file
# csv_file_path = "../../data/3d_predictions/chimeras_5seeds/average_dual_interactions_chimeras.csv"

# # Process the DataFrame
# df_processed = process_dataframe(csv_file_path)

# # Define the threshold value for distance filtering
# distance_threshold = 7
# # Create and display the plots with specified x-axis range, interacting chains, and files to plot
# xaxis_range = [0, 600]  # Example x-axis range
# yaxis_range = [1.5, distance_threshold + 0.5]
# interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

# files_to_plot = None #['A', 'C', 'H']  # Example files to plot

# plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot, chimeras=True)


In [None]:
import plotly.express as px
from ipywidgets import interact, FloatSlider, IntRangeSlider
import numpy as np


# Function to update the plot based on the distance threshold and resi range
def update_plot(distance_threshold, resi_range):
    

    # Define marker_symbols and color_map
    marker_symbols = {
        'ATP': 'circle',
        'beta-tubulin': 'square',
        'alpha-tubulin': 'diamond',
        'kinesin B': 'cross'
    }

    color_map = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }

    df_filtered = df_average[(df_average['distance (angstroms)'] <= distance_threshold) & 
                             (df_average['resi'] >= resi_range[0]) & 
                             (df_average['resi'] <= resi_range[1])]
    df_avg_distance = df_filtered.groupby(['file', 'interacting_chain']).agg({
        'distance (angstroms)': 'mean',
        'resi': 'count',
        'interactingresi_oneletter': lambda x: ', '.join(sorted(set(x)))
    }).reset_index()
    df_avg_distance = df_avg_distance.rename(columns={'resi': 'residues'})

    # Sort the DataFrame by 'file' column to ensure alphabetical order
    df_avg_distance = df_avg_distance.sort_values('file')

    # Add jitter to x coordinates only
    np.random.seed(42)  # Set a seed for consistency
    jitter_x = np.random.uniform(-0.1, 0.2, size=len(df_avg_distance))
    df_avg_distance['x_jittered'] = df_avg_distance['file'].astype('category').cat.codes + jitter_x

    fig = px.scatter(df_avg_distance, x='x_jittered', y='distance (angstroms)', 
                     color='interacting_chain', symbol='interacting_chain',
                     symbol_map=marker_symbols, color_discrete_map=color_map,
                     category_orders={"file": sorted(df_avg_distance['file'].unique())},
                     hover_data=['residues', 'file', 'distance (angstroms)', 'interactingresi_oneletter'],
                     text='residues')  # Add 'residues' as text

    fig.update_traces(marker=dict(size=10),
                      textposition='middle right')  # Position the text to the right of the markers
    fig.update_layout(title='Average Distance by File and Interacting Chain',
                      xaxis_title='File',
                      yaxis_title='Average Distance (Angstroms)',
                      height=800,  # Increased height to make the plot taller
                      xaxis=dict(tickmode='array', tickvals=list(range(len(df_avg_distance['file'].unique()))), ticktext=sorted(df_avg_distance['file'].unique())),
                    #   yaxis=dict(range=[3, 4])
                      )  
    return fig

# Create interactive sliders
interact(update_plot, 
         distance_threshold=FloatSlider(value=5.5, min=0, max=10, step=0.01, description='Distance Threshold'),
         resi_range=IntRangeSlider(value=[1, 401], min=1, max=600, step=1, description='Residue Range'))


- from 318 to 600, B and F don't have interaction with alpha tubulin
- from 1-141, F is the first to show interaction with alpha tubulin
- from 329 to 401 you can see how A has the most interactions with itself, and is most flexible than B, C and D. Here E and G are farther away. HF and G are about the same as A but less interactions
- if set to largest distance (7A), you can see how B and F have the least interactions with beta tubulin, which makes them interact closer.

In [None]:
# Create a dictionary to store dataframes for each file
dfs = {}

# Get unique files
files = df_average['file'].unique()

for file in files:
    # Create copy for this file
    df = df_average[df_average['file'] == file].copy()
    
    # Filter to get the shortest distances for each interaction
    df = df.loc[df[df['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]
    
    # Filter by distance threshold
    df = df[df['distance (angstroms)'] <= 3.5]
    
    # Sort by resi
    df = df.sort_values(['resi'])
    
    # Store in dictionary
    dfs[file] = df

# Display all dataframes
for file, df in dfs.items():
    print(f"\nFile {file}:")
    display(df.head())

In [None]:
from Bio import AlignIO

# Define the path to your alignment file
alignment_path = "../../../../Downloads/clustalo-I20241108-132051-0935-36776795-p1m.aln-clustal_num"

# Read the alignment file in Clustal format
alignment = AlignIO.read(alignment_path, "clustal")

# Print the alignment to view it
print(alignment)

# Iterate through each sequence in the alignment and print details
for record in alignment:
    print(f"ID: {record.id}")
    print(f"Sequence: {record.seq}")
    print(f"Length: {len(record.seq)}\n")

In [None]:
dfs['A']

In [None]:
from Bio import AlignIO
import pandas as pd

# Read the alignment file
alignment_path = "../../../../Downloads/clustalo-I20241108-123213-0487-19034450-p1m.aln-clustal_num"
alignment = AlignIO.read(alignment_path, "clustal")

# Extract sequences for A and H
seq_A = None
seq_H = None

for record in alignment:
    if record.id == 'A':
        seq_A = record.seq
    elif record.id == 'H':
        seq_H = record.seq

# Create a mapping from H to A
mapping_H_to_A = {}
resi_A = 0
resi_H = 0

for a, h in zip(seq_A, seq_H):
    if a != '-':
        resi_A += 1
    if h != '-':
        resi_H += 1
        if a != '-':
            mapping_H_to_A[resi_H] = int(resi_A)

# Assuming df_H is your DataFrame for file H
df_H = dfs['H'].copy()

# Add the 'aligned resi' column
df_H['aligned resi'] = df_H['resi'].map(mapping_H_to_A).astype('Int64')

# Display the updated DataFrame for %%HTML  ~
df_H


## Now let's take a look at species seeds

In [9]:
# Function to generate plots without background color shapes
def create_plots(df, distance_threshold, xaxis_range=None, yaxis_range=None, interacting_chains=None, files_to_plot=None):
    unique_files = sorted(df['file'].unique())  # Sort the unique files alphabetically
    plots = []

    # Define a fixed color mapping for interacting chains
    color_map = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }

    # Define a fixed order for the legend
    legend_order = ['kinesin B', 'alpha-tubulin', 'beta-tubulin', 'ATP']

    # Define marker shapes for amino acid classifications
    shape_map = {
        'nonpolar': 'circle',
        'polar uncharged': 'square',
        'positively charged': 'triangle-up',
        'negatively charged': 'triangle-down'
    }

    # Filter the files to plot if specified
    if files_to_plot:
        unique_files = [file for file in unique_files if any(file.startswith(prefix) for prefix in files_to_plot)]

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Filter to get the shortest distances for each interaction
        df_filtered = df_file.loc[df_file[df_file['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]

        # Filter by distance threshold
        df_filtered = df_filtered[df_filtered['distance (angstroms)'] <= distance_threshold]

        # Filter by interacting chains if specified
        if interacting_chains:
            df_filtered = df_filtered[df_filtered['interacting_chain'].isin(interacting_chains)]

        # Create the interactive plot with consistent color mapping and shape mapping
        fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
                         symbol='aa_classification', symbol_map=shape_map,
                         color_discrete_map=color_map,
                         category_orders={'interacting_chain': legend_order},
                         labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
                         title=f"Interactions for {file}")

        # Update layout for better visualization
        fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
        fig.update_layout(
            legend_title_text='Interacting Chain',
            yaxis_range=yaxis_range if yaxis_range else [0.1, distance_threshold + 0.5]  # Set y-axis range based on input or default
        )

        # Set x-axis range if specified
        if xaxis_range:
            fig.update_layout(xaxis_range=xaxis_range)

        # Add more ticks to the x-axis
        fig.update_xaxes(dtick=20)

        # Define plot size
        fig.update_layout(autosize=False, width=1600, height=600)

        # Show the plot
        fig.show()

        plots.append(fig)

    return plots

In [10]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/species_seeds/interactions_chimeras.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 4.2

# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 600]  # Example x-axis range
yaxis_range = [1.5, distance_threshold + 0.5]
interacting_chains = ['ATP', 'beta-tubulin', 'alpha-tubulin', 'kinesin B']  # Example interacting chains to display

files_to_plot =  None#['A', 'H']  # Example files to plot

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot)


In [11]:
def process_and_save_interactions(input_csv_path, output_csv_path):
    df = pd.read_csv(input_csv_path)

    df = df.loc[df['chain'] == 'A']

    # Select the shortest distance interaction for each resi and interacting_resinumber
    df = df.loc[df.groupby(['file', 'resi', 'interacting_resinumber'])['distance (angstroms)'].idxmin()]

    df['seed'] = df['file'].str.extract(r'_seed(\d+)')[0].str.upper()
    df['file'] = df['file'].str.replace(r'_seed\d+_model_0', '', regex=True).str.strip()

    # Create a column with a tuple that includes resi and interacting_resinumber for each unique
    df['resi_interacting_resinumber'] = df[['resi', 'interacting_resinumber']].apply(lambda x: (x['resi'], x['interacting_resinumber']), axis=1)

    # Process each unique file separately
    unique_files = df['file'].unique()
    dfs = []

    for file in unique_files:
        df_file = df[df['file'] == file]

        # Find the common interactions across all seeds of the current file
        common_interactions = df_file.groupby('seed')['resi_interacting_resinumber'].apply(set).reset_index()
        common_interactions = set.intersection(*common_interactions['resi_interacting_resinumber'])

        # Filter the DataFrame to keep only the common interactions
        df_file = df_file[df_file['resi_interacting_resinumber'].isin(common_interactions)]

        # Compute the average distance for each interacting pair across the seeds
        df_file = df_file.groupby(['file', 'chain', 'resi', 'resn', 'interacting_chain', 'interacting_resn', 'interacting_resinumber', 'residue_one_letter', 'interactingresi_oneletter'])['distance (angstroms)'].mean().reset_index()

        dfs.append(df_file)

    # Merge all the DataFrames
    df_final = pd.concat(dfs, ignore_index=True)

    # Save the final DataFrame
    df_final.to_csv(output_csv_path, index=False)

    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df_final['interacting_chain'] = df_final['interacting_chain'].map(chain_map)
    
    return df_final

  

# Example usage
csv_file_path = "../../data/3d_predictions/species_seeds/interactions_chimeras.csv"
output_csv_path = "../../data/3d_predictions/species_seeds/average_interactions_species.csv"
df_average = process_and_save_interactions(csv_file_path, output_csv_path)
df_average

Unnamed: 0,file,chain,resi,resn,interacting_chain,interacting_resn,interacting_resinumber,residue_one_letter,interactingresi_oneletter,distance (angstroms)
0,Acsu2,A,9,ARG,ATP,ATP,1,R,?,3.833171
1,Acsu2,A,10,PHE,ATP,ATP,1,F,?,4.914591
2,Acsu2,A,11,ARG,ATP,ATP,1,R,?,3.733138
3,Acsu2,A,12,PRO,ATP,ATP,1,P,?,3.663855
4,Acsu2,A,14,ASN,ATP,ATP,1,N,?,5.810327
...,...,...,...,...,...,...,...,...,...,...
6047,Tila,A,505,ILE,kinesin B,LEU,501,I,L,2.817382
6048,Tila,A,505,ILE,kinesin B,LYS,504,I,K,6.127687
6049,Tila,A,508,ASP,kinesin B,ILE,505,D,I,5.939484
6050,Tila,A,508,ASP,kinesin B,TYR,509,D,Y,3.978279


In [13]:
# Define the path to the CSV file
csv_file_path = "../../data/3d_predictions/species_seeds/average_interactions_species.csv"

# Process the DataFrame
df_processed = process_dataframe(csv_file_path)

# Define the threshold value for distance filtering
distance_threshold = 4.5

# Create and display the plots with specified x-axis range, interacting chains, and files to plot
xaxis_range = [0, 600]  # Example x-axis range
yaxis_range = [1.5, distance_threshold + 0.5]
interacting_chains = ['beta-tubulin', ]  # Example interacting chains to display

files_to_plot = None#['Acsu2', 'Acsu', 'Heal'] # Example files to plot 

plots = create_plots(df_processed, distance_threshold, xaxis_range=xaxis_range, yaxis_range=yaxis_range, interacting_chains=interacting_chains[:4], files_to_plot=files_to_plot)


In [17]:
df = pd.read_csv("../../data/3d_predictions/jiapei/interactions_chimeras.csv")
# df = df[df['file'].isin(['A_seed55_model_0', 'H_seed44_model_0'])]
df

Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,interacting_resinumber,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,interactingresi_oneletter
0,A_seed44_model_0,B,205,ASN,ND2,PB,ATP,F,2,4.058457,N,Nitrogen delta 2,Phosphorus beta,?
1,A_seed44_model_0,A,245,VAL,O,OH,TYR,C,108,6.872777,V,Oxygen,Unknown,Y
2,A_seed44_model_0,A,363,LYS,N,OD1,ASN,B,359,6.523417,K,Nitrogen,Oxygen delta 1,N
3,A_seed44_model_0,A,243,GLU,O,N,GLU,C,414,4.755854,E,Oxygen,Nitrogen,E
4,A_seed44_model_0,A,362,LEU,CG,CD2,LEU,B,362,6.237262,L,Carbon gamma,Carbon delta 2,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32077,H_seed44_model_0,A,504,SER,CB,CE2,PHE,D,394,6.359934,S,Carbon beta,Carbon epsilon 2,F
32078,H_seed44_model_0,A,251,LEU,CD2,C,TYR,C,108,5.509418,L,Carbon delta 2,Carbon,Y
32079,H_seed44_model_0,A,469,LYS,CD,NZ,LYS,B,473,4.933531,K,Carbon delta,Nitrogen zeta,K
32080,H_seed44_model_0,A,235,LEU,C,O1B,ATP,E,1,6.663004,L,Carbon,Oxygen 1 beta,?


In [18]:
# In the column 'file', just leave the first letter
df['file'] = df['file'].str[0]
df


Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,interacting_resinumber,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,interactingresi_oneletter
0,A,B,205,ASN,ND2,PB,ATP,F,2,4.058457,N,Nitrogen delta 2,Phosphorus beta,?
1,A,A,245,VAL,O,OH,TYR,C,108,6.872777,V,Oxygen,Unknown,Y
2,A,A,363,LYS,N,OD1,ASN,B,359,6.523417,K,Nitrogen,Oxygen delta 1,N
3,A,A,243,GLU,O,N,GLU,C,414,4.755854,E,Oxygen,Nitrogen,E
4,A,A,362,LEU,CG,CD2,LEU,B,362,6.237262,L,Carbon gamma,Carbon delta 2,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32077,H,A,504,SER,CB,CE2,PHE,D,394,6.359934,S,Carbon beta,Carbon epsilon 2,F
32078,H,A,251,LEU,CD2,C,TYR,C,108,5.509418,L,Carbon delta 2,Carbon,Y
32079,H,A,469,LYS,CD,NZ,LYS,B,473,4.933531,K,Carbon delta,Nitrogen zeta,K
32080,H,A,235,LEU,C,O1B,ATP,E,1,6.663004,L,Carbon,Oxygen 1 beta,?


In [19]:
# Select only rows where the 'file' column is 'A' or 'H'
df_filtered = df[df['file'].isin(['A', 'H'])]
df_filtered



Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,interacting_resinumber,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,interactingresi_oneletter
0,A,B,205,ASN,ND2,PB,ATP,F,2,4.058457,N,Nitrogen delta 2,Phosphorus beta,?
1,A,A,245,VAL,O,OH,TYR,C,108,6.872777,V,Oxygen,Unknown,Y
2,A,A,363,LYS,N,OD1,ASN,B,359,6.523417,K,Nitrogen,Oxygen delta 1,N
3,A,A,243,GLU,O,N,GLU,C,414,4.755854,E,Oxygen,Nitrogen,E
4,A,A,362,LEU,CG,CD2,LEU,B,362,6.237262,L,Carbon gamma,Carbon delta 2,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32077,H,A,504,SER,CB,CE2,PHE,D,394,6.359934,S,Carbon beta,Carbon epsilon 2,F
32078,H,A,251,LEU,CD2,C,TYR,C,108,5.509418,L,Carbon delta 2,Carbon,Y
32079,H,A,469,LYS,CD,NZ,LYS,B,473,4.933531,K,Carbon delta,Nitrogen zeta,K
32080,H,A,235,LEU,C,O1B,ATP,E,1,6.663004,L,Carbon,Oxygen 1 beta,?


In [20]:
# Create a column named 'x' that combines the row values of the columns file, chain, resi, resn
df_filtered['x'] = df_filtered.apply(lambda row: f"{row['file']}_{row['chain']}_{row['resi']}_{row['resn']}", axis=1)
df_filtered

Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,interacting_resinumber,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,interactingresi_oneletter,x
0,A,B,205,ASN,ND2,PB,ATP,F,2,4.058457,N,Nitrogen delta 2,Phosphorus beta,?,A_B_205_ASN
1,A,A,245,VAL,O,OH,TYR,C,108,6.872777,V,Oxygen,Unknown,Y,A_A_245_VAL
2,A,A,363,LYS,N,OD1,ASN,B,359,6.523417,K,Nitrogen,Oxygen delta 1,N,A_A_363_LYS
3,A,A,243,GLU,O,N,GLU,C,414,4.755854,E,Oxygen,Nitrogen,E,A_A_243_GLU
4,A,A,362,LEU,CG,CD2,LEU,B,362,6.237262,L,Carbon gamma,Carbon delta 2,L,A_A_362_LEU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32077,H,A,504,SER,CB,CE2,PHE,D,394,6.359934,S,Carbon beta,Carbon epsilon 2,F,H_A_504_SER
32078,H,A,251,LEU,CD2,C,TYR,C,108,5.509418,L,Carbon delta 2,Carbon,Y,H_A_251_LEU
32079,H,A,469,LYS,CD,NZ,LYS,B,473,4.933531,K,Carbon delta,Nitrogen zeta,K,H_A_469_LYS
32080,H,A,235,LEU,C,O1B,ATP,E,1,6.663004,L,Carbon,Oxygen 1 beta,?,H_A_235_LEU


In [21]:
# Create a column named 'x' that combines the row values of the columns file, chain, resi, resn
df_average['x'] = df_average.apply(lambda row: f"{row['file']}_{row['chain']}_{row['resi']}_{row['resn']}", axis=1)
df_average


Unnamed: 0,file,chain,resi,resn,interacting_chain,interacting_resn,interacting_resinumber,residue_one_letter,interactingresi_oneletter,distance (angstroms),x
0,A,A,18,ARG,ATP,ATP,1,R,?,3.804740,A_A_18_ARG
1,A,A,19,PHE,ATP,ATP,1,F,?,4.696334,A_A_19_PHE
2,A,A,20,ARG,ATP,ATP,1,R,?,3.641895,A_A_20_ARG
3,A,A,21,PRO,ATP,ATP,1,P,?,3.541921,A_A_21_PRO
4,A,A,23,ASN,ATP,ATP,1,N,?,5.803859,A_A_23_ASN
...,...,...,...,...,...,...,...,...,...,...,...
4732,H,A,579,VAL,kinesin B,ARG,566,V,R,4.842717,H_A_579_VAL
4733,H,A,579,VAL,kinesin B,LEU,563,V,L,4.625987,H_A_579_VAL
4734,H,A,580,PRO,kinesin B,ARG,566,P,R,4.409354,H_A_580_PRO
4735,H,A,580,PRO,kinesin B,GLN,559,P,Q,5.759448,H_A_580_PRO


In [22]:
# Filter df_filtered to only include rows where 'x' values are in df_average
df_filtered = df_filtered[df_filtered['x'].isin(df_average['x'])]

# change 'A' for 'K401' and 'H' for 'Kif3'
df_filtered['file'] = df_filtered['file'].replace({'A': 'K401', 'H': 'Kif3'})

# replace values in 'interacting_chain' according to the chain_map dictionary
chain_map = {
    'B': 'kinesin B',
    'C': 'alpha-tubulin',
    'D': 'beta-tubulin',
    'E': 'ATP',
    'F': 'ATP'
}
df_filtered['interacting_chain'] = df_filtered['interacting_chain'].replace(chain_map)
df_filtered



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,interacting_resinumber,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,interactingresi_oneletter,x
1,K401,A,245,VAL,O,OH,TYR,alpha-tubulin,108,6.872777,V,Oxygen,Unknown,Y,A_A_245_VAL
2,K401,A,363,LYS,N,OD1,ASN,kinesin B,359,6.523417,K,Nitrogen,Oxygen delta 1,N,A_A_363_LYS
3,K401,A,243,GLU,O,N,GLU,alpha-tubulin,414,4.755854,E,Oxygen,Nitrogen,E,A_A_243_GLU
4,K401,A,362,LEU,CG,CD2,LEU,kinesin B,362,6.237262,L,Carbon gamma,Carbon delta 2,L,A_A_362_LEU
5,K401,A,244,LYS,CB,CG,GLU,alpha-tubulin,417,5.227245,K,Carbon beta,Carbon gamma,E,A_A_244_LYS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32076,Kif3,A,578,LEU,CB,CD2,LEU,kinesin B,563,6.661004,L,Carbon beta,Carbon delta 2,L,H_A_578_LEU
32077,Kif3,A,504,SER,CB,CE2,PHE,beta-tubulin,394,6.359934,S,Carbon beta,Carbon epsilon 2,F,H_A_504_SER
32078,Kif3,A,251,LEU,CD2,C,TYR,alpha-tubulin,108,5.509418,L,Carbon delta 2,Carbon,Y,H_A_251_LEU
32079,Kif3,A,469,LYS,CD,NZ,LYS,kinesin B,473,4.933531,K,Carbon delta,Nitrogen zeta,K,H_A_469_LYS


In [23]:
len(df.columns)

14

In [24]:
a = ['file', 'chain', 'resi', 'resn', 'residue_one_letter', 'atom_name', 'full_atom_name', 'interacting_resn', 'interacting_resinumber', 'interactingresi_oneletter', 'interacting_chain', 'interacting_atom','interacting_full_atom_name', 'distance (angstroms)', 'x']

len(a)

15

In [25]:
# save df_filtered to csv with columns ordered according to 'a' and without the column 'x'
df_filtered[a].drop(columns=['x']).to_csv("../../../../Downloads/interactions_chimeras_filtered.csv", index=False)
