In [29]:
# import pandas as pd
# import plotly.express as px

# # Define the path to the CSV file
# csv_file_path = "../../data/3d_predictions/motor2x_ATP_ADP_Mg2x_alphaTub_betaTub/interactions_chimeras.csv"

# # Read the CSV file into a DataFrame
# df = pd.read_csv(csv_file_path)

# # Function to create a plot for each file
# def create_plots(df):
#     unique_files = df['file'].unique()
#     plots = []

#     # Define a fixed color mapping for interacting chains
#     color_map = {
#         'kinesin B': '#1f77b4',  # blue
#         'alpha-tubulin': '#ff7f0e',  # orange
#         'beta-tubulin': '#2ca02c',  # green
#         'ATP': '#d62728',  # red
#     }

#     for file in unique_files:
#         df_file = df[df['file'] == file]

#         # Filter to get the shortest distances for each interaction
#         df_filtered = df_file.loc[df_file[df_file['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]


#         # Map resn (three-letter residue names) to single-letter codes for better readability
#         aa_dict = {
#             'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
#             'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
#             'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
#         }
#         df_filtered['residue_one_letter'] = df_filtered['resn'].map(aa_dict)
#         df_filtered['residue'] = df_filtered['resi'].astype(str) + df_filtered['residue_one_letter']

#         # Replace chain letters with descriptive names
#         chain_map = {
#             'B': 'kinesin B',
#             'C': 'alpha-tubulin',
#             'D': 'beta-tubulin',
#             'E': 'ATP',
#             'F': 'ATP'
#         }
#         df_filtered['interacting_chain'] = df_filtered['interacting_chain'].map(chain_map)

#         # Create the interactive plot with consistent color mapping
#         fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
#                          color_discrete_map=color_map,
#                          labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
#                          title=f"Interactions for {file}")

#         # Update layout for better visualization
#         fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
#         fig.update_layout(
#             legend_title_text='Interacting Chain',
#             xaxis_range=[0, 401],  # Set a fixed range for the x-axis
#             yaxis_range=[0, 5.25]  # Set a fixed range for the y-axis
#         )

#         # Show the plot
#         fig.show()

#         plots.append(fig)

#     return plots

# # Create and display the plots
# plots = create_plots(df)


In [51]:
import pandas as pd
import plotly.express as px

# Define the path to the CSV file
# (relative path, so it is resolved against the current working directory)
csv_file_path = "../../data/3d_predictions/motor2x_ATP_ADP_Mg2x_alphaTub_betaTub/interactions_chimeras.csv"


# Function to read and process the DataFrame with a distance threshold
def process_dataframe(csv_file_path, distance_threshold):
    """Load the interactions CSV, filter it by distance and add readable labels.

    Args:
        csv_file_path: Path to a CSV file that has at least the columns
            'distance (angstroms)', 'resn', 'resi' and 'interacting_chain'.
        distance_threshold: Rows whose 'distance (angstroms)' is greater than
            this value are dropped (the comparison is inclusive, ``<=``).

    Returns:
        A new DataFrame with the surviving rows and
        * 'residue_one_letter': 'resn' translated through the three-letter to
          one-letter table below (NaN when 'resn' is not in the table),
        * 'residue': 'resi' as a string followed by the one-letter code,
        * 'interacting_chain': overwritten with the descriptive chain name
          (NaN when the chain letter is not in the table below).
    """
    # Three-letter residue names ('resn') -> single-letter codes
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }

    # Chain letters -> descriptive names
    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df = pd.read_csv(csv_file_path)

    # Keep only rows within the threshold. The explicit .copy() makes the
    # filtered frame independent of the one read from disk, so the column
    # assignments below do not trigger pandas' SettingWithCopyWarning.
    df = df[df['distance (angstroms)'] <= distance_threshold].copy()

    df['residue_one_letter'] = df['resn'].map(aa_dict)
    df['residue'] = df['resi'].astype(str) + df['residue_one_letter']
    df['interacting_chain'] = df['interacting_chain'].map(chain_map)

    return df

# Function to generate plots
def create_plots(df):
    """Draw one interactive scatter plot per distinct value of the 'file' column.

    For every file, only rows with chain 'A' are considered, and for each
    ('resi', 'interacting_chain') pair the row with the smallest
    'distance (angstroms)' is kept. Each figure is shown immediately via
    ``fig.show()`` and also collected.

    Args:
        df: DataFrame with the columns 'file', 'chain', 'resi',
            'interacting_chain', 'distance (angstroms)' and
            'residue_one_letter'.

    Returns:
        List of the created figures, in order of first appearance of each
        'file' value.
    """
    distance_col = 'distance (angstroms)'

    # Fixed colour per interacting chain so colours agree across all figures
    chain_colors = {
        'kinesin B': '#1f77b4',  # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',  # green
        'ATP': '#d62728',  # red
    }
    axis_labels = {
        'resi': 'Residue Index',
        'distance (angstroms)': 'Distance (angstroms)',
        'interacting_chain': 'Interacting Chain',
    }

    figures = []
    for file_name in df['file'].unique():
        per_file = df[df['file'] == file_name]

        # Index labels of the closest contact for each (residue, partner chain) pair on chain A
        chain_a_rows = per_file[per_file['chain'] == 'A']
        closest_labels = chain_a_rows.groupby(['resi', 'interacting_chain'])[distance_col].idxmin()
        closest = per_file.loc[closest_labels]

        fig = px.scatter(
            closest,
            x='resi',
            y=distance_col,
            color='interacting_chain',
            text='residue_one_letter',
            color_discrete_map=chain_colors,
            labels=axis_labels,
            title=f"Interactions for {file_name}",
        )

        # Residue letters sit above semi-transparent markers
        fig.update_traces(textposition='top center', marker={'size': 10, 'opacity': 0.6})
        # Only the y-axis range is fixed; the x-axis is left to autoscale
        fig.update_layout(legend_title_text='Interacting Chain', yaxis_range=[0, 5.25])

        fig.show()
        figures.append(fig)

    return figures

# Define the threshold value for distance filtering
# (process_dataframe keeps rows whose 'distance (angstroms)' is <= this value)
distance_threshold = 3.5

# Process the DataFrame with a distance threshold
df_processed = process_dataframe(csv_file_path, distance_threshold)

# Create and display the plots
# (create_plots shows every figure as it is built and also returns them as a list)
plots = create_plots(df_processed)



In [62]:
# New function to extract interacting amino acids and positions with 'file', 'resi', 'aminoacid', and 'interacting_partner'
def extract_interacting_amino_acids(df):
    """Return the unique chain-'A' residues together with their interaction partners.

    Args:
        df: DataFrame with the columns 'chain', 'file', 'resi',
            'residue_one_letter' and 'interacting_chain'. It is not modified.

    Returns:
        A new DataFrame with the columns 'file', 'resi', 'aminoacid' and
        'interacting_partner', de-duplicated, sorted by 'file' then 'resi',
        and carrying a fresh 0-based index.
    """
    wanted_columns = ['file', 'resi', 'residue_one_letter', 'interacting_chain']
    new_names = {
        'residue_one_letter': 'aminoacid',
        'interacting_chain': 'interacting_partner',
    }
    on_chain_a = df['chain'] == 'A'

    return (
        df.loc[on_chain_a, wanted_columns]
        .drop_duplicates()  # several rows can describe the same residue/partner pair
        .rename(columns=new_names)
        .sort_values(by=['file', 'resi'])
        .reset_index(drop=True)
    )

# Extract interacting amino acids and their partners with 'file', 'resi', 'aminoacid', and 'interacting_partner'
interacting_amino_acids = extract_interacting_amino_acids(df_processed)
# Bare expression: presumably here so the notebook renders the DataFrame as the cell output
interacting_amino_acids

Unnamed: 0,file,resi,aminoacid,interacting_partner
0,A,20,R,ATP
1,A,21,P,ATP
2,A,51,K,beta-tubulin
3,A,94,T,ATP
4,A,95,S,ATP
...,...,...,...,...
615,H,563,L,kinesin B
616,H,566,R,kinesin B
617,H,567,I,kinesin B
618,H,575,I,kinesin B


In [65]:
# Function to extract amino acid sequences and export them to a .fasta file
def extract_aminoacid_sequence_to_fasta(df, fasta_filename):
    """Write the interacting chain-'A' residues of every file as FASTA records.

    For each distinct 'file' value, the unique ('resi', 'residue_one_letter')
    pairs found on chain 'A' are ordered by 'resi' and their one-letter codes
    are concatenated into a single sequence line. One record is written per
    file, with records ordered by 'file'::

        >{file}
        {sequence}

    Args:
        df: DataFrame with the columns 'chain', 'file', 'resi' and
            'residue_one_letter'. It is not modified.
        fasta_filename: Path of the output file; it is created or overwritten.

    Returns:
        None. The result is the file written to ``fasta_filename``.
    """
    residues = (
        df.loc[df['chain'] == 'A', ['file', 'resi', 'residue_one_letter']]
        .drop_duplicates()  # one entry per residue, however many partners it has
        .sort_values(by=['file', 'resi'])
    )

    with open(fasta_filename, 'w', encoding='utf-8') as fasta_file:
        # groupby sorts the groups by 'file' and keeps the 'resi' order within each group
        for file_name, letters in residues.groupby('file')['residue_one_letter']:
            # A missing one-letter code (NaN/None, e.g. a residue name that had
            # no single-letter mapping) is written as 'X' instead of making
            # str.join raise a TypeError.
            sequence = ''.join(
                letter if isinstance(letter, str) else 'X' for letter in letters
            )
            fasta_file.write(f">{file_name}\n")
            fasta_file.write(f"{sequence}\n")

# Define the filename for the output .fasta file
# (relative path, so it is resolved against the current working directory)
fasta_filename = '../../data/interacting_aminoacid_sequences.fasta'

# Extract amino acid sequences and write to .fasta file
# (df_processed is the distance-filtered DataFrame returned by process_dataframe)
extract_aminoacid_sequence_to_fasta(df_processed, fasta_filename)
