In [1]:
# import pandas as pd
# import plotly.express as px

# # Define the path to the CSV file
# csv_file_path = "../../data/3d_predictions/motor2x_ATP_ADP_Mg2x_alphaTub_betaTub/interactions_chimeras.csv"

# # Read the CSV file into a DataFrame
# df = pd.read_csv(csv_file_path)

# # Function to create a plot for each file
# def create_plots(df):
#     unique_files = df['file'].unique()
#     plots = []

#     # Define a fixed color mapping for interacting chains
#     color_map = {
#         'kinesin B': '#1f77b4',  # blue
#         'alpha-tubulin': '#ff7f0e',  # orange
#         'beta-tubulin': '#2ca02c',  # green
#         'ATP': '#d62728',  # red
#     }

#     for file in unique_files:
#         df_file = df[df['file'] == file]

#         # Filter to get the shortest distances for each interaction
#         df_filtered = df_file.loc[df_file[df_file['chain'] == 'A'].groupby(['resi', 'interacting_chain'])['distance (angstroms)'].idxmin()]


#         # Map interacting_resn to single-letter codes for better readability
#         aa_dict = {
#             'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
#             'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
#             'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
#         }
#         df_filtered['residue_one_letter'] = df_filtered['resn'].map(aa_dict)
#         df_filtered['residue'] = df_filtered['resi'].astype(str) + df_filtered['residue_one_letter']

#         # Replace chain letters with descriptive names
#         chain_map = {
#             'B': 'kinesin B',
#             'C': 'alpha-tubulin',
#             'D': 'beta-tubulin',
#             'E': 'ATP',
#             'F': 'ATP'
#         }
#         df_filtered['interacting_chain'] = df_filtered['interacting_chain'].map(chain_map)

#         # Create the interactive plot with consistent color mapping
#         fig = px.scatter(df_filtered, x='resi', y='distance (angstroms)', color='interacting_chain', text='residue_one_letter',
#                          color_discrete_map=color_map,
#                          labels={'resi': 'Residue Index', 'distance (angstroms)': 'Distance (angstroms)', 'interacting_chain': 'Interacting Chain'},
#                          title=f"Interactions for {file}")

#         # Update layout for better visualization
#         fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))
#         fig.update_layout(
#             legend_title_text='Interacting Chain',
#             xaxis_range=[0, 401],  # Set a fixed range for the x-axis
#             yaxis_range=[0, 5.25]  # Set a fixed range for the y-axis
#         )

#         # Show the plot
#         fig.show()

#         plots.append(fig)

#     return plots

# # Create and display the plots
# plots = create_plots(df)


In [4]:
import pandas as pd
import plotly.express as px

# Location of the interactions table, relative to the notebook's working directory
csv_file_path = (
    "../../data/3d_predictions/"
    "motor2x_ATP_ADP_Mg2x_alphaTub_betaTub/"
    "interactions_chimeras.csv"
)


# Function to read and process the DataFrame with a distance threshold
def process_dataframe(csv_file_path, distance_threshold):
    """Load the interactions table, keep close contacts and add readable labels.

    Parameters
    ----------
    csv_file_path : str or file-like
        Passed straight to ``pd.read_csv``. The table must contain the columns
        ``distance (angstroms)``, ``resn``, ``resi`` and ``interacting_chain``.
    distance_threshold : float
        Rows whose ``distance (angstroms)`` is greater than this value are
        dropped (the comparison is inclusive: ``<=`` is kept).

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index labels preserved) with:

        * ``residue_one_letter`` - one-letter code looked up from ``resn``;
          NaN when ``resn`` is not one of the 20 three-letter codes below.
        * ``residue`` - ``resi`` as text followed by the one-letter code;
          NaN whenever ``residue_one_letter`` is NaN.
        * ``interacting_chain`` - overwritten with a descriptive name; chain
          letters that are not keys of ``chain_map`` become NaN.
    """
    # Three-letter -> one-letter amino-acid codes
    aa_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
        'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
        'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }

    # Chain letter -> descriptive partner name.
    # NOTE(review): chain 'A' has no entry, so rows whose interacting chain is
    # 'A' end up with NaN here - confirm this is intended.
    # NOTE(review): 'F' is labelled 'ATP' as well; verify against the input
    # data whether that chain actually holds ADP.
    chain_map = {
        'B': 'kinesin B',
        'C': 'alpha-tubulin',
        'D': 'beta-tubulin',
        'E': 'ATP',
        'F': 'ATP'
    }

    df = pd.read_csv(csv_file_path)

    # Filter out rows where the distance is greater than the threshold.
    # The explicit .copy() makes the result an independent frame, so the
    # column assignments below do not trigger SettingWithCopyWarning.
    df = df[df['distance (angstroms)'] <= distance_threshold].copy()

    df['residue_one_letter'] = df['resn'].map(aa_dict)
    df['residue'] = df['resi'].astype(str) + df['residue_one_letter']
    df['interacting_chain'] = df['interacting_chain'].map(chain_map)

    return df

# Build, show and collect one scatter figure per prediction file
def create_plots(df):
    """Plot the closest chain-A contacts of every file and return the figures.

    For each distinct value of ``df['file']`` (in order of first appearance)
    the rows with ``chain == 'A'`` are grouped by ``(resi, interacting_chain)``
    and only the row with the smallest ``distance (angstroms)`` per group is
    kept. Those rows are drawn with ``px.scatter`` (residue index on x,
    distance on y, coloured by interacting chain, labelled with the
    one-letter residue code); each figure is shown immediately.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``file``, ``chain``, ``resi``, ``interacting_chain``,
        ``distance (angstroms)`` and ``residue_one_letter``.

    Returns
    -------
    list
        The figure objects returned by ``px.scatter``, one per file.
    """
    distance_col = 'distance (angstroms)'

    # Fixed colours so a given partner looks the same in every figure
    chain_colors = {
        'kinesin B': '#1f77b4',      # blue
        'alpha-tubulin': '#ff7f0e',  # orange
        'beta-tubulin': '#2ca02c',   # green
        'ATP': '#d62728',            # red
    }
    axis_labels = {
        'resi': 'Residue Index',
        distance_col: 'Distance (angstroms)',
        'interacting_chain': 'Interacting Chain',
    }

    figures = []
    for file_id in df['file'].unique():
        file_rows = df[df['file'] == file_id]

        # Index labels of the shortest-distance row for each
        # (residue, partner) pair among the chain-A rows
        chain_a_rows = file_rows[file_rows['chain'] == 'A']
        closest_labels = (
            chain_a_rows
            .groupby(['resi', 'interacting_chain'])[distance_col]
            .idxmin()
        )
        closest = file_rows.loc[closest_labels]

        figure = px.scatter(
            closest,
            x='resi',
            y=distance_col,
            color='interacting_chain',
            text='residue_one_letter',
            color_discrete_map=chain_colors,
            labels=axis_labels,
            title=f"Interactions for {file_id}",
        )

        figure.update_traces(textposition='top center', marker={'size': 10, 'opacity': 0.6})
        # Only the y-axis range is pinned; the x-axis is left to autoscale
        # (a fixed xaxis_range=[0, 401] was previously used and is disabled).
        figure.update_layout(legend_title_text='Interacting Chain', yaxis_range=[1.5, 5.5])

        figure.show()
        figures.append(figure)

    return figures

# Largest distance (same unit as the 'distance (angstroms)' column) that still counts as a contact
distance_threshold = 5

# Load the CSV and keep only the rows within the threshold
df_processed = process_dataframe(
    csv_file_path=csv_file_path,
    distance_threshold=distance_threshold,
)

# Render one figure per file and keep the figure objects for later use
plots = create_plots(df=df_processed)



In [2]:
# Display the processed interaction table produced by process_dataframe
df_processed

Unnamed: 0,file,chain,resi,resn,atom_name,interacting_atom,interacting_resn,interacting_chain,interacting_resinumber,distance (angstroms),residue_one_letter,full_atom_name,interacting_full_atom_name,interactingresi_oneletter,residue
0,A,A,20,ARG,CB,N6,ATP,ATP,1,3.812164,R,Carbon beta,Nitrogen 6,?,20R
1,A,A,390,MET,CE,CB,ALA,kinesin B,374,4.839472,M,Carbon epsilon,Carbon beta,A,390M
2,A,A,281,THR,O,O,SER,beta-tubulin,420,4.738062,T,Oxygen,Oxygen,S,281T
3,A,E,1,ATP,O1B,N,THR,,99,3.176111,,Oxygen 1 beta,Nitrogen,T,
4,A,A,100,HIS,NE2,C6,ATP,ATP,1,3.924167,H,Nitrogen epsilon 2,Carbon 6,?,100H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29938,H,A,529,LYS,CE,CD1,LEU,kinesin B,528,4.207914,K,Carbon epsilon,Carbon delta 1,L,529K
29939,H,F,1,ADP,PA,C4',ADP,ATP,1,3.791272,,Phosphorus alpha,Carbon 4 prime,?,
29940,H,E,1,ATP,O1A,N,PHE,,93,2.869316,,Oxygen 1 alpha,Nitrogen,F,
29941,H,B,508,ASN,CB,NE1,TRP,beta-tubulin,397,4.689272,N,Carbon beta,Unknown,W,508N


In [5]:
# Summarise which chain-A residues contact which partner, per file
def extract_interacting_amino_acids(df):
    """Return the distinct (file, residue, partner) contacts of chain 'A'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``chain``, ``file``, ``resi``,
        ``residue_one_letter`` and ``interacting_chain``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``file``, ``resi``, ``aminoacid`` (from ``residue_one_letter``)
        and ``interacting_partner`` (from ``interacting_chain``), one row per
        distinct combination, sorted by ``file`` then ``resi`` and carrying a
        fresh 0..n-1 index.
    """
    wanted_columns = ['file', 'resi', 'residue_one_letter', 'interacting_chain']
    new_names = {
        'file': 'file',
        'residue_one_letter': 'aminoacid',
        'interacting_chain': 'interacting_partner',
    }

    is_chain_a = df['chain'] == 'A'
    return (
        df.loc[is_chain_a, wanted_columns]
        .drop_duplicates()                  # several atom pairs can hit the same residue/partner
        .rename(columns=new_names)
        .sort_values(by=['file', 'resi'])
        .reset_index(drop=True)             # discard the source row labels
    )

# Build the per-file table of chain-A residues and their partners, then display it
interacting_amino_acids = extract_interacting_amino_acids(df=df_processed)
interacting_amino_acids

Unnamed: 0,file,resi,aminoacid,interacting_partner
0,A,18,R,ATP
1,A,19,F,ATP
2,A,20,R,ATP
3,A,21,P,ATP
4,A,51,K,beta-tubulin
...,...,...,...,...
1126,H,575,I,kinesin B
1127,H,577,S,kinesin B
1128,H,578,L,kinesin B
1129,H,579,V,kinesin B


In [11]:
# Select the rows of interacting_amino_acids that belong to file 'A' and whose interacting_partner is 'beta-tubulin'.
# NOTE(review): the variable name A_ATP suggests an ATP filter, but the mask below matches 'beta-tubulin' - confirm which partner is intended.
A_ATP = interacting_amino_acids.loc[(interacting_amino_acids['file'] == 'A') & (interacting_amino_acids['interacting_partner'] == 'beta-tubulin') ]

A_ATP

Unnamed: 0,file,resi,aminoacid,interacting_partner
4,A,51,K,beta-tubulin
15,A,148,K,beta-tubulin
16,A,159,N,beta-tubulin
17,A,162,V,beta-tubulin
18,A,163,H,beta-tubulin
19,A,164,E,beta-tubulin
20,A,165,D,beta-tubulin
22,A,166,K,beta-tubulin
24,A,168,R,beta-tubulin
25,A,171,Y,beta-tubulin


In [6]:
# # New function to extract interacting amino acids and positions for each chain automatically
# def extract_interacting_amino_acids_per_chain(df):
#     # Create an empty dictionary to store DataFrames for each chain
#     chain_dataframes = {}

#     # Get all unique chains from the DataFrame (e.g., A, B, C, D, etc.)
#     unique_chains = df['chain'].unique()

#     # Loop through each chain and process interactions
#     for chain in unique_chains:
#         # Filter for rows where the current chain is interacting with other chains
#         df_chain = df[df['chain'] == chain].copy()  # Use .copy() to avoid modifying a view

#         # Select relevant columns: file, residue index, one-letter amino acid code, interacting chain
#         interactions = df_chain[['file', 'resi', 'residue_one_letter', 'interacting_chain']]

#         # Remove duplicates (in case there are multiple interactions at the same residue)
#         interactions = interactions.drop_duplicates()

#         # Rename columns to match your requirements
#         interactions.rename(columns={'file': 'file', 'residue_one_letter': 'aminoacid', 'interacting_chain': 'interacting_partner'}, inplace=True)

#         # Corrected line to sort by 'file' and 'resi'
#         interactions = interactions.sort_values(['file', 'resi'])

#         # Reset the index and delete the old index column
#         interactions.reset_index(drop=True, inplace=True)

#         # Store the DataFrame in the dictionary with the chain as the key
#         chain_dataframes[chain] = interactions

#     # Return the dictionary containing DataFrames for each chain
#     return chain_dataframes

# # Usage of the function
# interacting_amino_acids_per_chain = extract_interacting_amino_acids_per_chain(df_processed)

# # Dynamically access all DataFrames for all chains
# for chain, df_chain in interacting_amino_acids_per_chain.items():
#     print(f"Chain {chain}:")
#     print(df_chain.head())  # Display the first few rows of each chain's DataFrame


In [68]:
# Function to extract amino acid sequences and export them to a .fasta file
def extract_aminoacid_sequence_to_fasta(df, fasta_filename):
    """Write the interacting chain-A residues of each file as FASTA records.

    For every value of ``file`` the distinct chain-'A' residues present in
    ``df`` are ordered by ``resi`` and their one-letter codes concatenated.
    The result contains only the residues listed in ``df``, so it is not
    necessarily a contiguous protein sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``chain``, ``file``, ``resi`` and
        ``residue_one_letter``. It is not modified.
    fasta_filename : str or path-like
        Output path; the file is created or overwritten.

    Returns
    -------
    None
        One ``>file`` header line plus one sequence line is written per file,
        in sorted order of ``file``.
    """
    residues = (
        df.loc[df['chain'] == 'A', ['file', 'resi', 'residue_one_letter']]
        .drop_duplicates()                  # one entry per residue, however many contacts it has
        .sort_values(by=['file', 'resi'])   # sequence order follows the residue index
    )

    # A residue without a one-letter code is NaN, and ''.join would raise
    # TypeError on it; emit 'X' (unknown residue) so the export still succeeds.
    letters = residues['residue_one_letter'].fillna('X')

    # Concatenate the letters of each file, keeping the sorted residue order
    sequences = letters.groupby(residues['file']).agg(''.join)

    with open(fasta_filename, 'w') as fasta_file:
        for file_id, sequence in sequences.items():
            fasta_file.write(f">{file_id}\n")
            fasta_file.write(f"{sequence}\n")

# Destination of the FASTA export, relative to the notebook's working directory
fasta_filename = '../../data/interacting_aminoacid_sequences.fasta'

# Write one record per file containing its interacting chain-A residues
extract_aminoacid_sequence_to_fasta(df=df_processed, fasta_filename=fasta_filename)
