In [12]:
import pandas as pd

# Path to the per-file interaction table (relative to this notebook)
csv_file_path = "../../Jazzer_surf/3d_predictions/chimeras/test/interactions_all_files.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Among the rows whose `chain` is 'A', keep one row per (file, resi, interacting_chain):
# the row with the smallest distance.
closest_idx = (
    df[df['chain'] == 'A']
    .groupby(['file', 'resi', 'interacting_chain'])['distance (angstroms)']
    .idxmin()
)
df = df.loc[closest_idx]

# Exclude interactions with Mg and with chains A, G and H
df = df[(df['interacting_resn'] != 'MG') & (~df['interacting_chain'].isin(['A', 'G', 'H']))].reset_index(drop=True)

# Map the three-letter residue name in `resn` (the chain-A residue of each row)
# to its single-letter code; names missing from the table become NaN.
aa_dict = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
    'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
    'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}
df['residue_one_letter'] = df['resn'].map(aa_dict)
df['residue'] = df['resi'].astype(str) + df['residue_one_letter']

# Replace chain letters with descriptive names (letters missing from the map become NaN)
chain_map = {
    'B': 'kinesin B',
    'C': 'alpha-tubulin',
    'D': 'beta-tubulin',
    'E': 'ATP',
    'F': 'ATP'
}
df['interacting_chain'] = df['interacting_chain'].map(chain_map)

# Filter to specific files
df = df.loc[df['file'].isin([
    'A',
    'C',
    # 'H',
    ])]

# Build the summary table without a Python-level row loop:
#   * rows whose chain has no descriptive name (NaN after the mapping) are left out,
#   * rows are ordered by file, then interacting chain; rows of the same pair keep
#     their existing relative order,
#   * the columns are selected and renamed for the plotting cells.
# Unlike a list of per-row dicts, this keeps the column header even when no rows remain,
# so the CSV written below can always be read back.
interaction_df = (
    df.dropna(subset=['interacting_chain'])
    .sort_values(['file', 'interacting_chain'])
    [['file', 'interacting_chain', 'interacting_atom', 'resi', 'residue_one_letter', 'distance (angstroms)']]
    .rename(columns={'resi': 'residue_number', 'residue_one_letter': 'amino_acid'})
    .reset_index(drop=True)
)

# Persist the summary for the plotting cells
interaction_df.to_csv("interaction_summary.csv", index=False)

# Display the resulting DataFrame (must be the last expression of the cell to render)
interaction_df


In [13]:
import pandas as pd
import plotly.express as px


def residue_position(label):
    """Return the integer formed by the digit characters of a residue label."""
    return int(''.join(ch for ch in label if ch.isdigit()))


# Load the summary, add an x-axis label (residue number followed by the amino-acid
# letter) and order the rows by file and residue number.
csv_file_path = "interaction_summary.csv"
interaction_df = (
    pd.read_csv(csv_file_path)
    .assign(residue_label=lambda table: table['residue_number'].astype(str) + table['amino_acid'])
    .sort_values(by=['file', 'residue_number'])
)

# Turn the label into an ordered categorical whose categories follow the numeric
# part of the label.
ordered_labels = sorted(interaction_df['residue_label'].unique(), key=residue_position)
interaction_df['residue_label'] = pd.Categorical(
    interaction_df['residue_label'],
    categories=ordered_labels,
    ordered=True,
)

# Human-readable axis / legend titles
axis_titles = {
    'residue_label': 'Residue (Number and Identity)',
    'distance (angstroms)': 'Distance (angstroms)',
    'interacting_chain': 'Interacting Chain',
}

# Scatter plot of interaction distances, one colour per interacting chain
fig_scatter = px.scatter(
    interaction_df,
    x='residue_label',
    y='distance (angstroms)',
    color='interacting_chain',
    text='interacting_atom',
    title='Interaction Distances by Residue and Interacting Chain',
    labels=axis_titles,
    hover_data=['file'],
)

fig_scatter.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))

# Pin the x-axis category order to the categorical's order
fig_scatter.update_layout(
    xaxis=dict(
        categoryorder='array',
        categoryarray=interaction_df['residue_label'].cat.categories,
    )
)

# Show the plot
fig_scatter.show()


In [14]:
import pandas as pd
import plotly.express as px

# NOTE: this cell does not reload interaction_summary.csv; it expects `interaction_df`
# to be in memory already.

# Create a new column for the x-axis labels by concatenating residue number and amino acid
interaction_df['residue_label'] = interaction_df['residue_number'].astype(str) + interaction_df['amino_acid']

# Sort the dataframe by file and residue number to maintain order
interaction_df = interaction_df.sort_values(by=['file', 'residue_number'])

# Convert residue_label to an ordered categorical; categories follow the numeric part of the label
residue_order = sorted(
    interaction_df['residue_label'].unique(),
    key=lambda label: int(''.join(filter(str.isdigit, label))),
)
interaction_df['residue_label'] = pd.Categorical(
    interaction_df['residue_label'],
    categories=residue_order,
    ordered=True,
)

# Define a color map for interacting chains
color_map = {
    'kinesin B': 'blue',
    'alpha-tubulin': 'green',
    'beta-tubulin': 'orange',
    'ATP': 'yellow'
}
# Tick colour used when a residue's chain is missing or not listed in color_map
fallback_color = 'black'

# Scatter plot for interaction distances, coloring by file
fig_scatter = px.scatter(interaction_df, x='residue_label', y='distance (angstroms)', color='file', text='interacting_atom',
                         title='Interaction Distances by Residue and File',
                         labels={'residue_label': 'Residue (Number and Identity)', 'distance (angstroms)': 'Distance (angstroms)', 'file': 'File'},
                         hover_data=['interacting_chain'])

fig_scatter.update_traces(textposition='top center', marker=dict(size=10, opacity=0.6))

# Interacting chain of the first row (in the sorted frame) for each residue label,
# computed once instead of re-filtering the frame for every tick.
first_rows = interaction_df.drop_duplicates(subset='residue_label')
chain_by_residue = dict(zip(first_rows['residue_label'], first_rows['interacting_chain']))

tick_labels = [
    f"<span style='color:{color_map.get(chain_by_residue[res], fallback_color)}'>{res}</span>"
    for res in residue_order
]

# Custom x-axis with colored labels.
# The category order is pinned explicitly: the figure has one trace per file, and without
# categoryorder/categoryarray the axis orders categories by first appearance in the traces,
# so integer tick position i would not necessarily be residue_order[i].
fig_scatter.update_layout(
    xaxis=dict(
        categoryorder='array',
        categoryarray=residue_order,
        tickmode='array',
        tickvals=list(range(len(residue_order))),
        ticktext=tick_labels,
        tickangle=45
    )
)

# Show the plot
fig_scatter.show()
