In [1]:
import json 
import logging
import copy
import sys
from pathlib import Path
import pandas as pd
import ast
import numpy as np

# Locate the project root and make the project's modules importable.
# Path().resolve() is the kernel's current working directory (not the notebook
# file itself), so everything below depends on where the kernel was started.
current_file = Path().resolve()
# parents[1] is the grandparent of the working directory.
# NOTE(review): presumably the notebook lives two levels below the project root - confirm.
project_root = current_file.parents[1]  # Go up to project root
sys.path.insert(0, str(project_root))
local_path = project_root / 'back_end'
# back_end is inserted last, so it ends up first on sys.path and is searched
# before project_root by the imports below.
sys.path.insert(0, str(local_path))

# Star imports: the helpers used by later cells (e.g. global_deprotection_dual)
# presumably come from one of these two modules - TODO confirm and import by name.
from main import *
from data_manipulation import *

# Size of the multimers whose reaction database is analysed in this notebook
multimer_size = 4
# Directory of the reaction database for that multimer size
output_dir = project_root / 'back_end' / 'data' / 'reaction_database' / f'multimer_size_{multimer_size}'











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [2]:
multimer_size = 4
# Directory of the reaction database for this multimer size
output_dir = project_root / 'back_end' / 'data' / 'reaction_database' / f'multimer_size_{multimer_size}'

# Read every history table from "<name>.csv" in output_dir, keyed by table name.
history_names = (
    "ubiquitin_history",
    "reaction_history",
    "donor_history",
    "context_history",
)
data_dict = {name: pd.read_csv(output_dir / f"{name}.csv") for name in history_names}

# Bind each table to the bare name that the following cells use. Without these
# assignments those cells only work when the names happen to be left over from
# earlier kernel state, and fail with NameError after Restart & Run All.
ubiquitin_history = data_dict['ubiquitin_history']
reaction_history = data_dict['reaction_history']
donor_history = data_dict['donor_history']
context_history = data_dict['context_history']


In [3]:
# Pass the ubiquitin and context histories through global_deprotection_dual and
# rebind both names to what it returns.
# NOTE(review): both names must already be bound when this cell runs, and the
# cell overwrites its own inputs, so running it twice feeds already-processed
# tables back into the helper - confirm that this is harmless.
ubiquitin_history, context_history = global_deprotection_dual(ubiquitin_history, context_history)











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [4]:
def filter_histories_by_number_of_SMAC(
    ubiquitin_history,
    reaction_history,
    donor_history,
    context_history,
    number_of_smac=0,
    column_offset=-2,
):
    """
    Keep the rows whose context has a given split of SMAC and ABOC lysines.

    One column of ``context_history`` is inspected. Each of its cells must be
    the string form of a dict with 'ABOC_lysines' and 'SMAC_lysines' entries.
    A row is selected when that cell has exactly ``number_of_smac`` SMAC
    lysines and ``max_aboc - number_of_smac`` ABOC lysines, where ``max_aboc``
    is the largest ABOC count found in the inspected column.

    Parameters:
    ----------
    ubiquitin_history : pd.DataFrame
        DataFrame containing ubiquitin JSON history.
    reaction_history : pd.DataFrame
        DataFrame containing reaction history.
    donor_history : pd.DataFrame
        DataFrame containing donor species history.
    context_history : pd.DataFrame
        DataFrame of context dictionaries (stored as strings).
    number_of_smac : int
        Required number of SMAC lysines in the inspected cell (default 0).
    column_offset : int
        Position of the column of ``context_history`` to inspect, as accepted
        by ``DataFrame.iloc`` (default -2, the second-to-last column).

    Returns:
    -------
    tuple of pd.DataFrame:
        (ubiquitin_history, reaction_history, donor_history, context_history)
        restricted, via ``.loc``, to the index labels of the selected rows.

    Raises:
    ------
    ValueError, SyntaxError
        If a cell of the inspected column is not a valid Python literal.
    KeyError
        If a parsed cell lacks one of the two lysine entries, or a selected
        index label is missing from one of the other histories.
    IndexError
        If ``column_offset`` is out of range for ``context_history``.
    """
    # Only the inspected column decides the outcome, so parse just that column,
    # and parse each cell once for both counts.
    parsed_contexts = context_history.iloc[:, column_offset].map(ast.literal_eval)
    aboc_counts = parsed_contexts.map(lambda context: len(context['ABOC_lysines']))
    smac_counts = parsed_contexts.map(lambda context: len(context['SMAC_lysines']))

    # Each requested SMAC lysine lowers the required ABOC count by one,
    # starting from the largest ABOC count seen in the column.
    number_of_aboc = aboc_counts.max() - number_of_smac

    keep = (aboc_counts == number_of_aboc) & (smac_counts == number_of_smac)
    selected_indexes = aboc_counts[keep].index

    # Apply the same index labels to all four histories so they stay aligned.
    return (
        ubiquitin_history.loc[selected_indexes],
        reaction_history.loc[selected_indexes],
        donor_history.loc[selected_indexes],
        context_history.loc[selected_indexes]
    )

In [5]:
# Keep only the rows with no SMAC lysines, applying the same row selection to
# all four history tables.
(
    ubiquitin_history_filtered,
    reaction_history_filtered,
    donor_history_filtered,
    context_history_filtered,
) = filter_histories_by_number_of_SMAC(
    ubiquitin_history=ubiquitin_history,
    reaction_history=reaction_history,
    donor_history=donor_history,
    context_history=context_history,
    number_of_smac=0,
)

In [6]:
from tests.test_data import ubiquitin_library

# Invert the library so that each stored value looks up the key it is filed under
reversed_ubiquitin_library = {entry: label for label, entry in ubiquitin_library.items()}

# Replace every donor cell that is a known library value by its library key;
# cells that are not in the library are passed through unchanged
donors_mapped_df = donor_history_filtered.map(
    lambda cell: reversed_ubiquitin_library.get(cell, cell)
)

# Same replacement for the ubiquitin history
ubiquitin_history_mapped_df = ubiquitin_history_filtered.map(
    lambda cell: reversed_ubiquitin_library.get(cell, cell)
)

# Only 'initial_acceptor' is kept; every other column is blanked to NaN
cols_to_null = list(
    filter(lambda name: name != 'initial_acceptor', ubiquitin_history_mapped_df.columns)
)
ubiquitin_history_mapped_df[cols_to_null] = np.nan

In [7]:
# Label every row with the table it comes from
for frame, origin in (
    (donors_mapped_df, 'Donors'),
    (reaction_history_filtered, 'Reactions'),
    (ubiquitin_history_mapped_df, 'Acceptor'),
):
    frame['table_origin'] = origin

# Move 'table_origin' to the first column
def move_column_to_front(df, column_name):
    """Return ``df`` with ``column_name`` as its first column.

    The remaining columns keep their relative order. The result is produced by
    column selection, so ``df`` itself is left as it was.
    """
    trailing = [name for name in df.columns if name != column_name]
    return df[[column_name, *trailing]]

# Put 'table_origin' first in each of the three tables
donors_mapped_df, reaction_history_filtered, ubiquitin_history_mapped_df = (
    move_column_to_front(frame, 'table_origin')
    for frame in (donors_mapped_df, reaction_history_filtered, ubiquitin_history_mapped_df)
)

In [8]:
# Expose each table's row labels as an 'index' column, so that rows sharing a
# label can still be recognised once the three tables are stacked.
# The guard keeps this cell re-runnable: calling reset_index(drop=False) again
# on a frame that already has an 'index' column would add a 'level_0' column.
for frame in (donors_mapped_df, reaction_history_filtered, ubiquitin_history_mapped_df):
    if 'index' not in frame.columns:
        frame.reset_index(drop=False, inplace=True)

# Stack donors, reactions and acceptors into one table with a fresh row index
combined_df = pd.concat(
    [donors_mapped_df, reaction_history_filtered, ubiquitin_history_mapped_df],
    axis=0,
    ignore_index=True
)

In [9]:
# Load the reference synthesis table for 1-mer -> 4-mer syntheses from
# back_end/src/original_data/reaction_summeries/1mer__to_4_reaction_summary.csv
input_dir = project_root / 'back_end' / 'src' / 'original_data' / 'reaction_summeries'
original_data_df = pd.read_csv(input_dir / "1mer__to_4_reaction_summary.csv")

# Bring both tables onto one label vocabulary so that cells can be compared with ==
original_label_fixes = {
    "Ube13/Mms2_branching": "Ubc13/Mms2",
    "Ube13/Mms2": "Ubc13/Mms2",
    "Fake_Wash": "FAKE_deprot",
}
combined_label_fixes = {
    "Ubc13/Mms2 (branching)": "Ubc13/Mms2",
    "Ubc2K": "Ube2K",
}
original_data_df = original_data_df.map(lambda x: original_label_fixes.get(x, x))
combined_df = combined_df.map(lambda x: combined_label_fixes.get(x, x))

# combined_df columns in the order the reference reaction row lists them
# (reference columns 1..5). Donor rows only fill the *_formation positions.
reaction_steps = [
    'dimer_formation',
    'dimer_deprotection',
    'trimer_formation',
    'trimer_deprotection',
    'tetramer_formation',
]

# A history is a full match when its Donors, Reactions and Acceptor rows all match
expected_table_matches = 3

# History index found for each reference synthesis, in reference order
indexed_values = []

# Each reference synthesis takes two consecutive rows: the donor row (acceptor
# in column 0) followed by the reaction row.
# NOTE(review): 14 is presumably the number of syntheses in the file - confirm.
for i in range(14):
    donor_row = i * 2
    reaction_row = donor_row + 1

    # Get the acceptor from the original data
    initial_acceptor = original_data_df.iloc[donor_row, 0]

    # Reactions rows must agree with the reference on every step
    reaction_match = combined_df['table_origin'] == 'Reactions'
    for position, step in enumerate(reaction_steps, start=1):
        reaction_match &= combined_df[step] == original_data_df.iloc[reaction_row, position]

    # Donors rows must agree with the reference on every formation step
    donor_match = combined_df['table_origin'] == 'Donors'
    for position, step in enumerate(reaction_steps, start=1):
        if step.endswith('_formation'):
            donor_match &= combined_df[step] == original_data_df.iloc[donor_row, position]

    # Acceptor rows must carry the same initial acceptor
    acceptor_match = (
        (combined_df['table_origin'] == 'Acceptor')
        & (combined_df['initial_acceptor'] == initial_acceptor)
    )

    current_df = combined_df[reaction_match | donor_match | acceptor_match]

    # The history index that matched in the most tables comes first
    value_counts = current_df['index'].value_counts()
    best_count = 0 if value_counts.empty else int(value_counts.iloc[0])
    if best_count < expected_table_matches:
        # Fail loudly instead of recording an index that only partly matches
        raise ValueError(
            f"Reference synthesis {i} (rows {donor_row} and {reaction_row}) has no history "
            f"matching in all {expected_table_matches} tables; best candidate matched {best_count}."
        )

    most_frequent_value = value_counts.index[0]
    most_frequent_count = value_counts.iloc[0]

    print(f"Most frequent value: {most_frequent_value}")
    print(f"Count: {most_frequent_count}")

    # Append the most frequent value to the list
    indexed_values.append(int(most_frequent_value))

# Show the candidate rows of the last reference synthesis
current_df

Most frequent value: 423
Count: 3
Most frequent value: 427
Count: 3
Most frequent value: 363
Count: 3
Most frequent value: 31
Count: 3
Most frequent value: 443
Count: 3
Most frequent value: 447
Count: 3
Most frequent value: 95
Count: 3
Most frequent value: 143
Count: 3
Most frequent value: 191
Count: 3
Most frequent value: 315
Count: 3
Most frequent value: 319
Count: 3
Most frequent value: 279
Count: 3
Most frequent value: 335
Count: 3
Most frequent value: 339
Count: 3


Unnamed: 0,index,table_origin,initial_acceptor,dimer_formation,dimer_deprotection,trimer_formation,trimer_deprotection,tetramer_formation,final_multimer
18,339,Donors,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC,
25,447,Donors,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC,
44,339,Reactions,,Ubc13/Mms2,SMAC_deprot,Ubc13/Mms2,SMAC_deprot,Ubc13/Mms2,
64,255,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,
65,279,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,
66,299,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,
67,315,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,
68,319,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,
69,335,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,
70,339,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,


In [10]:
# Show every combined_df row whose 'index' column equals 31; the saved output
# has one Donors, one Reactions and one Acceptor row for it.
combined_df[combined_df['index'] == 31]

Unnamed: 0,index,table_origin,initial_acceptor,dimer_formation,dimer_deprotection,trimer_formation,trimer_deprotection,tetramer_formation,final_multimer
0,31,Donors,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,
26,31,Reactions,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,FAKE_deprot,Ubc13/Mms2,
52,31,Acceptor,histag_ubi_ubq_1,,,,,,


In [11]:
# Display the reference synthesis table after label harmonisation; rows
# alternate between a donor row (acceptor in column 0) and a reaction row.
original_data_df

Unnamed: 0,0,1,2,3,4,5
0,histag_ubi_ubq_1_K63_aboc,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC
1,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2
2,histag_ubi_ubq_1_K63_aboc,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC
3,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,SMAC_deprot,Ubc13/Mms2
4,histag_ubi_ubq_1_K63_aboc,ubi_ubq_1_K48_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC
5,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,FAKE_deprot,Ubc13/Mms2
6,histag_ubi_ubq_1,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC
7,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,FAKE_deprot,Ubc13/Mms2
8,histag_ubi_ubq_1_K63_aboc,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC
9,,gp78/Ube2g2,SMAC_deprot,Ubc13/Mms2,SMAC_deprot,Ube2K


In [12]:
# Load the reference synthesis table for 1-mer -> 5-mer syntheses from
# back_end/src/original_data/reaction_summeries/1mer__to_5_reaction_summary.csv
input_dir = project_root / 'back_end' / 'src' / 'original_data' / 'reaction_summeries'
original_data_df = pd.read_csv(input_dir / "1mer__to_5_reaction_summary.csv")

# Bring both tables onto one label vocabulary so that cells can be compared with ==
original_label_fixes = {
    "Ube13/Mms2_branching": "Ubc13/Mms2",
    "Ube13/Mms2": "Ubc13/Mms2",
    "Fake_Wash": "FAKE_deprot",
}
combined_label_fixes = {
    "Ubc13/Mms2 (branching)": "Ubc13/Mms2",
    "Ubc2K": "Ube2K",
}
original_data_df = original_data_df.map(lambda x: original_label_fixes.get(x, x))
combined_df = combined_df.map(lambda x: combined_label_fixes.get(x, x))

# combined_df columns in the order the reference reaction row lists them
# (reference columns 1..7). Donor rows only fill the *_formation positions.
reaction_steps = [
    'dimer_formation',
    'dimer_deprotection',
    'trimer_formation',
    'trimer_deprotection',
    'tetramer_formation',
    'tetramer_deprotection',
    'pentamer_formation',
]

# Pentamer matching needs the tetramer_deprotection and pentamer_formation
# columns; a combined_df built from 4-mer histories does not have them.
missing_steps = [step for step in reaction_steps if step not in combined_df.columns]
if missing_steps:
    raise KeyError(
        f"combined_df has no column(s) {missing_steps}; rebuild combined_df from the "
        "5-mer reaction database before running this cell."
    )

# A history is a full match when its Donors, Reactions and Acceptor rows all match
expected_table_matches = 3

# History index found for each reference synthesis, in reference order
indexed_values = []

# Each reference synthesis takes two consecutive rows: the donor row (acceptor
# in column 0) followed by the reaction row.
# NOTE(review): 42 is presumably the number of syntheses in the file - confirm.
for i in range(42):
    donor_row = i * 2
    reaction_row = donor_row + 1

    # Get the acceptor from the original data
    initial_acceptor = original_data_df.iloc[donor_row, 0]

    # Reactions rows must agree with the reference on every step
    reaction_match = combined_df['table_origin'] == 'Reactions'
    for position, step in enumerate(reaction_steps, start=1):
        reaction_match &= combined_df[step] == original_data_df.iloc[reaction_row, position]

    # Donors rows must agree with the reference on every formation step
    donor_match = combined_df['table_origin'] == 'Donors'
    for position, step in enumerate(reaction_steps, start=1):
        if step.endswith('_formation'):
            donor_match &= combined_df[step] == original_data_df.iloc[donor_row, position]

    # Acceptor rows must carry the same initial acceptor
    acceptor_match = (
        (combined_df['table_origin'] == 'Acceptor')
        & (combined_df['initial_acceptor'] == initial_acceptor)
    )

    current_df = combined_df[reaction_match | donor_match | acceptor_match]

    # The history index that matched in the most tables comes first
    value_counts = current_df['index'].value_counts()
    best_count = 0 if value_counts.empty else int(value_counts.iloc[0])
    if best_count < expected_table_matches:
        # Fail loudly instead of recording an index that only partly matches
        raise ValueError(
            f"Reference synthesis {i} (rows {donor_row} and {reaction_row}) has no history "
            f"matching in all {expected_table_matches} tables; best candidate matched {best_count}."
        )

    most_frequent_value = value_counts.index[0]
    most_frequent_count = value_counts.iloc[0]

    print(f"Most frequent value: {most_frequent_value}")
    print(f"Count: {most_frequent_count}")

    # Append the most frequent value to the list
    indexed_values.append(int(most_frequent_value))

# Show the candidate rows of the last reference synthesis
current_df

KeyError: 'tetramer_deprotection'