In [61]:
import json 
import logging
import copy
import sys
from pathlib import Path
import pandas as pd
import ast
import numpy as np

# Locate the project from the kernel's current working directory.
# NOTE(review): Path().resolve() is the working directory, not this notebook's
# own path, so the lookup below only lands on the project root when the kernel
# is started two levels below it — confirm that is how the notebook is launched.
current_file = Path().resolve()
project_root = current_file.parents[1]  # Go up to project root
# Both the project root and back_end/ are put at the front of sys.path so the
# two project modules imported below can be found. Order matters: these
# inserts must run before the imports.
sys.path.insert(0, str(project_root))
local_path = project_root / 'back_end'
sys.path.insert(0, str(local_path))

# Star imports: every public name from these modules lands in the notebook
# namespace (presumably including global_deprotection_dual used below — verify).
from main import *
from data_manipulation import *

# =========================================
# Load the configuration, all within the runfile
# =========================================

# change this to 4 or 5
multimer_size = 4
# Folder holding the four history CSVs for the chosen multimer size
output_dir = project_root / 'back_end' / 'data' / 'reaction_database' / f'multimer_size_{multimer_size}'
# Read the CSV files into DataFrames
ubiquitin_history = pd.read_csv(output_dir / "ubiquitin_history.csv")
reaction_history = pd.read_csv(output_dir / "reaction_history.csv")
donor_history = pd.read_csv(output_dir / "donor_history.csv")
context_history = pd.read_csv(output_dir / "context_history.csv")


# Pass the ubiquitin and context tables through the project's deprotection
# helper and keep its two return values under the same names.
ubiquitin_history, context_history = global_deprotection_dual(ubiquitin_history, context_history)












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [62]:
def validate_most_frequent_index(current_df):
    """
    Return the one value that occurs exactly three times in the 'index' column.

    Validates that:
    - There is exactly one most frequent value in 'index' column
    - That value appears exactly 3 times

    On success the value and its count are printed and the value is returned.

    Note: a later cell redefines this name with a variant that returns a
    (value, error_message) tuple instead of raising.

    Parameters:
        current_df: DataFrame with an 'index' column.

    Returns:
        The unique most frequent value of the 'index' column.

    Raises:
        KeyError: if the DataFrame has no 'index' column.
        ValueError: if the column is empty, if several values tie for the
            highest frequency, or if the highest frequency is not 3.
    """
    # Only the column lookup is guarded, so the ValueErrors raised below reach
    # the caller as documented instead of being re-wrapped by a blanket handler.
    try:
        value_counts = current_df['index'].value_counts()
    except KeyError as exc:
        raise KeyError("Column 'index' not found in the DataFrame.") from exc

    if value_counts.empty:
        raise ValueError("Column 'index' has no values to validate.")

    # value_counts() is sorted by descending frequency: the first entry is the top count
    most_frequent_count = value_counts.iloc[0]
    top_values = value_counts[value_counts == most_frequent_count]

    # Check that there's exactly one top value and count is 3
    if len(top_values) != 1:
        raise ValueError("There is not a unique most frequent value.")
    if most_frequent_count != 3:
        raise ValueError(f"Most frequent value count is {most_frequent_count}, expected 3.")

    # Passed all checks
    most_frequent_value = top_values.index[0]
    print(f"Most frequent value: {most_frequent_value}")
    print(f"Count: {most_frequent_count}")

    return most_frequent_value

In [63]:
multimer_size = 4
# Folder that holds the history CSVs for this multimer size
output_dir = project_root / 'back_end' / 'data' / 'reaction_database' / f'multimer_size_{multimer_size}'

# Load the four history tables into one dictionary keyed by table name;
# each table lives in "<table name>.csv" inside output_dir.
data_dict = {
    table_name: pd.read_csv(output_dir / f"{table_name}.csv")
    for table_name in (
        "ubiquitin_history",
        "reaction_history",
        "donor_history",
        "context_history",
    )
}


In [64]:
# NOTE(review): the same global_deprotection_dual call already runs at the end
# of the first setup cell, so on a top-to-bottom run both frames go through the
# helper twice. Confirm the helper is safe to apply to its own output, or drop
# one of the two calls.
ubiquitin_history, context_history = global_deprotection_dual(ubiquitin_history, context_history)











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [65]:
def filter_histories_by_number_of_SMAC(
    ubiquitin_history,
    reaction_history,
    donor_history,
    context_history,
    number_of_smac=0,
    column_offset=-2,
):
    """
    Select the history rows whose context has a given SMAC/ABOC lysine split.

    One column of context_history is inspected (chosen by column_offset). A row
    is kept when, in that column, its context has exactly `number_of_smac`
    entries under 'SMAC_lysines' and exactly
    (largest 'ABOC_lysines' count in that column) - number_of_smac
    entries under 'ABOC_lysines'. The surviving row labels are then used to
    slice all four history tables.

    Parameters:
    ----------
    ubiquitin_history : pd.DataFrame
        DataFrame containing ubiquitin JSON history.
    reaction_history : pd.DataFrame
        DataFrame containing reaction history.
    donor_history : pd.DataFrame
        DataFrame containing donor species history.
    context_history : pd.DataFrame
        DataFrame of context dictionaries. Cells of the inspected column are
        either dicts or the string form of a dict, with 'ABOC_lysines' and
        'SMAC_lysines' entries.
    number_of_smac : int
        Required number of SMAC lysines in the inspected context (default 0).
    column_offset : int
        Positional index of the context_history column to inspect; negative
        values count from the end (default -2).

    Returns:
    -------
    tuple of pd.DataFrame:
        Filtered versions of (ubiquitin_history, reaction_history,
        donor_history, context_history), all restricted to the same row labels.

    Raises:
    ------
    KeyError
        If a context lacks 'ABOC_lysines' or 'SMAC_lysines', or a selected row
        label is missing from one of the history tables.
    ValueError, SyntaxError
        If a string cell of the inspected column is not a valid Python literal.
    """

    def parse_context(cell):
        # Cells read back from CSV hold the string form of the dict; accept
        # cells that are already dicts as well.
        return cell if isinstance(cell, dict) else ast.literal_eval(cell)

    # Only the inspected column is parsed, and each cell is parsed once.
    contexts = context_history.iloc[:, column_offset].map(parse_context)
    aboc_counts = contexts.map(lambda context: len(context['ABOC_lysines']))
    smac_counts = contexts.map(lambda context: len(context['SMAC_lysines']))

    # The ABOC target is the column-wide maximum reduced by the requested SMAC count
    number_of_aboc = aboc_counts.max() - number_of_smac

    matches = (aboc_counts == number_of_aboc) & (smac_counts == number_of_smac)
    selected_indexes = aboc_counts[matches].index

    # Apply index filtering across all input histories
    return (
        ubiquitin_history.loc[selected_indexes],
        reaction_history.loc[selected_indexes],
        donor_history.loc[selected_indexes],
        context_history.loc[selected_indexes]
    )

In [66]:
# Narrow all four histories to the rows the filter accepts when no SMAC
# lysines are allowed.
(
    ubiquitin_history_filtered,
    reaction_history_filtered,
    donor_history_filtered,
    context_history_filtered,
) = filter_histories_by_number_of_SMAC(
    ubiquitin_history,
    reaction_history,
    donor_history,
    context_history,
    number_of_smac=0,
)

In [67]:
from tests.test_data import ubiquitin_library

# Invert the library so each stored value points back to its library key
reversed_ubiquitin_library = dict(
    zip(ubiquitin_library.values(), ubiquitin_library.keys())
)

# Swap every donor cell for its library key; a cell with no library entry is
# kept as it is
donors_mapped_df = donor_history_filtered.map(
    lambda cell: reversed_ubiquitin_library.get(cell, cell)
)

# Same substitution for the filtered ubiquitin history
ubiquitin_history_mapped_df = ubiquitin_history_filtered.map(
    lambda cell: reversed_ubiquitin_library.get(cell, cell)
)

# Blank out every column other than 'initial_acceptor'
cols_to_null = [
    column for column in ubiquitin_history_mapped_df.columns
    if column != 'initial_acceptor'
]
ubiquitin_history_mapped_df[cols_to_null] = np.nan

In [68]:
# Add the 'table_origin' column: label every row with the table it comes from,
# so donor, reaction and acceptor rows can still be told apart after the three
# frames are stacked into one table in a later cell.
donors_mapped_df['table_origin'] = 'Donors'
reaction_history_filtered['table_origin'] = 'Reactions'
ubiquitin_history_mapped_df['table_origin'] = 'Acceptor'

# Move 'table_origin' to the first column
def move_column_to_front(df, column_name):
    """
    Return `df` with `column_name` placed first.

    The remaining columns keep their relative order; the input frame is not
    modified. Selecting a `column_name` that is not in `df` raises KeyError.
    """
    ordered = [column_name]
    ordered.extend(label for label in df.columns if label != column_name)
    return df[ordered]

# Put 'table_origin' first in each of the three frames
donors_mapped_df, reaction_history_filtered, ubiquitin_history_mapped_df = (
    move_column_to_front(frame, 'table_origin')
    for frame in (donors_mapped_df, reaction_history_filtered, ubiquitin_history_mapped_df)
)

In [69]:
# Keep each frame's row labels as a regular column before stacking, so the
# original label of every row survives the renumbering done by concat.
donors_mapped_df = donors_mapped_df.reset_index(drop=False)
reaction_history_filtered = reaction_history_filtered.reset_index(drop=False)
ubiquitin_history_mapped_df = ubiquitin_history_mapped_df.reset_index(drop=False)

# Stack donors, reactions and acceptors into one long table with fresh row numbers
combined_df = pd.concat(
    (donors_mapped_df, reaction_history_filtered, ubiquitin_history_mapped_df),
    axis=0,
    ignore_index=True,
)

In [70]:
def validate_most_frequent_index(current_df):
    """
    Check that a single value dominates the 'index' column with exactly three
    occurrences, reporting problems as messages instead of raising.

    Returns:
        (value, None) when exactly one value has the highest frequency and
        that frequency is 3;
        (None, error_message) when several values tie, when the highest
        frequency is not 3, when the 'index' column is missing, or when any
        other exception occurs during the check.
    """
    try:
        occurrences = current_df['index'].value_counts()
        # value_counts() is sorted by descending frequency: first entry is the top count
        top_count = occurrences.iloc[0]
        leaders = occurrences[occurrences == top_count]

        if len(leaders) == 1 and top_count == 3:
            return leaders.index[0], None
        if len(leaders) != 1:
            return None, "Not a unique most frequent value."
        return None, f"Most frequent count = {top_count}, expected 3."

    except KeyError:
        return None, "Column 'index' not found in the DataFrame."
    except Exception as e:
        return None, f"Validation failed: {e}"

# Load the reference reaction summary from
# back_end/src/original_data/reaction_summeries/1mer__to_4_reaction_summary.csv
input_dir = project_root / 'back_end' / 'src' / 'original_data' / 'reaction_summeries'
original_data_df = pd.read_csv(input_dir / "1mer__to_4_reaction_summary.csv")

# Both tables must spell reaction names the same way before they are compared.
# Each mapping is applied in a single pass; cells without an alias are kept.
original_label_aliases = {
    "Ube13/Mms2_branching": "Ubc13/Mms2",
    "Ube13/Mms2": "Ubc13/Mms2",
    "Fake_Wash": "FAKE_deprot",
    # Same Ubc2K -> Ube2K alias as for combined_df, so the two tables cannot
    # drift apart on this name.
    "Ubc2K": "Ube2K",
}
combined_label_aliases = {
    "Ubc13/Mms2 (branching)": "Ubc13/Mms2",
    # Previously this step repeated the Ubc13/Mms2 replacement by mistake, so
    # the intended Ubc2K -> Ube2K replacement never ran.
    "Ubc2K": "Ube2K",
}
original_data_df = original_data_df.map(lambda cell: original_label_aliases.get(cell, cell))
combined_df = combined_df.map(lambda cell: combined_label_aliases.get(cell, cell))

# Matched history indexes, in reference order, and any problems found on the way
indexed_values = []
validation_errors = []

# Walk the first 14 reference entries. Each entry spans two consecutive rows:
# the first holds the initial acceptor and the donors, the second the reactions.
for i in range(14):
    try:
        # Acceptor and donors (even row of the pair)
        initial_acceptor = original_data_df.iloc[i * 2, 0]
        dimer_formation_donor = original_data_df.iloc[i * 2, 1]
        trimer_formation_donor = original_data_df.iloc[i * 2, 3]
        tetramer_formation_donor = original_data_df.iloc[i * 2, 5]

        # Reactions (odd row of the pair)
        dimer_formation_reaction = original_data_df.iloc[i * 2 + 1, 1]
        dimer_deprotection_reaction = original_data_df.iloc[i * 2 + 1, 2]
        trimer_formation_reaction = original_data_df.iloc[i * 2 + 1, 3]
        trimer_deprotection_reaction = original_data_df.iloc[i * 2 + 1, 4]
        tetramer_formation_reaction = original_data_df.iloc[i * 2 + 1, 5]

        # Collect every 'Reactions', 'Donors' and 'Acceptor' row of combined_df
        # that agrees with this reference entry.
        current_df = combined_df[
            (
                (combined_df['dimer_formation'] == dimer_formation_reaction) &
                (combined_df['dimer_deprotection'] == dimer_deprotection_reaction) &
                (combined_df['trimer_formation'] == trimer_formation_reaction) &
                (combined_df['trimer_deprotection'] == trimer_deprotection_reaction) &
                (combined_df['tetramer_formation'] == tetramer_formation_reaction) &
                (combined_df['table_origin'] == 'Reactions')
            ) |
            (
                (combined_df['dimer_formation'] == dimer_formation_donor) &
                (combined_df['trimer_formation'] == trimer_formation_donor) &
                (combined_df['tetramer_formation'] == tetramer_formation_donor) &
                (combined_df['table_origin'] == 'Donors')
            ) |
            (
                (combined_df['initial_acceptor'] == initial_acceptor) &
                (combined_df['table_origin'] == 'Acceptor')
            )
        ]

        # A history index is accepted only when it is the single index that
        # appears three times among the matching rows.
        most_frequent_value, error = validate_most_frequent_index(current_df)

        if error:
            validation_errors.append((i, error))
        else:
            indexed_values.append(int(most_frequent_value))

    except Exception as e:
        # Best effort: record the failure and carry on with the next entry
        validation_errors.append((i, f"Unexpected error: {e}"))

# Report any validation issues after all iterations
if validation_errors:
    print("\nValidation Errors:")
    for i, msg in validation_errors:
        print(f" - Row {i}: {msg}")

In [None]:
# Bring the row labels of the filtered histories into an 'index' column so they
# can be matched against indexed_values.
ubiquitin_history_df = ubiquitin_history_filtered.reset_index()
context_history_df = context_history_filtered.reset_index()

# Filter rows where 'index' is in indexed_values
# NOTE(review): combined_filtered_df and the filtered context_history_df are
# not read again in this cell; the reordering below starts from combined_df.
combined_filtered_df = combined_df[combined_df['index'].isin(indexed_values)]
ubiquitin_history_df = ubiquitin_history_df[ubiquitin_history_df['index'].isin(indexed_values)]
context_history_df = context_history_df[context_history_df['index'].isin(indexed_values)]

# reorder index based on order of indexed_values
# (.loc with the list both keeps only the listed labels and puts the rows in
# list order; a label missing from the frame raises KeyError)
combined_df = combined_df.set_index('index').loc[indexed_values].reset_index()
ubiquitin_history_df = ubiquitin_history_df.set_index('index').loc[indexed_values].reset_index()

# remove index column
# NOTE(review): combined_df is rebound without its 'index' column here, so
# running this cell a second time without rebuilding combined_df fails on the
# combined_df['index'] lookup above. 'final_multimer' must exist in
# combined_df or this drop raises KeyError.
combined_df = combined_df.drop(columns=['index', 'final_multimer'])
ubiquitin_history_df = ubiquitin_history_df.drop(columns=['index'])

# Save the final DataFrame to a CSV file
new_output_dir = project_root / 'back_end' / 'src' / 'confirmation_data' 

# Create the directory if it doesn't exist
new_output_dir.mkdir(parents=True, exist_ok=True)

# Write the reordered reaction table and the matching ubiquitin history,
# without the pandas row index, under names that carry the multimer size.
combined_df.to_csv(new_output_dir / f'multimer_size_{multimer_size}_reaction_database.csv', index=False)
ubiquitin_history_df.to_csv(new_output_dir / f'multimer_size_{multimer_size}_multimer_database.csv', index=False)

In [72]:
# Display the combined table as the cell output (this renders the whole frame;
# combined_df.head() would give a short preview instead).
combined_df

Unnamed: 0,table_origin,initial_acceptor,dimer_formation,dimer_deprotection,trimer_formation,trimer_deprotection,tetramer_formation,final_multimer
0,Donors,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,
1,Reactions,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,
2,Acceptor,histag_ubi_ubq_1_K63_aboc,,,,,,
3,Donors,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC,
4,Reactions,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,SMAC_deprot,Ubc13/Mms2,
5,Acceptor,histag_ubi_ubq_1_K63_aboc,,,,,,
6,Donors,,ubi_ubq_1_K48_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,
7,Reactions,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,FAKE_deprot,Ubc13/Mms2,
8,Acceptor,histag_ubi_ubq_1_K63_aboc,,,,,,
9,Donors,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,
