In [210]:

import sys
from pathlib import Path
import pandas as pd

# Locate this notebook/script on disk. A Jupyter kernel does not define
# __file__, so the NameError branch falls back to the working directory.
# NOTE(review): Path(__file__) is a file while Path.cwd() is a directory, so
# parents[1] below lands one level closer to the file in the __file__ case
# than in the cwd case -- confirm both layouts are intended.
try:
    current_path = Path(__file__).resolve()
except NameError:
    current_path = Path.cwd()

# The project root is two levels above current_path when the path has at
# least three ancestors; otherwise current_path itself is used.
if len(current_path.parents) >= 3:
    project_root = current_path.parents[1]
else:
    project_root = current_path
local_path = project_root / 'back_end'

# Prepend both directories to sys.path, skipping any that are already listed.
# local_path is handled last, so it ends up ahead of project_root.
for _search_dir in (str(project_root), str(local_path)):
    if _search_dir not in sys.path:
        sys.path.insert(0, _search_dir)

# Import project modules (will work if path is correct)
import src.utils.utils
import src.utils.logging_utils
import src.main as main
import src.plotting as plotting

# Linkage description of the target multimer: every entry carries a "from"
# index, a "to" index and the name of the linkage site between them.
# NOTE(review): presumably one entry per ubiquitin-ubiquitin bond, which would
# explain why the multimer size is the number of entries plus one -- confirm.
jsonOutput = [
    {"from": 0, "to": 2, "linkage": "K63"},
    {"from": 2, "to": 4, "linkage": "K48"},
    {"from": 4, "to": 7, "linkage": "K48"},
    {"from": 7, "to": 11, "linkage": "K48"},
]
multimer_size = 1 + len(jsonOutput)

# Function to load data
def download_data_dict(multimer_size, project_dir=None):
    """
    Load the filtered reaction database tables for one multimer size.

    Despite the name, nothing is downloaded: five CSV files are read from
    ``<project_dir>/back_end/data/filtered_reaction_database/multimer_size_<multimer_size>/``
    with their first column used as the index.

    Args:
        multimer_size: Value interpolated into the ``multimer_size_<n>``
            directory name.
        project_dir (str | Path | None): Directory that contains ``back_end``.
            When omitted, the notebook-level ``project_root`` is used, which
            keeps existing ``download_data_dict(multimer_size)`` calls working.

    Returns:
        dict: Maps 'combined_database', 'context_history', 'donor_history',
        'reaction_history' and 'ubiquitin_history' (in that order) to the
        DataFrame read from the CSV file of the same name.

    Raises:
        FileNotFoundError: If the directory for ``multimer_size`` does not
            exist, or if one of the CSV files is missing from it.
    """
    table_names = (
        'combined_database',
        'context_history',
        'donor_history',
        'reaction_history',
        'ubiquitin_history',
    )

    # Look up the notebook global only when the caller did not pass a root,
    # so the function can be used (and tested) without hidden kernel state.
    base_dir = project_root if project_dir is None else Path(project_dir)
    input_dir = base_dir / 'back_end' / 'data' / 'filtered_reaction_database' / f'multimer_size_{multimer_size}'

    # Fail early with the offending directory instead of a per-file error.
    if not input_dir.is_dir():
        raise FileNotFoundError(f"No filtered reaction database found at {input_dir}")

    return {
        name: pd.read_csv(input_dir / f'{name}.csv', index_col=0)
        for name in table_names
    }

# Read every table once, then bind each one to its own notebook-level name
data_dict = download_data_dict(multimer_size)
(
    combined_database,
    context_history,
    donor_history,
    reaction_history,
    ubiquitin_history,
) = (
    data_dict[table_name]
    for table_name in (
        'combined_database',
        'context_history',
        'donor_history',
        'reaction_history',
        'ubiquitin_history',
    )
)

In [None]:
def ubiquitin_building_wo_iterate(
    parent_dictionary: dict | str,
    ubi_molecule_to_add: dict | str,
    bonding_ubiquitin_number: int,
    new_ubiquitin_number: int,
    lysine_residue: str
) -> tuple[dict, dict]:
    """
    Entry point for building a ubiquitin chain by attaching a molecule or protecting group.

    Args:
        parent_dictionary (dict or str): Input protein structure; passed
            through convert_json_to_dict before use.
        ubi_molecule_to_add (dict or str): Ubiquitin (dict, or a string that
            convert_json_to_dict turns into a dict) or one of the protecting
            group names 'SMAC' / 'ABOC'.
        bonding_ubiquitin_number (int): The chain number to which the molecule is added.
        new_ubiquitin_number (int): Value written to the 'chain_number' key of
            the copy of the ubiquitin being attached. Not applied to
            protecting groups, which are plain strings.
        lysine_residue (str): The specific lysine site for conjugation.

    Returns:
        tuple: ``(output_dictionary, output_context)`` -- the updated protein
        dictionary and the context dict holding 'chain_number_list' and
        'chain_length_list' as filled in during traversal.

    Raises:
        TypeError: If ubi_molecule_to_add is neither 'SMAC'/'ABOC' nor
            something that converts to a dictionary.
    """

    # Initialize context for chain numbering and length tracking
    context = {
        "chain_number_list": [1],
        "chain_length_list": [],
    }

    # Normalize input to dicts
    parent_dictionary = convert_json_to_dict(parent_dictionary)

    if ubi_molecule_to_add in ('SMAC', 'ABOC'):
        # Protecting groups are bare strings: there is nothing to copy and no
        # 'chain_number' key to set (item assignment on a str would raise).
        new_ubi_molecule_to_add = ubi_molecule_to_add
    else:
        ubi_molecule_to_add = convert_json_to_dict(ubi_molecule_to_add)

        # Validate before touching the value so bad input gets a clear error
        if not isinstance(ubi_molecule_to_add, dict):
            raise TypeError("ubi_molecule_to_add must be a dictionary or 'SMAC'/'ABOC' string")

        # Work on a copy so the caller's ubiquitin dictionary is not renumbered
        new_ubi_molecule_to_add = copy.deepcopy(ubi_molecule_to_add)
        new_ubi_molecule_to_add['chain_number'] = new_ubiquitin_number

    # Build structure recursively
    output_dictionary, output_context = inner_wrapper_ubiquitin_building_wo_iterate(
        parent_dictionary, new_ubi_molecule_to_add, bonding_ubiquitin_number, new_ubiquitin_number, lysine_residue, context
    )

    return output_dictionary, output_context

def inner_wrapper_ubiquitin_building_wo_iterate(
    input_dictionary: dict | str,
    ubi_molecule_to_add: dict | str,
    bonding_ubiquitin_number: int,
    new_ubiquitin_number: int,
    lysine_residue: str,
    context: dict
) -> tuple:
    """
    Walk one ubiquitin node and, recursively, every dict-valued child hanging
    off its branching sites, passing each site through
    handle_lysine_modification along the way.

    Args:
        input_dictionary (dict or str): The protein or ubiquitin structure;
            it is deep-copied and normalised with convert_json_to_dict.
        ubi_molecule_to_add (dict or str): Ubiquitin or protecting group,
            forwarded unchanged to handle_lysine_modification.
        bonding_ubiquitin_number (int): The specific chain number to modify.
        new_ubiquitin_number (int): Only forwarded to the recursive calls.
        lysine_residue (str): Lysine site to target.
        context (dict): Holds 'chain_number_list' and 'chain_length_list';
            both lists are appended to in place.

    Returns:
        tuple: ``(working_dictionary, context)`` -- the processed copy of the
        node and the same context object that was passed in.
    """
    # Operate on a private, dict-normalised copy of the incoming node
    working_dictionary = convert_json_to_dict(copy.deepcopy(input_dictionary))

    # Store the sequence length on the node and in the shared context.
    # The node's own 'chain_number' is left exactly as supplied.
    sequence_length = len(working_dictionary['FASTA_sequence'])
    working_dictionary['chain_length'] = sequence_length
    context['chain_length_list'].append(sequence_length)

    # Log protein details for debugging and traceability
    log_protein_details(working_dictionary, context)

    # Push the next chain number for subsequent (recursive) visits
    chain_numbers = context['chain_number_list']
    chain_numbers.append(chain_numbers[-1] + 1)

    for bra in working_dictionary['branching_sites']:
        log_branching_details(bra, working_dictionary, context)

        # The helper may hand back replacement objects for both the site and the node
        bra, working_dictionary = handle_lysine_modification(
            bra, working_dictionary, ubi_molecule_to_add, bonding_ubiquitin_number, lysine_residue
        )

        # Report what now sits on this site; descend only into dict children
        child = bra['children']
        if child in ('SMAC', 'ABOC'):
            logging.info(f"Protecting Group: {bra['children']}")
        elif child == "":
            logging.info(f"There is no Protecting Group on: {bra['site_name']}")
        elif isinstance(child, dict):
            logging.info(f"NEXT CHAIN: {bra['children']}")
            bra['children'], context = inner_wrapper_ubiquitin_building_wo_iterate(
                bra['children'], ubi_molecule_to_add, bonding_ubiquitin_number, new_ubiquitin_number, lysine_residue, context
            )
        log_end_of_branching()

    log_end_of_protein(working_dictionary)

    return working_dictionary, context

# (site_name, sequence_id) pairs listed, in this order, on both monomer templates
_UBIQUITIN_SITES = (
    ("M1", "(M)QIF"),
    ("K6", "IFV(K)TLT"),
    ("K11", "LTG(K)TIT"),
    ("K27", "ENV(K)AKI"),
    ("K29", "VKA(K)IQD"),
    ("K33", "IQD(K)EGI"),
    ("K48", "FAG(K)QLE"),
    ("K63", "NIQ(K)EST"),
)

# Plain ubiquitin monomer template: every site starts with an empty "children".
# The comprehension builds fresh site dicts, so the two templates share no objects.
ubiquitin_monomer = {
    "protein": "1ubq",
    "chain_number": 1,
    "FASTA_sequence": "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG",
    "chain_length": 76,
    "branching_sites": [
        {"site_name": site_name, "sequence_id": sequence_id, "children": ""}
        for site_name, sequence_id in _UBIQUITIN_SITES
    ],
}

# Same template with "DHHHHHH" appended to the sequence (83 residues) and chain_number 0.
histag_ubiquitin_monomer = {
    "protein": "1ubq",
    "chain_number": 0,
    "FASTA_sequence": "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGDHHHHHH",
    "chain_length": 83,
    "branching_sites": [
        {"site_name": site_name, "sequence_id": sequence_id, "children": ""}
        for site_name, sequence_id in _UBIQUITIN_SITES
    ],
}

In [211]:
# Display the multimer size derived above as len(jsonOutput) + 1
multimer_size

5

In [214]:
# Pass the loaded tables, the selected indexes and the multimer size to the
# plotting helper; the recorded output is a list of dicts with 'ubi_his_JSON_*' keys.
# NOTE(review): `indexes` is not assigned in any cell visible here -- it appears
# to come from earlier kernel state; confirm this cell survives Restart & Run All.
plotting.build_reaction_dictionaries_for_UI(data_dict, indexes, multimer_size)

[{'ubi_his_JSON_index': 2059,
  'ubi_his_JSON_multimer_id': 'Ub5_7',
  'ubi_his_JSON_used_in_synthesis': 1,
  'ubi_his_JSON_initial_acceptor': "{'protein': '1ubq', 'chain_number': 1, 'FASTA_sequence': 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGDHHHHHH', 'chain_length': 83, 'branching_sites': [{'site_name': 'M1', 'sequence_id': '(M)QIF', 'children': ''}, {'site_name': 'K6', 'sequence_id': 'IFV(K)TLT', 'children': ''}, {'site_name': 'K11', 'sequence_id': 'LTG(K)TIT', 'children': ''}, {'site_name': 'K27', 'sequence_id': 'ENV(K)AKI', 'children': ''}, {'site_name': 'K29', 'sequence_id': 'VKA(K)IQD', 'children': ''}, {'site_name': 'K33', 'sequence_id': 'IQD(K)EGI', 'children': ''}, {'site_name': 'K48', 'sequence_id': 'FAG(K)QLE', 'children': ''}, {'site_name': 'K63', 'sequence_id': 'NIQ(K)EST', 'children': 'ABOC'}]}",
  'ubi_his_JSON_dimer_formation': "{'protein': '1ubq', 'chain_number': 1, 'FASTA_sequence': 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQ

In [None]:


# Presumably looks up the ubiquitin_history entries that match the linkage
# description in jsonOutput; the recorded output is the list [1491].
# NOTE(review): match_final_multimer_from_json is neither defined nor imported
# in the visible cells -- confirm where it comes from.
match_final_multimer_from_json(jsonOutput, ubiquitin_history)


































[1491]

































In [219]:
# Display the nested multimer dictionary held in final_ubiquitin.
# NOTE(review): final_ubiquitin is not assigned in any cell visible here --
# confirm which cell produces it.
final_ubiquitin

{'protein': '1ubq',
 'chain_number': 1,
 'FASTA_sequence': 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGDHHHHHH',
 'chain_length': 83,
 'branching_sites': [{'site_name': 'M1',
   'sequence_id': '(M)QIF',
   'children': ''},
  {'site_name': 'K6', 'sequence_id': 'IFV(K)TLT', 'children': ''},
  {'site_name': 'K11', 'sequence_id': 'LTG(K)TIT', 'children': ''},
  {'site_name': 'K27', 'sequence_id': 'ENV(K)AKI', 'children': ''},
  {'site_name': 'K29', 'sequence_id': 'VKA(K)IQD', 'children': ''},
  {'site_name': 'K33', 'sequence_id': 'IQD(K)EGI', 'children': ''},
  {'site_name': 'K48', 'sequence_id': 'FAG(K)QLE', 'children': ''},
  {'site_name': 'K63',
   'sequence_id': 'NIQ(K)EST',
   'children': {'protein': '1ubq',
    'chain_number': 2,
    'FASTA_sequence': 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG',
    'chain_length': 76,
    'branching_sites': [{'site_name': 'M1',
      'sequence_id': '(M)QIF',
      'children': ''},
 

In [221]:
# Keep the ubiquitin_history rows whose 'final_multimer' text is exactly equal
# to str(final_ubiquitin), i.e. an exact string match against the dict's repr.
ubiquitin_history[ubiquitin_history["final_multimer"] == str(final_ubiquitin)]

Unnamed: 0,index,multimer_id,used_in_synthesis,initial_acceptor,dimer_formation,dimer_deprotection,trimer_formation,trimer_deprotection,tetramer_formation,tetramer_deprotection,pentamer_formation,final_multimer
73,1491,Ub5_29,1,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
