In [None]:
import json 
import logging
import copy
import sys
import ast
import numpy as np
from pathlib import Path
import pandas as pd

# Make the project's packages importable from this notebook.
# NOTE: Path().resolve() is the kernel's current working directory, which is
# only the notebook's folder when the kernel was started there.
notebook_path = Path().resolve()
# parents[1] is two directory levels above the working directory.
project_root = notebook_path.parents[1]  # Go up to project root (adjust if needed)
sys.path.insert(0, str(project_root))
local_path = project_root / 'back_end'
# Inserted last at position 0, so 'back_end' is searched before project_root.
sys.path.insert(0, str(local_path))

# Wildcard imports: the names they add to the namespace are not visible from
# this notebook. NOTE(review): prefer explicit imports once the needed names
# are known.
from src.utils.utils import *
from src.utils.logging_utils import *
from main import *

# download CSV files
def download_filtered_data(multimer_size, base_dir=None):
    """
    Load the filtered reaction-database CSV files for one multimer size.

    Nothing is fetched over the network: the five tables are read with
    ``pd.read_csv`` from ``<base_dir>/multimer_size_<multimer_size>/``.

    Parameters
    ----------
    multimer_size : int or str
        Interpolated into the directory name ``multimer_size_{multimer_size}``.
    base_dir : str or os.PathLike, optional
        Directory that holds the ``multimer_size_*`` folders. When omitted,
        ``project_root / 'back_end' / 'data' / 'filtered_reaction_database'``
        is used, where ``project_root`` is the notebook-level variable.

    Returns
    -------
    dict
        Maps 'combined_database', 'context_history', 'donor_history',
        'reaction_history' and 'ubiquitin_history' (in that order) to the
        DataFrame read from ``<name>.csv``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when one of the files is missing.
    """
    from pathlib import Path  # local import keeps the function self-contained

    if base_dir is None:
        base_dir = project_root / 'back_end' / 'data' / 'filtered_reaction_database'
    input_dir = Path(base_dir) / f'multimer_size_{multimer_size}'

    table_names = (
        'combined_database',
        'context_history',
        'donor_history',
        'reaction_history',
        'ubiquitin_history',
    )
    # index_col=0: the first CSV column becomes the row index of each frame.
    return {
        name: pd.read_csv(input_dir / f'{name}.csv', index_col=0)
        for name in table_names
    }

# Multimer size to load: 4 = tetramers, 5 = pentamers.
# Assigned here so this cell runs on a fresh kernel; the name was otherwise
# only assigned in a later cell.
# NOTE(review): confirm none of the wildcard imports above was meant to supply it.
multimer_size = 5

# Unpack the loaded tables into notebook-level names.
data_dict = download_filtered_data(multimer_size)
combined_database = data_dict['combined_database']
context_history = data_dict['context_history']
donor_history = data_dict['donor_history']
reaction_history = data_dict['reaction_history']
ubiquitin_history = data_dict['ubiquitin_history']

In [31]:
# download CSV files
def download_filtered_data(multimer_size, base_dir=None):
    """
    Load the filtered reaction-database CSV files for one multimer size.

    Nothing is fetched over the network: the five tables are read with
    ``pd.read_csv`` from ``<base_dir>/multimer_size_<multimer_size>/``.

    Parameters
    ----------
    multimer_size : int or str
        Interpolated into the directory name ``multimer_size_{multimer_size}``.
    base_dir : str or os.PathLike, optional
        Directory that holds the ``multimer_size_*`` folders. When omitted,
        ``project_root / 'back_end' / 'data' / 'filtered_reaction_database'``
        is used, where ``project_root`` is the notebook-level variable.

    Returns
    -------
    dict
        Maps 'combined_database', 'context_history', 'donor_history',
        'reaction_history' and 'ubiquitin_history' (in that order) to the
        DataFrame read from ``<name>.csv``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when one of the files is missing.
    """
    from pathlib import Path  # local import keeps the function self-contained

    if base_dir is None:
        base_dir = project_root / 'back_end' / 'data' / 'filtered_reaction_database'
    input_dir = Path(base_dir) / f'multimer_size_{multimer_size}'

    table_names = (
        'combined_database',
        'context_history',
        'donor_history',
        'reaction_history',
        'ubiquitin_history',
    )
    # index_col=0: the first CSV column becomes the row index of each frame.
    return {
        name: pd.read_csv(input_dir / f'{name}.csv', index_col=0)
        for name in table_names
    }

def assign_ubiquitin_ids(df, index_values, multimer_size):
    """
    Select rows by their 'index' value, order them as requested, and label them.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain an 'index' column.
    index_values : list
        Values of the 'index' column to keep, in the desired output order.
    multimer_size : int or str
        Interpolated into the generated labels ``Ub{multimer_size}_{n}``.

    Returns
    -------
    pd.DataFrame
        A new frame whose rows follow the order of ``index_values``, with
        'index' as the first column and an added 'multimer_id' column holding
        'Ub{multimer_size}_1', 'Ub{multimer_size}_2', ... by row position.
        The input frame is not modified.

    Raises
    ------
    ValueError
        If ``df`` has no 'index' column.
    KeyError
        If any requested value is absent from ``df['index']``.
    """
    if 'index' not in df.columns:
        raise ValueError("Column 'index' not found in DataFrame.")

    # list(...) so a tuple is treated as a list of labels by .loc below,
    # not as a (row, column) key.
    requested = list(index_values)

    # Fail with an explicit message instead of pandas' generic "not in index".
    wanted = pd.Index(requested)
    missing = wanted[~wanted.isin(df['index'])]
    if len(missing) > 0:
        raise KeyError(f"index values not found in DataFrame: {missing.tolist()}")

    # .loc with a list of labels both selects the rows and fixes their order.
    ordered = df.set_index('index').loc[requested].reset_index()
    ordered["multimer_id"] = [f"Ub{multimer_size}_{i}" for i in range(1, len(ordered) + 1)]

    return ordered

def merge_multimer_id(df, multimer_numbering_df, indexed_values):
    """
    Attach 'multimer_id' to a DataFrame by left-merging on the 'index' column.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame; must contain an 'index' column.
    multimer_numbering_df : pd.DataFrame
        Lookup table with 'index' and 'multimer_id' columns.
    indexed_values : list of int
        Indices used in synthesis. Only the length is checked here: it must be
        14 (tetramers) or 42 (pentamers).

    Returns
    -------
    pd.DataFrame
        ``df`` left-merged with ``multimer_numbering_df`` on 'index', with
        'multimer_id' cast via ``astype(str)``. No 'used in synthesis' column
        is added by this function.

    Raises
    ------
    ValueError
        If ``len(indexed_values)`` is neither 14 nor 42.
    """
    # Left merge keeps every row of df, matched or not.
    merged = pd.merge(df, multimer_numbering_df, on='index', how='left')

    # Ensure multimer_id is treated as string.
    # NOTE(review): rows without a match hold a missing value before this
    # cast; how astype(str) renders it depends on the pandas version.
    merged['multimer_id'] = merged['multimer_id'].astype(str)

    # The list length is used as a sanity check on the caller's input.
    if len(indexed_values) not in (14, 42):
        raise ValueError("indexed_values must be length 14 (tetramers) or 42 (pentamers)")

    return merged

def mark_used_in_synthesis_by_index(merged: pd.DataFrame, indexed_values: list) -> pd.DataFrame:
    """
    Flag rows as 'used in synthesis' based on membership of their 'index' value.

    Parameters
    ----------
    merged : pd.DataFrame
        DataFrame containing 'index' and 'multimer_id' columns. It is not
        modified.
    indexed_values : list
        Index values to mark as used in synthesis. Duplicates are harmless.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 row index, 'index' cast to int, and a
        'used_in_synthesis' column (1 for rows whose 'index' is in
        ``indexed_values``, else 0). Columns are ordered 'index',
        'multimer_id', 'used_in_synthesis', then the remaining columns in
        their original order.

    Raises
    ------
    KeyError
        If 'index' or 'multimer_id' is missing from ``merged``.
    """
    # reset_index returns a new frame, so the caller's DataFrame is untouched.
    result = merged.reset_index(drop=True)
    result['index'] = result['index'].astype(int)

    # Membership test instead of a merge: duplicate ids cannot multiply rows,
    # and an existing 'used_in_synthesis' column is overwritten, not suffixed.
    wanted = {int(value) for value in indexed_values}
    result['used_in_synthesis'] = result['index'].isin(wanted).astype(int)

    # Put the identifying columns first.
    leading = ['index', 'multimer_id', 'used_in_synthesis']
    trailing = [col for col in result.columns if col not in leading]

    return result[leading + trailing]

In [32]:
# INPUTS
# test_indexed_values_tetramers / test_indexed_values_pentamers:
#   values of the 'index' column for the multimers used in synthesis.
#   Order matters: assign_ubiquitin_ids numbers the rows Ub{size}_1,
#   Ub{size}_2, ... in the order given here.

# files... probably copy inside the function

test_indexed_values_tetramers = [423, 427, 363, 31, 443, 447, 95, 143, 191, 315, 319, 279, 335, 339]
test_indexed_values_pentamers = [
    2035, 2039, 1975, 1655, 47, 2055, 2059, 1719, 111, 1735, 1739, 1039, 127, 871,
    2143, 2147, 2107, 1075, 2163, 2167, 431, 411, 875, 639, 643, 479, 787, 835,
    1491, 1495, 1475, 1307, 1511, 1515, 1311, 1223, 1271, 1599, 1603, 1563, 1619, 1623
]

# Load the tables for the chosen multimer size from
# <project_root>/back_end/data/filtered_reaction_database/multimer_size_<size>/
multimer_size = 5  # 4 = tetramers, 5 = pentamers

data_dict = download_filtered_data(multimer_size)
combined_database = data_dict['combined_database']
context_history = data_dict['context_history']
donor_history = data_dict['donor_history']
reaction_history = data_dict['reaction_history']
ubiquitin_history = data_dict['ubiquitin_history']

# Select the appropriate indexed values based on the multimer size
if multimer_size == 4:
    test_indexed_values = test_indexed_values_tetramers
elif multimer_size == 5:
    test_indexed_values = test_indexed_values_pentamers
else:
    # Fail here rather than with a NameError further down.
    raise ValueError(f"No synthesis index list defined for multimer_size={multimer_size!r}; expected 4 or 5")

# Second name for the same list, used by the annotation calls below.
indexed_values = test_indexed_values

# Assign ubiquitin IDs to the multimers
ubiquitin_history_synthesised = assign_ubiquitin_ids(ubiquitin_history, test_indexed_values, multimer_size)

# Step 1: Build the index -> multimer_id lookup.
# Very important step as it contains the multimer_id for each final_multimer
multimer_numbering_synthesised_df = ubiquitin_history_synthesised[['final_multimer', 'multimer_id']].copy()
# Merging on 'final_multimer' gives every row that shares a final_multimer
# with a synthesised multimer the same multimer_id.
merged = pd.merge(ubiquitin_history, multimer_numbering_synthesised_df, on='final_multimer', how='left')
# Create a DataFrame with 'index' and 'multimer_id'
multimer_numbering_df = merged[['index', 'multimer_id']]

# Step 2: Apply to all target DataFrames: attach multimer_id, then flag the
# rows whose index was used in synthesis.
annotated = {
    table_name: mark_used_in_synthesis_by_index(
        merge_multimer_id(table, multimer_numbering_df, indexed_values),
        indexed_values,
    )
    for table_name, table in {
        'combined_database': combined_database,
        'context_history': context_history,
        'donor_history': donor_history,
        'reaction_history': reaction_history,
        'ubiquitin_history': ubiquitin_history,
    }.items()
}

combined_database = annotated['combined_database']
context_history = annotated['context_history']
donor_history = annotated['donor_history']
reaction_history = annotated['reaction_history']
ubiquitin_history = annotated['ubiquitin_history']



In [33]:
# Display the annotated table; 'index', 'multimer_id' and 'used_in_synthesis'
# are now the leading columns.
ubiquitin_history

Unnamed: 0,index,multimer_id,used_in_synthesis,initial_acceptor,dimer_formation,dimer_deprotection,trimer_formation,trimer_deprotection,tetramer_formation,tetramer_deprotection,pentamer_formation,final_multimer
0,47,Ub5_5,1,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
1,71,Ub5_5,0,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
2,91,Ub5_5,0,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
3,111,Ub5_9,1,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
4,127,Ub5_13,1,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
...,...,...,...,...,...,...,...,...,...,...,...,...
109,2127,Ub5_17,0,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
110,2143,Ub5_15,1,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
111,2147,Ub5_16,1,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
112,2163,Ub5_19,1,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_...","{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."


In [34]:
def get_most_frequent_value(df, column_name):
    """
    Find the value that occurs most often in one column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.
    column_name : str
        Name of the column whose values are counted.

    Returns
    -------
    tuple
        ``(value, count)`` for the most frequent value, or ``(None, 0)`` when
        ``value_counts()`` yields nothing to count.
    """
    frequencies = df[column_name].value_counts()

    if not frequencies.empty:
        return frequencies.idxmax(), frequencies.max()

    return None, 0

In [35]:
# Which multimer_id labels the most rows of ubiquitin_history, and how many?
get_most_frequent_value(ubiquitin_history, 'multimer_id')

('Ub5_14', np.int64(12))

In [36]:
# Rows of combined_database labelled 'Ub5_14', ordered by 'index' and given a
# fresh 0..n-1 row index.
df = (
    combined_database
    .loc[combined_database['multimer_id'] == 'Ub5_14']
    .sort_values(by="index")
    .reset_index(drop=True)
)

# Write that selection to Ub5_14.csv in the working directory, without the row index.
df.to_csv('Ub5_14.csv', index=False)