In [22]:
import json 
import logging
import copy
import sys
import ast
import numpy as np
from pathlib import Path
import pandas as pd

# Locate the project root and make the project's packages importable.
# NOTE(review): Path() resolves to the current working directory, not to the
# notebook file itself — this presumably relies on the kernel being started in
# the notebook's own folder; confirm before running it from elsewhere.
notebook_path = Path().resolve()
project_root = notebook_path.parents[1]  # Two levels above the working directory (adjust if the notebook moves)
# Both inserts go to position 0, so after the second call the search order is
# back_end first, then the project root.
sys.path.insert(0, str(project_root))
local_path = project_root / 'back_end'
sys.path.insert(0, str(local_path))

from src.utils.utils import *
from src.utils.logging_utils import *
from main import *

In [23]:
# Values of the 'index' column used to select rows when multimer_size == 4.
# Order matters: filter_and_order_by_index_values returns the rows in this
# order and labels the i-th one (1-based) "Ub4_i".
test_indexed_values_tetramers = [423, 427, 363, 31, 443, 447, 95, 143, 191, 315, 319, 279, 335, 339]
# Same role when multimer_size == 5: the i-th entry (1-based) is labelled "Ub5_i".
test_indexed_values_pentamers = [
    2035, 2039, 1975, 1655, 47, 2055, 2059, 1719, 111, 1735, 1739, 1039, 127, 871,
    2143, 2147, 2107, 1075, 2163, 2167, 431, 411, 875, 639, 643, 479, 787, 835,
    1491, 1495, 1475, 1307, 1511, 1515, 1311, 1223, 1271, 1599, 1603, 1563, 1619, 1623
]

In [24]:
# Load the data tables from
# <project_root>/back_end/data/filtered_reaction_database/multimer_size_<N>/,
# where <N> is the multimer size selected below.
multimer_size = 5  # 4 = tetramers, 5 = pentamers

# download CSV files
def download_data(multimer_size):
    """
    Read the five reaction-database CSV tables for one multimer size.

    The files are read from
    ``project_root / 'back_end' / 'data' / 'filtered_reaction_database' / f'multimer_size_{multimer_size}'``
    with the first CSV column used as the DataFrame index. For
    ``combined_database`` only, the index level named 'index' is then moved
    back into a regular column.

    Parameters:
    ----------
    multimer_size
        Value interpolated into the ``multimer_size_<N>`` directory name
        (this notebook uses 4 or 5).

    Returns:
    -------
    dict
        Maps 'combined_database', 'context_history', 'donor_history',
        'reaction_history' and 'ubiquitin_history' to their DataFrames.
    """
    input_dir = project_root / 'back_end' / 'data' / 'filtered_reaction_database' / f'multimer_size_{multimer_size}'

    table_names = (
        'combined_database',
        'context_history',
        'donor_history',
        'reaction_history',
        'ubiquitin_history',
    )

    tables = {}
    for table_name in table_names:
        table = pd.read_csv(input_dir / f'{table_name}.csv', index_col=0)
        if table_name == 'combined_database':
            # Only this table gets its 'index' level turned back into a column.
            table = table.reset_index('index')
        tables[table_name] = table

    return tables

# Load every table for the selected multimer size, then bind each one to its
# own notebook-level name.
data_dict = download_data(multimer_size)
combined_database, context_history, donor_history, reaction_history, ubiquitin_history = (
    data_dict[key]
    for key in ('combined_database', 'context_history', 'donor_history', 'reaction_history', 'ubiquitin_history')
)

In [25]:
def filter_and_order_by_index_values(df, index_values, multimer_number, index_column='index'):
    """
    Filters and reorders a DataFrame based on a list of index values from a specified column.

    Rows whose `index_column` value appears in `index_values` are kept and
    returned in the order given by `index_values`. A "multimer_number" column
    is then added, labelling the rows "Ub{multimer_number}_1",
    "Ub{multimer_number}_2", ... in output order.

    Parameters:
    ----------
    df : pd.DataFrame
        The input DataFrame to filter and reorder. It is not modified.
    index_values : list
        A list of values to retain and reorder by.
    multimer_number
        Value placed after "Ub" in every generated label (callers in this
        notebook pass the multimer size, 4 or 5).
    index_column : str, default='index'
        The column name to match index values against.

    Returns:
    -------
    pd.DataFrame
        A filtered and ordered DataFrame with a fresh default index and an
        added "multimer_number" column.

    Raises:
    ------
    ValueError
        If `index_column` is not a column of `df`.
    KeyError
        Propagated from pandas `.loc` when a value in `index_values` does not
        occur in `index_column` (behaviour of current pandas versions).
    """
    if index_column not in df.columns:
        raise ValueError(f"Column '{index_column}' not found in DataFrame.")

    # Normalise to a list so a tuple is not misread by .loc as a (row, column) key.
    wanted = list(index_values)

    filtered = df[df[index_column].isin(wanted)]
    # Indexing by the column lets .loc hand the rows back in the caller's order.
    ordered = filtered.set_index(index_column).loc[wanted].reset_index()
    # Build labels from the parameter, not from the notebook-level
    # `multimer_size` global, so they always reflect what the caller passed.
    ordered["multimer_number"] = [f"Ub{multimer_number}_{i}" for i in range(1, len(ordered) + 1)]

    return ordered

# Select the appropriate indexed values based on the multimer size
if multimer_size == 4:
    test_indexed_values = test_indexed_values_tetramers
elif multimer_size == 5:
    test_indexed_values = test_indexed_values_pentamers
else:
    # Fail loudly: without this branch an unsupported size would leave
    # `test_indexed_values` undefined, or stale from an earlier run of this cell.
    raise ValueError(f"Unsupported multimer_size {multimer_size!r}; expected 4 or 5.")

# Apply the same selection and ordering to every table so that row i of each
# result refers to the same entry of test_indexed_values.
combined_database_synthesised = filter_and_order_by_index_values(combined_database, test_indexed_values, multimer_size, index_column='index')
context_history_synthesised = filter_and_order_by_index_values(context_history, test_indexed_values, multimer_size, index_column='index')
donor_history_synthesised = filter_and_order_by_index_values(donor_history, test_indexed_values, multimer_size, index_column='index')
reaction_history_synthesised = filter_and_order_by_index_values(reaction_history, test_indexed_values, multimer_size, index_column='index')
ubiquitin_history_synthesised = filter_and_order_by_index_values(ubiquitin_history, test_indexed_values, multimer_size, index_column='index')

In [28]:
# Label lookup: the (final_multimer, multimer_number) pair of every selected row.
multimer_numbering_df = ubiquitin_history_synthesised.loc[:, ['final_multimer', 'multimer_number']].copy()

# Left join: every row of ubiquitin_history is kept and receives the
# multimer_number of the selected row that shares its final_multimer
# (left missing where there is no match).
merged_df = ubiquitin_history.merge(multimer_numbering_df, how='left', on='final_multimer')

# Reduce to the three columns of interest: multimer_number, index, final_multimer.
multimer_numbering_df = merged_df.loc[:, ['multimer_number', 'index', 'final_multimer']].copy()

# These are the tables that will be searched and filtered against through FastAPI.
# They are meant to carry the index, the Ub<N> number, and whether each entry
# was used in synthesis or not.
# TODO: save them in filtered_reaction_database.
multimer_numbering_df

Unnamed: 0,multimer_number,index,final_multimer
0,Ub5_5,47,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
1,Ub5_5,71,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
2,Ub5_5,91,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
3,Ub5_9,111,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
4,Ub5_13,127,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
...,...,...,...
109,Ub5_17,2127,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
110,Ub5_15,2143,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
111,Ub5_16,2147,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."
112,Ub5_19,2163,"{'protein': '1ubq', 'chain_number': 1, 'FASTA_..."


In [27]:
# rank 1 if reaction was used in synthesis, 0 if not