In [3]:
import json 
import logging
import copy
import sys
import ast
import numpy as np
from pathlib import Path
import pandas as pd

# Locate the project relative to the current working directory. Path() is the
# kernel's working directory, which equals the notebook's folder only when the
# kernel was started there.
notebook_path = Path().resolve()
project_root = notebook_path.parents[1]  # Two levels above the working directory (adjust if the notebook moves)
sys.path.insert(0, str(project_root))
local_path = project_root / 'back_end'
# Inserted at position 0 after project_root, so back_end is searched first.
sys.path.insert(0, str(local_path))

# Star imports: the names these modules provide are not visible from here, and a
# later star import silently overrides same-named objects from an earlier one.
# NOTE(review): presumably resolved through the sys.path entries added above — confirm.
from src.utils.utils import *
from src.utils.logging_utils import *
from main import *

# Multimer size to validate; passed to validate_data() further down.
multimer_size = 5

def validate_confirmation_data(confirmation_df, validation_df):
    """
    Find confirmation rows that have no exact match in the validation dataset.

    Args:
        confirmation_df (pd.DataFrame): DataFrame containing confirmation data.
            It is not modified. Its columns are reindexed to those of
            ``validation_df``: columns missing from the confirmation data are
            filled with NaN and columns absent from ``validation_df`` are dropped.
        validation_df (pd.DataFrame): DataFrame containing validation data.
    Returns:
        pd.DataFrame: The confirmation rows (original index labels preserved)
        that have no identical row in ``validation_df``, plus an
        ``is_validated`` column, which is False for every returned row.
        The frame is empty when every confirmation row was matched.
    """
    # reindex() returns a new frame, so the caller's DataFrame is left untouched.
    confirmation_df = confirmation_df.reindex(columns=validation_df.columns, fill_value=np.nan)

    # Left-merge on all shared columns. The right side is de-duplicated so each
    # confirmation row yields exactly one output row, in the original order.
    matches = confirmation_df.merge(validation_df.drop_duplicates(), how='left', indicator=True)

    # merge() returns a fresh 0..n-1 index. Assign positionally; a label-aligned
    # assignment would produce NaN whenever confirmation_df has any other index
    # (for example after filtering or when loaded with index_col).
    confirmation_df['is_validated'] = (matches['_merge'] == 'both').to_numpy()

    # Keep only the rows that were not found in the validation data.
    mismatches = confirmation_df[~confirmation_df['is_validated']]

    return mismatches

# Load the generated and confirmation CSV files from disk and compare them
def validate_data(multimer_size):
    """
    Compare the generated synthesis data for one multimer size with the stored
    confirmation data.

    Args:
        multimer_size (int): Size of the multimer to validate. It selects the
            ``multimer_size_{multimer_size}`` input folder and confirmation files.
    Returns:
        tuple: ``(mismatched_ubiquitin_history, mismatched_combined_database)``,
        the two DataFrames returned by ``validate_confirmation_data`` for the
        ubiquitin history and the combined database respectively.
    """
    back_end_dir = project_root / 'back_end'
    generated_dir = back_end_dir / 'data' / 'filtered_reaction_database' / f'multimer_size_{multimer_size}'
    reference_dir = back_end_dir / 'src' / 'confirmation_data'

    # Generated data: the first CSV column is used as the index
    generated_reactions = pd.read_csv(generated_dir / 'combined_database.csv', index_col=0)
    generated_multimers = pd.read_csv(generated_dir / 'ubiquitin_history.csv', index_col=0)

    # Confirmation (reference) data
    reference_multimers = pd.read_csv(reference_dir / f'multimer_size_{multimer_size}_multimer_database.csv')
    reference_reactions = pd.read_csv(reference_dir / f'multimer_size_{multimer_size}_reaction_database.csv')

    # Keep only the rows flagged as used in synthesis
    synthesis_reactions = generated_reactions.loc[generated_reactions['used_in_synthesis'].eq(1)]
    synthesis_multimers = generated_multimers.loc[generated_multimers['used_in_synthesis'].eq(1)]

    # Restrict each generated table to the columns of its confirmation table
    synthesis_multimers = synthesis_multimers[reference_multimers.columns]
    synthesis_reactions = synthesis_reactions[reference_reactions.columns]

    # Ubiquitin history is checked first, then the combined database
    return (
        validate_confirmation_data(reference_multimers, synthesis_multimers),
        validate_confirmation_data(reference_reactions, synthesis_reactions),
    )

# Run the validation for the configured multimer size; each result holds the
# confirmation rows that were not found in the generated data.
mismatched_ubiquitin_history, mismatched_combined_database = validate_data(multimer_size)

# Any unmatched ubiquitin-history row is fatal: log the rows, then abort the cell.
# Because this raise stops execution, the combined-database check below only
# runs when the ubiquitin history is clean.
if not mismatched_ubiquitin_history.empty:
    logging.error("Mismatched ubiquitin history found:")
    logging.error(mismatched_ubiquitin_history)
    raise ValueError("Mismatched ubiquitin history found.")

# Same check for the combined (reaction) database.
if not mismatched_combined_database.empty:  
    logging.error("Mismatched combined database found:")
    logging.error(mismatched_combined_database)
    raise ValueError("Mismatched combined database found.")

In [4]:
# final validation against confirmation database
# NOTE(review): `combined_database` is only assigned inside validate_data(), so it
# is not a notebook-level name and this cell raises NameError. TODO: return or
# load the frame at notebook level, or remove this cell.
combined_database

NameError: name 'combined_database' is not defined

In [None]:
# NOTE(review): scratch arithmetic; what 342 and 3 stand for is not stated.
# TODO: document the intent or remove this cell.
342/3

114.0

In [None]:
# NOTE(review): `sorted_combined_database` is not assigned in any cell shown here,
# so this display depends on state from elsewhere. TODO: define it before this
# cell, or remove the cell.
sorted_combined_database

Unnamed: 0,index,multimer_id,used_in_synthesis,table_origin,initial_acceptor,dimer_formation,dimer_deprotection,trimer_formation,trimer_deprotection,tetramer_formation,tetramer_deprotection,pentamer_formation,final_multimer,multimer_id_1,multimer_id_2
331,2035,Ub5_1,1,Acceptor,histag_ubi_ubq_1_K63_aboc,,,,,,,,,Ub5,1
103,2035,Ub5_1,1,Donors,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,Ub5,1
217,2035,Ub5_1,1,Reactions,,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,SMAC_deprot,gp78/Ube2g2,,Ub5,1
332,2039,Ub5_2,1,Acceptor,histag_ubi_ubq_1_K63_aboc,,,,,,,,,Ub5,2
104,2039,Ub5_2,1,Donors,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,Ub5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,1619,Ub5_41,1,Donors,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_SMAC_K63_ABOC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,Ub5,41
196,1619,Ub5_41,1,Reactions,,Ubc13/Mms2,SMAC_deprot,Ubc13/Mms2,SMAC_deprot,Ubc13/Mms2,SMAC_deprot,Ube2K,,Ub5,41
311,1623,Ub5_42,1,Acceptor,histag_ubi_ubq_1_K48_aboc,,,,,,,,,Ub5,42
83,1623,Ub5_42,1,Donors,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_SMAC,,ubi_ubq_1_K48_ABOC_K63_ABOC,,Ub5,42


In [None]:
# Lists the distinct values in the 'tetramer_formation' column.
# NOTE(review): in the cells shown, `validation_combined_database` exists only as
# a local variable of validate_data(). TODO: confirm where the notebook-level
# name comes from.
validation_combined_database['tetramer_formation'].unique()

array(['ubi_ubq_1_K48_ABOC_K63_ABOC', 'ubi_ubq_1_K48_SMAC_K63_ABOC',
       'ubi_ubq_1_K48_ABOC_K63_SMAC', 'gp78/Ube2g2', 'Ubc13/Mms2',
       'Ube2K', nan], dtype=object)

In [None]:
# NOTE(review): stray name with no definition; raises NameError. TODO: remove this cell.
t

NameError: name 't' is not defined