# Process data from PDC assays
## Notes
DO 1-26-2026 I'm re-creating this file with the help of claude code

## Analysis plan
* Load the "Enzyme_assay_metadata" spreadsheet and identify the assays we want to process
* Find all of the .csv files with PDC enzyme assay data
* For each csv file:
  * Add filename information
  * Measure initial pyruvate
    * Determine the expected initial pyruvate concentration (Pyruvate_mM) and Blank_time_s from the Enzyme_assay_metadata dataframe
    * Calculate the pyruvate concentration using the _calculate_blank_pyruvate() function imported from the "Compiling_spectrum_data.ipynb" notebook in the "Spectrum files from Agilent spec" folder
    * If the difference from the expected pyruvate concentration is >50%, throw a warning and use the expected pyruvate concentration instead (note: it might make sense to update this in the _calculate_blank_pyruvate() function)
  * Measure NADH concentration
    * use the process_pdc_timecourse() function

* Combine the data into a single pandas dataframe for plotting
* Plot NADH concentration vs. offset time (i.e. where the assay start time has been shifted to zero) for all samples. This will allow us to do a rough examination of the data

* Data processing for subsequent analysis:
  * For each assay, measure the maximum slope (V), after the assay start.
  * Normalize V to the enzyme concentration (V/E)

* Determine the effect of Adh enzyme concentration
  * Select only the "Varying Adh" assay group
  * Plot V/E vs. the Adh concentration

* Create a kcat plot
  * Plot V/E vs. the substrate concentration
  * Adjust the units so that we can measure kcat directly from the plot
  * Color by filename

* Measure NADH degradation (see if we have good enough data for this)

* Convert to an EnzymeML file
* Upload EnzymeML file, colab notebook, and raw data to Janis Shin's github folder for subsequent modeling.




In [17]:
# Setup cell: enable autoreload, import third-party libraries, and make the
# project-local `pda` package importable before pulling in its helpers.
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from pathlib import Path
import sys

# Add the directory two levels above the current working directory to sys.path
# so the pda modules can be imported.
# NOTE(review): assumes the kernel's working directory sits two levels below the
# directory that contains `pda` - confirm when running from another location.
parent_path = str(Path.cwd().parent.parent)
if parent_path not in sys.path:
    # Guarded so re-running the cell does not add the same entry twice
    sys.path.insert(0, parent_path)

# Project-local helpers. NOTE(review): calculate_concentrations is not referenced
# by the cells visible in this section - confirm whether it is still needed.
from pda.data_io import load_kinetic_data
from pda.spectral import calculate_concentrations
from pda.timecourse import process_pdc_timecourse

print("Setup complete! Imported all required modules.")
print("Autoreload enabled - modules will be reloaded automatically.")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Setup complete! Imported all required modules.
Autoreload enabled - modules will be reloaded automatically.


In [18]:
# Load metadata
# The assay metadata is read from a published Google Sheet (CSV export), so this
# cell needs network access. Later cells read these columns from it: 'Assay',
# 'Ignore', 'Filename', 'Cuvette', 'Assay Group', 'Pyruvate_mM', 'Start_time_s',
# 'Blank_time_s', 'Pdc_ug_ml' and 'Adh_ug_ml'.
metadata_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRVpwYqImFkaUigsWgrO9MRtWjYWwps82EExnomLqNr_hOUNViKF_fFyAhJfIqe3hDq0IEG76W4v_fO/pub?output=csv"
metadata_df = pd.read_csv(metadata_url)

# Load standards
# Path is relative to the kernel's working directory; standards_df is passed
# unchanged to process_pdc_timecourse() in later cells.
standards_df = pd.read_csv("../spectra_data/NADH_Pyruvate_Standards.csv")

# Row counts as a quick sanity check that both tables loaded
print(f"Loaded {len(metadata_df)} assays from metadata")
print(f"Loaded {len(standards_df)} wavelength points from standards")

Loaded 129 assays from metadata
Loaded 858 wavelength points from standards


In [19]:
# Select the assays to process: forward-direction PDC assays ('Assay' == 'PDC_fwd')
# whose 'Ignore' cell is blank. The selection is copied so that columns added to
# pdc_assays later do not touch metadata_df.
is_pdc_fwd = metadata_df['Assay'].eq('PDC_fwd')
not_ignored = metadata_df['Ignore'].isna()
pdc_assays = metadata_df.loc[is_pdc_fwd & not_ignored].copy()

# Quick overview of what was selected
print(f"Found {len(pdc_assays)} PDC assays to process")
print(f"Assay groups: {pdc_assays['Assay Group'].unique()}")
print(f"Unique files: {pdc_assays['Filename'].nunique()}")

Found 96 PDC assays to process
Assay groups: ['Varying Adh' 'Varying pyr low NADH' 'Varying pyr high NADH']
Unique files: 32


In [20]:
# Create mapping from metadata filenames (.KD) to actual CSV files.
# Reads:  pdc_assays (needs a 'Filename' column).
# Writes: assay_data_path; adds 'csv_filename' and 'csv_exists' to pdc_assays and
#         then restricts pdc_assays to the rows whose CSV file is present on disk.
assay_data_path = Path("../assay_data")

# Add CSV filename column. Only a *trailing* .KD extension is swapped, in any
# letter case: a plain substring replace would miss '.kd' and would also rewrite
# a '.KD' that happens to appear in the middle of a name.
pdc_assays['csv_filename'] = pdc_assays['Filename'].str.replace(
    r'\.KD$', '.csv', case=False, regex=True
)

# Check which files exist. Several cuvette rows share one file, so each distinct
# file is checked once. Rows with a blank Filename (NaN) are treated as missing
# instead of raising TypeError when joined onto the directory path.
csv_on_disk = {
    name: (assay_data_path / name).exists()
    for name in pdc_assays['csv_filename'].dropna().unique()
}
pdc_assays['csv_exists'] = (
    pdc_assays['csv_filename']
    .map(lambda name: csv_on_disk.get(name, False))
    .astype(bool)
)

# Report assay rows and distinct files separately: one missing file removes
# every cuvette recorded in it, so the two counts differ.
found_mask = pdc_assays['csv_exists']
n_files_found = pdc_assays.loc[found_mask, 'csv_filename'].nunique()
n_files_missing = pdc_assays.loc[~found_mask, 'csv_filename'].nunique()

print(f"Total PDC assays: {len(pdc_assays)}")
print(f"Assays with CSV found: {found_mask.sum()} ({n_files_found} files)")
print(f"Assays with CSV missing: {(~found_mask).sum()} ({n_files_missing} files)")

if (~found_mask).any():
    print("Missing files:")
    print(pdc_assays.loc[~found_mask, 'csv_filename'].unique())

# Filter to only assays with existing CSV files
pdc_assays = pdc_assays[found_mask].copy()
print(f"Processing {len(pdc_assays)} assays with available CSV files")

Total PDC assays: 96
CSV files found: 66
CSV files missing: 30
Missing files:
['0116 1600 800 400MM PYR-1.csv' '0116 200 100 40MM PYR-2.csv'
 '0116 20 16 8 4MM PYR-4.csv' '0120 1600 800 400MM PYR-1.csv'
 '0120 200 100 40MM PYR-2.csv' '0120 20 16 8 4MM PYR-3.csv'
 '0121 1600 800 400MM PYR-1.csv' '0121 200 100 40MM PYR-2.csv'
 '0121 20 16 8 4MM PYR-3.csv']
Processing 66 assays with available CSV files


## Debugging one dataset

In [None]:
# DEBUG: Test processing of a specific dataset
# Runs one (file, cuvette) pair through load_kinetic_data() and
# process_pdc_timecourse() with verbose output, printing the full traceback on
# failure so the batch cell's errors can be investigated in isolation.
import traceback

# Assay to debug
DEBUG_CSV_FILENAME = '1222 PDC-9.csv'
DEBUG_CUVETTE = 'CELL_2'

# Get a specific assay from metadata for testing
test_matches = pdc_assays[
    (pdc_assays['csv_filename'] == DEBUG_CSV_FILENAME) &
    (pdc_assays['Cuvette'] == DEBUG_CUVETTE)
]

if test_matches.empty:
    # pdc_assays only keeps rows whose CSV exists on disk, so the hard-coded assay
    # can legitimately be absent. Report that clearly instead of letting
    # .iloc[0] raise a bare IndexError and halt a Run All.
    print(
        f"Debug assay not found in pdc_assays: "
        f"{DEBUG_CSV_FILENAME} - {DEBUG_CUVETTE}; skipping debug run"
    )
else:
    test_assay = test_matches.iloc[0]

    print(f"Testing: {test_assay['csv_filename']} - {test_assay['Cuvette']}")
    print(f"Pyruvate: {test_assay['Pyruvate_mM']} mM")
    print(f"Start: {test_assay['Start_time_s']} s")
    print(f"Blank: {test_assay['Blank_time_s']} s")
    print()

    try:
        # Load the data
        csv_path = assay_data_path / test_assay['csv_filename']
        print(f"Loading from: {csv_path}")
        spectral_df = load_kinetic_data(str(csv_path), sample_filter=test_assay['Cuvette'])

        print(f"Loaded dataframe shape: {spectral_df.shape}")
        print(f"First few rows:")
        print(spectral_df.head())
        print()

        # Process timecourse (same settings as the batch cell, but verbose)
        print("Calling process_pdc_timecourse...")
        results = process_pdc_timecourse(
            spectral_df=spectral_df,
            standards_df=standards_df,
            assay_start_time=test_assay['Start_time_s'],
            blank_time=test_assay['Blank_time_s'],
            initial_pyruvate_mM=test_assay['Pyruvate_mM'],
            method='constrained',
            wavelength_range=(320, 420),
            absorbance_max=2,
            plot=True,
            verbose=True  # Turn on verbose for debugging
        )

        print(f"\n✓ Success! Processed {len(results)} time points")
        print(results.head())

    except Exception as e:
        # Deliberately broad: this cell exists to surface whatever goes wrong
        print(f"ERROR: {e}")
        print(f"Full traceback:")
        traceback.print_exc()

In [None]:
# Batch-process every selected PDC assay with process_pdc_timecourse().
# Produces:
#   all_results - list with one results frame per successfully processed assay
#   errors      - list of {'filename', 'cuvette', 'error'} records for the failures
#   combined_df - all results concatenated (an empty DataFrame if nothing succeeded)
#   errors_df   - tabular view of `errors` (only created when there were failures)
import traceback

all_results = []
errors = []

# Settings shared by every process_pdc_timecourse() call in this batch
TIMECOURSE_OPTIONS = dict(
    method='constrained',
    wavelength_range=(320, 420),
    absorbance_max=2,
    plot=True,
    verbose=False,
)

# results column -> metadata column it is copied from (insertion order is kept)
RESULT_METADATA = {
    'Filename': 'Filename',
    'csv_filename': 'csv_filename',
    'Cuvette': 'Cuvette',
    'Assay_Group': 'Assay Group',
    'Pdc_ug_ml': 'Pdc_ug_ml',
    'Adh_ug_ml': 'Adh_ug_ml',
}


def _log_failure(assay_row, message):
    """Append a failure record for ``assay_row`` to the notebook-level ``errors`` list."""
    errors.append({
        'filename': assay_row['csv_filename'],
        'cuvette': assay_row['Cuvette'],
        'error': message
    })


for row_label, assay in pdc_assays.iterrows():
    csv_path = assay_data_path / assay['csv_filename']

    print(f"Processing: {assay['csv_filename']} - {assay['Cuvette']}")
    print(f"  Pyruvate: {assay['Pyruvate_mM']} mM, Start: {assay['Start_time_s']} s, Blank: {assay['Blank_time_s']} s")

    try:
        # Read only the rows belonging to this assay's cuvette
        spectral_df = load_kinetic_data(str(csv_path), sample_filter=assay['Cuvette'])
        print(f"  Loaded {len(spectral_df)} rows for {assay['Cuvette']}")

        # Nothing recorded for this cuvette: note it and move on to the next assay
        if len(spectral_df) == 0:
            failure_text = f"No data found for {assay['Cuvette']}"
            print(f"  WARNING: {failure_text}")
            _log_failure(assay, failure_text)
            continue

        # Timecourse processing
        results = process_pdc_timecourse(
            spectral_df=spectral_df,
            standards_df=standards_df,
            assay_start_time=assay['Start_time_s'],
            blank_time=assay['Blank_time_s'],
            initial_pyruvate_mM=assay['Pyruvate_mM'],
            **TIMECOURSE_OPTIONS
        )

        # Tag every time point with the identifying metadata of its assay
        for result_column, metadata_column in RESULT_METADATA.items():
            results[result_column] = assay[metadata_column]

        all_results.append(results)
        print(f"  ✓ Processed {len(results)} time points")

    except Exception as e:
        # One bad assay must not abort the batch: show the traceback, record it, carry on
        failure_text = str(e)
        print(f"  ERROR: {failure_text}")
        print(f"  Full traceback:")
        traceback.print_exc()
        _log_failure(assay, failure_text)

# Combine all results
if not all_results:
    combined_df = pd.DataFrame()
    print("No results to combine!")
else:
    combined_df = pd.concat(all_results, ignore_index=True)
    print(f"{'='*60}")
    print(f"Successfully processed {len(all_results)} assays")
    print(f"Total time points: {len(combined_df)}")
    if errors:
        print(f"Errors encountered: {len(errors)}")
    print(f"{'='*60}")

# Show error summary if any
if errors:
    print(f"{'='*60}")
    print("ERROR SUMMARY:")
    print(f"{'='*60}")
    errors_df = pd.DataFrame(errors)
    print(errors_df.to_string())


In [None]:
# Plot NADH concentration vs. time for all assays (one line per assay).
# Side effect: adds an 'Assay_ID' column (cuvette + CSV filename) to combined_df;
# later cells in this notebook look assays up by that column.
if combined_df.empty:
    # The processing cell leaves an empty DataFrame (no columns) when no assay
    # succeeded; indexing 'Cuvette' on it would raise KeyError.
    print("No processed assays to plot (combined_df is empty)")
else:
    combined_df['Assay_ID'] = combined_df['Cuvette'] + '_' + combined_df['csv_filename']

    fig = go.Figure()

    # sort=False keeps the traces in first-appearance order, and a single groupby
    # pass avoids re-scanning the whole frame once per assay.
    for assay_id, data in combined_df.groupby('Assay_ID', sort=False):
        fig.add_trace(go.Scatter(
            x=data['Time_Relative_s'],
            y=data['NADH_mM'],
            mode='lines',
            name=assay_id,
            showlegend=True
        ))

    fig.update_layout(
        title='NADH Concentration vs. Time (All PDC Assays)',
        xaxis_title='Time from Assay Start (s)',
        yaxis_title='NADH Concentration (mM)',
        height=600,
        hovermode='closest'
    )

    fig.show()

    print(f"Plotted {combined_df['Assay_ID'].nunique()} assays")

In [None]:
# Calculate V and V/E for each assay.
# V is the slope of a straight-line fit of NADH_mM against Time_Relative_s inside
# a fixed window after assay start; V/E divides that slope by the PDC
# concentration (mM/s per ug/mL PDC).
# Reads:  combined_df, including the 'Assay_ID' column added by the plotting cell.
# Writes: RATE_WINDOW_START / RATE_WINDOW_END, kinetic_results, kinetics_df.
from scipy.stats import linregress

# Time window for initial rate calculation (seconds after assay start)
RATE_WINDOW_START = 0
RATE_WINDOW_END = 50

# Explicit schema so kinetics_df keeps these columns even when no assay yields a
# fit; without it an empty result has no columns and later lookups raise KeyError.
KINETIC_COLUMNS = [
    'Assay_ID', 'Filename', 'csv_filename', 'Cuvette', 'Assay_Group',
    'Pdc_ug_ml', 'Adh_ug_ml', 'Pyruvate_mM', 'V_mM_per_s', 'V_over_E',
    'Intercept_mM', 'R_squared', 'n_points',
]

kinetic_results = []

# sort=False keeps assays in first-appearance order
for assay_id, assay_data in combined_df.groupby('Assay_ID', sort=False):
    # Filter to rate window (both ends inclusive) and drop samples with a missing
    # time or concentration, which would otherwise turn the whole fit into NaN
    in_window = assay_data['Time_Relative_s'].between(RATE_WINDOW_START, RATE_WINDOW_END)
    rate_data = assay_data.loc[in_window].dropna(subset=['Time_Relative_s', 'NADH_mM'])

    if len(rate_data) < 3:
        print(f"Warning: Not enough data points for {assay_id}")
        continue

    # Calculate slope using linear regression
    fit = linregress(rate_data['Time_Relative_s'], rate_data['NADH_mM'])

    # Per-assay metadata is constant across the assay's rows; read it from the first
    metadata = assay_data.iloc[0]
    pdc_conc = metadata['Pdc_ug_ml']

    # Calculate V/E (normalize by enzyme concentration); undefined without a
    # positive enzyme concentration
    if pd.notna(pdc_conc) and pdc_conc > 0:
        v_over_e = fit.slope / pdc_conc
    else:
        v_over_e = np.nan

    kinetic_results.append({
        'Assay_ID': assay_id,
        'Filename': metadata['Filename'],
        'csv_filename': metadata['csv_filename'],
        'Cuvette': metadata['Cuvette'],
        'Assay_Group': metadata['Assay_Group'],
        'Pdc_ug_ml': pdc_conc,
        'Adh_ug_ml': metadata['Adh_ug_ml'],
        # NOTE(review): the processing cell does not copy Pyruvate_mM from the
        # metadata, so this value comes from whatever process_pdc_timecourse()
        # returns (first row of the assay). Confirm that is the substrate
        # concentration wanted here rather than the nominal one.
        'Pyruvate_mM': metadata['Pyruvate_mM'],
        'V_mM_per_s': fit.slope,
        'V_over_E': v_over_e,
        'Intercept_mM': fit.intercept,
        'R_squared': fit.rvalue**2,
        'n_points': len(rate_data)
    })

kinetics_df = pd.DataFrame(kinetic_results, columns=KINETIC_COLUMNS)

print(f"Calculated kinetics for {len(kinetics_df)} assays")
print(f"\nSummary statistics for V/E:")
print(kinetics_df['V_over_E'].describe())
print(f"\nAssay groups: {kinetics_df['Assay_Group'].unique()}")

In [None]:
# Visual check of the rate fits: for the first few assays in kinetics_df, draw the
# full NADH timecourse, the fitted line across the rate window, and a shaded band
# marking that window.
n_plots = min(6, len(kinetics_df))
sample_assays = kinetics_df.head(n_plots)

for row_label, row in sample_assays.iterrows():
    # Time points belonging to this assay
    assay_id = row['Assay_ID']
    assay_data = combined_df.loc[combined_df['Assay_ID'] == assay_id]

    # Fitted line evaluated at the two ends of the rate window
    x_line = np.array([RATE_WINDOW_START, RATE_WINDOW_END])
    y_line = row['V_mM_per_s'] * x_line + row['Intercept_mM']

    timecourse_trace = go.Scatter(
        x=assay_data['Time_Relative_s'],
        y=assay_data['NADH_mM'],
        mode='markers',
        name='Data',
        marker=dict(size=4, color='blue')
    )
    fit_trace = go.Scatter(
        x=x_line,
        y=y_line,
        mode='lines',
        name=f'Linear fit (V/E = {row["V_over_E"]:.6f})',
        line=dict(color='red', width=2)
    )

    # Measured data first, fit on top
    fig = go.Figure(data=[timecourse_trace, fit_trace])

    # Shade the window the slope was fitted over
    fig.add_vrect(
        x0=RATE_WINDOW_START,
        x1=RATE_WINDOW_END,
        fillcolor='lightgray',
        opacity=0.2,
        line_width=0,
        annotation_text="Rate window",
        annotation_position="top left"
    )

    # Title: which assay, followed by the fit summary
    title_parts = [
        f'{row["Cuvette"]} - {row["csv_filename"]}<br>',
        f'V = {row["V_mM_per_s"]:.6f} mM/s, ',
        f'V/E = {row["V_over_E"]:.6f} mM/s/(μg/mL), ',
        f'R² = {row["R_squared"]:.4f}',
    ]
    fig.update_layout(
        title=''.join(title_parts),
        xaxis_title='Time from Assay Start (s)',
        yaxis_title='NADH Concentration (mM)',
        height=400,
        showlegend=True
    )

    fig.show()

In [None]:
# Plot V/E vs. ADH concentration for "Varying Adh" assay group.
# Each point is one assay from kinetics_df, coloured by the R² of its rate fit so
# that poorly fitted assays stand out; hover text identifies cuvette and file.
varying_adh = kinetics_df[kinetics_df['Assay_Group'] == 'Varying Adh'].copy()

if len(varying_adh) > 0:
    fig = go.Figure()

    # Add scatter points
    fig.add_trace(go.Scatter(
        x=varying_adh['Adh_ug_ml'],
        y=varying_adh['V_over_E'],
        mode='markers',
        marker=dict(
            size=10,
            color=varying_adh['R_squared'],
            colorscale='Viridis',
            showscale=True,
            colorbar=dict(title='R²'),
            line=dict(width=1, color='black')
        ),
        text=[f"{row['Cuvette']}<br>{row['csv_filename']}<br>R²={row['R_squared']:.3f}"
              for _, row in varying_adh.iterrows()],
        hovertemplate='ADH: %{x:.3f} μg/mL<br>V/E: %{y:.6f}<br>%{text}<extra></extra>'
    ))

    fig.update_layout(
        title='Effect of ADH Concentration on PDC Reaction Rate<br>(Varying Adh Assay Group)',
        xaxis_title='ADH Concentration (μg/mL)',
        yaxis_title='V/E (mM/s per μg/mL PDC)',
        height=500,
        showlegend=False
    )

    fig.show()

    # The leading blank line is written as an escape: a string literal split across
    # two physical lines is a syntax error.
    print(f"\nVarying Adh group statistics:")
    print(f"  Number of assays: {len(varying_adh)}")
    print(f"  ADH range: {varying_adh['Adh_ug_ml'].min():.3f} - {varying_adh['Adh_ug_ml'].max():.3f} μg/mL")
    print(f"  V/E range: {varying_adh['V_over_E'].min():.6f} - {varying_adh['V_over_E'].max():.6f}")
else:
    print("No 'Varying Adh' assays found in the dataset")