# Process data from PDC assays
## Notes
DO 1-26-2026 I'm re-creating this file with the help of claude code

## Analysis plan
* Load the "Enzyme_assay_metadata" spreadsheet and identify the assays we want to process
* Find all of the .csv files with PDC enzyme assay data
* For each csv file:
  * Add filename information
  * Measure initial pyruvate
    * Determine the expected initial pyruvate concentration (Pyruvate_mM) and Blank_time_s from the Enzyme_assay_metadata dataframe
    * Calculate the pyruvate concentration using the _calculate_blank_pyruvate() function imported from the "Compiling_spectrum_data.ipynb" notebook in the "Spectrum files from Agilent spec" folder
    * If the difference from the expected pyruvate concentration is >50%, throw a warning and use the expected pyruvate concentration instead (note, it might make senese to update this in the _calculate_blank_pyruvate() function
  * Measure NADH concentration
    * use the process_pdc_timecourse() function

* Combine the data into a single pandas dataframe for plotting
* Plot NADH concentration vs. offset time (i.e. where the assay start time has been shifted to zero) for all samples. This will allow us to do a rough examination of the data

* Data processing for subsequent analysis:
  * For each assay, measure the maximum slope (V), after the assay start.
  * Normalize V to the enzyme concentration (V/E)

* Determine the effect of Adh enzyme concentration
  * Select only the "Varying Adh" assay group
  * Plot V/E vs. the Adh concentration

* Create a kcat plot
  * Plot V/E vs. the substrate concentration
  * Adjust the units so that we can measure kcat directly from the plot
  * Color by filename

* Measure NADH degradation (see if we have good enough data for this)

* Convert to an EnzymeML file
* Upload EnzymeML file, colab notebook, and raw data to Janis Shin's github folder for subsequent modeling.




In [28]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from pathlib import Path
import sys

# Add parent directory to path to import pda modules
parent_path = str(Path.cwd().parent.parent)
if parent_path not in sys.path:
    sys.path.insert(0, parent_path)

from pda.data_io import load_kinetic_data
from pda.spectral import calculate_concentrations
from pda.timecourse import process_pdc_timecourse

print("Setup complete! Imported all required modules.")
print("Autoreload enabled - modules will be reloaded automatically.")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Setup complete! Imported all required modules.
Autoreload enabled - modules will be reloaded automatically.


In [29]:
# Load metadata
metadata_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRVpwYqImFkaUigsWgrO9MRtWjYWwps82EExnomLqNr_hOUNViKF_fFyAhJfIqe3hDq0IEG76W4v_fO/pub?output=csv"
metadata_df = pd.read_csv(metadata_url)

# Load standards
standards_df = pd.read_csv("../spectra_data/NADH_Pyruvate_Standards.csv")

print(f"Loaded {len(metadata_df)} assays from metadata")
print(f"Loaded {len(standards_df)} wavelength points from standards")

Loaded 129 assays from metadata
Loaded 858 wavelength points from standards


In [30]:
# Filter for PDC forward assays that are not flagged to ignore
pdc_assays = metadata_df[
    (metadata_df['Assay'] == 'PDC_fwd') & 
    (metadata_df['Ignore'].isna())
].copy()

print(f"Found {len(pdc_assays)} PDC assays to process")
print(f"Assay groups: {pdc_assays['Assay Group'].unique()}")
print(f"Unique files: {pdc_assays['Filename'].nunique()}")

Found 96 PDC assays to process
Assay groups: ['Varying Adh' 'Varying pyr low NADH' 'Varying pyr high NADH']
Unique files: 32


In [31]:
# Create mapping from metadata filenames (.KD) to actual CSV files
assay_data_path = Path("../assay_data")

# Add CSV filename column
pdc_assays['csv_filename'] = pdc_assays['Filename'].str.replace('.KD', '.csv', regex=False)

# Check which files exist
pdc_assays['csv_exists'] = pdc_assays['csv_filename'].apply(
    lambda x: (assay_data_path / x).exists()
)

print(f"Total PDC assays: {len(pdc_assays)}")
print(f"CSV files found: {pdc_assays['csv_exists'].sum()}")
print(f"CSV files missing: {(~pdc_assays['csv_exists']).sum()}")

if (~pdc_assays['csv_exists']).any():
    print("Missing files:")
    print(pdc_assays[~pdc_assays['csv_exists']]['csv_filename'].unique())

# Filter to only assays with existing CSV files
pdc_assays = pdc_assays[pdc_assays['csv_exists']].copy()
print(f"Processing {len(pdc_assays)} assays with available CSV files")

Total PDC assays: 96
CSV files found: 66
CSV files missing: 30
Missing files:
['0116 1600 800 400MM PYR-1.csv' '0116 200 100 40MM PYR-2.csv'
 '0116 20 16 8 4MM PYR-4.csv' '0120 1600 800 400MM PYR-1.csv'
 '0120 200 100 40MM PYR-2.csv' '0120 20 16 8 4MM PYR-3.csv'
 '0121 1600 800 400MM PYR-1.csv' '0121 200 100 40MM PYR-2.csv'
 '0121 20 16 8 4MM PYR-3.csv']
Processing 66 assays with available CSV files


In [32]:
# Check available columns in metadata
print("Available columns in pdc_assays:")
print(list(pdc_assays.columns))
print("\nSample of data:")
print(pdc_assays.head())

Available columns in pdc_assays:
['Experiment_ID', 'Ignore', 'Filename', 'Assay', 'Assay Group', 'Cuvette', 'Start_time_s', 'Mask_until_s', 'Blank_time_s', 'Blank_340', 'Volume_ul', 'Temperature_C', 'pH', 'Tris-HCl_mM', 'TPP_mM', 'MgCl2_mM', 'Pyruvate_mM', 'Acetaldehyde_mM', 'Ethanol_mM', 'NADH_mM', 'NAD_mM', 'Adh_ug_ml', 'Pdc_ug_ml', 'csv_filename', 'csv_exists']

Sample of data:
   Experiment_ID Ignore        Filename    Assay  Assay Group Cuvette  \
15      Assay 11    NaN   1222 PDC-9.KD  PDC_fwd  Varying Adh  CELL_1   
16      Assay 11    NaN   1222 PDC-9.KD  PDC_fwd  Varying Adh  CELL_2   
17      Assay 11    NaN   1222 PDC-9.KD  PDC_fwd  Varying Adh  CELL_3   
18      Assay 11    NaN  1222 PDC-11.KD  PDC_fwd  Varying Adh  CELL_1   
19      Assay 11    NaN  1222 PDC-11.KD  PDC_fwd  Varying Adh  CELL_2   

    Start_time_s  Mask_until_s  Blank_time_s  Blank_340  ...  MgCl2_mM  \
15         397.4         461.4         115.8     0.0040  ...       5.0   
16         397.4         416.

## Process all PDC assays

Loop through each assay and:
- Measure initial pyruvate concentration
- Process timecourse data
- Measure final pyruvate concentration
- Calculate maximum slope (V) and normalize by enzyme concentration (V/E)

In [None]:
# Process all PDC assays
from pda.data_io import extract_spectrum_at_time
from pda.timecourse import calculate_max_slope

all_timecourse_data = []
all_summary_data = []
errors = []

total_assays = len(pdc_assays)

for idx, (_, assay) in enumerate(pdc_assays.iterrows(), 1):
    print(f"Processing {idx}/{total_assays}: {assay['csv_filename']} - {assay['Cuvette']}")
    
    try:
        # Load spectral data
        csv_path = assay_data_path / assay['csv_filename']
        spectral_df = load_kinetic_data(str(csv_path), sample_filter=assay['Cuvette'])
        
        # Measure initial pyruvate concentration at blank time
        initial_pyr_expected = assay['Pyruvate_mM']
        blank_time = assay['Blank_time_s']
        
        if pd.notna(blank_time):
            # Extract spectrum at blank time
            blank_spectrum = extract_spectrum_at_time(
                spectral_df, 
                target_time=blank_time,
                min_wavelength=320,
                max_wavelength=420,
                verbose=False
            )
            
            # Calculate pyruvate with NADH fixed at 0
            blank_result = calculate_concentrations(
                spectrum_df=blank_spectrum,
                standards_df=standards_df,
                wavelength_range=(320, 420),
                absorbance_max=1.5,
                fit_intercept=True,
                fixed_nadh=0.0,
                plot=False
            )
            
            initial_pyr_measured = blank_result['PYR_Conc']
            
            # Check if measured pyruvate differs from expected by >50%
            if initial_pyr_measured is not None and initial_pyr_expected is not None:
                pyr_diff_pct = abs(initial_pyr_measured - initial_pyr_expected) / initial_pyr_expected * 100
                if pyr_diff_pct > 50:
                    print(f"  WARNING: Measured pyruvate ({initial_pyr_measured:.2f} mM) differs from expected "
                          f"({initial_pyr_expected:.2f} mM) by {pyr_diff_pct:.1f}%. Using expected value.")
                    use_expected_pyr = True
                else:
                    use_expected_pyr = False
            else:
                use_expected_pyr = True
                initial_pyr_measured = None
        else:
            print(f"  WARNING: No blank_time specified, cannot measure initial pyruvate")
            initial_pyr_measured = None
            use_expected_pyr = True
        
        # Process timecourse
        results = process_pdc_timecourse(
            spectral_df=spectral_df,
            standards_df=standards_df,
            assay_start_time=assay['Start_time_s'],
            blank_time=assay['Blank_time_s'],
            initial_pyruvate_mM=assay['Pyruvate_mM'],
            method='constrained',
            wavelength_range=(320, 420),
            absorbance_max=1.5,
            fit_intercept=True,
            plot=False,
            verbose=False
        )
        
        # Measure final pyruvate concentration
        if len(results) > 0:
            final_row = results.iloc[-1]
            final_nadh = final_row['NADH_mM']
            
            # Extract spectrum at final time
            final_spectrum = extract_spectrum_at_time(
                spectral_df,
                target_time=final_row['Time_s'],
                min_wavelength=320,
                max_wavelength=420,
                verbose=False
            )
            
            # Calculate pyruvate with NADH fixed at measured value
            final_result = calculate_concentrations(
                spectrum_df=final_spectrum,
                standards_df=standards_df,
                wavelength_range=(320, 420),
                absorbance_max=1.5,
                fit_intercept=True,
                fixed_nadh=final_nadh,
                plot=False
            )
            
            final_pyr_measured = final_result['PYR_Conc']
        else:
            final_pyr_measured = None
        
        # Calculate maximum slope
        slope_data = calculate_max_slope(
            timecourse_data=results,
            window_fraction=0.2,
            min_window_size=5,
            min_r_squared=0.9,
            slope_direction='negative',
            plot=False
        )
        
        # Add metadata columns to timecourse results
        results['Filename'] = assay['Filename']
        results['csv_filename'] = assay['csv_filename']
        results['Cuvette'] = assay['Cuvette']
        
        all_timecourse_data.append(results)
        
        # Create summary entry
        summary_entry = {
            'Filename': assay['Filename'],
            'csv_filename': assay['csv_filename'],
            'Cuvette': assay['Cuvette'],
            'initial_pyr_expected': initial_pyr_expected,
            'initial_pyr_measured': initial_pyr_measured,
            'used_expected_pyr': use_expected_pyr,
            'final_pyr_measured': final_pyr_measured,
            'n_timepoints': len(results),
            'Pdc_ug_ml': assay['Pdc_ug_ml']
        }
        
        # Add slope data to summary (excluding the 'fig' key)
        summary_entry.update({k: v for k, v in slope_data.items() if k != 'fig'})
        
        all_summary_data.append(summary_entry)
        
        print(f"  âœ“ Processed {len(results)} timepoints")
        
    except Exception as e:
        error_msg = f"{assay['csv_filename']} - {assay['Cuvette']}: {str(e)}"
        errors.append(error_msg)
        print(f"  âœ— ERROR: {str(e)}")
        continue

# Combine all timecourse data
timecourse_df = pd.concat(all_timecourse_data, ignore_index=True)
summary_df = pd.DataFrame(all_summary_data)

print(f"\n{'='*60}")
print(f"Processing complete!")
print(f"Successfully processed: {len(all_timecourse_data)}/{total_assays} assays")
print(f"Total timepoints: {len(timecourse_df)}")
print(f"Errors: {len(errors)}")

if errors:
    print("\nErrors encountered:")
    for error in errors:
        print(f"  - {error}")

## QC Plots

### Overview plot: All NADH timecourses

## QC Plots

### Overview plot: All NADH timecourses

In [35]:
# Plot all NADH timecourses overlaid
fig = go.Figure()

# Add a trace for each assay
for _, assay in pdc_assays.iterrows():
    mask = (timecourse_df['Filename'] == assay['Filename']) & (timecourse_df['Cuvette'] == assay['Cuvette'])
    assay_data = timecourse_df[mask]
    
    if len(assay_data) > 0:
        fig.add_trace(
            go.Scatter(
                x=assay_data['Time_Relative_s'],
                y=assay_data['NADH_mM'],
                mode='lines',
                name=f"{assay['csv_filename']} - {assay['Cuvette']}",
                hovertemplate='%{fullData.name}<br>Time: %{x:.1f} s<br>NADH: %{y:.4f} mM<extra></extra>'
            )
        )

fig.update_layout(
    title='All PDC Assays - NADH Concentration vs Time',
    xaxis_title='Time from Assay Start (s)',
    yaxis_title='NADH Concentration (mM)',
    template='plotly_white',
    height=600,
    hovermode='closest'
)

fig.show()

### Individual plots with maximum slope highlighted

In [36]:
# Plot each assay individually with max slope highlighted
from plotly.subplots import make_subplots

# Determine number of rows needed (3 plots per row)
n_assays = len(pdc_assays)
n_cols = 3
n_rows = (n_assays + n_cols - 1) // n_cols

# Create subplots
fig = make_subplots(
    rows=n_rows, 
    cols=n_cols,
    subplot_titles=[f"{row['csv_filename']}<br>{row['Cuvette']}" 
                    for _, row in pdc_assays.iterrows()],
    vertical_spacing=0.08,
    horizontal_spacing=0.06
)

for idx, (_, assay) in enumerate(pdc_assays.iterrows()):
    row = (idx // n_cols) + 1
    col = (idx % n_cols) + 1
    
    # Get timecourse data
    mask = (timecourse_df['Filename'] == assay['Filename']) & (timecourse_df['Cuvette'] == assay['Cuvette'])
    assay_data = timecourse_df[mask]
    
    if len(assay_data) > 0:
        # Plot NADH data
        fig.add_trace(
            go.Scatter(
                x=assay_data['Time_Relative_s'],
                y=assay_data['NADH_mM'],
                mode='markers',
                marker=dict(size=3, color='blue'),
                name='NADH',
                showlegend=False
            ),
            row=row, col=col
        )
        
        # Get slope data from summary
        summary_row = summary_df[
            (summary_df['Filename'] == assay['Filename']) & 
            (summary_df['Cuvette'] == assay['Cuvette'])
        ]
        
        if len(summary_row) > 0 and pd.notna(summary_row.iloc[0]['V_max_slope']):
            slope_start = summary_row.iloc[0]['slope_start_time']
            slope_end = summary_row.iloc[0]['slope_end_time']
            slope = summary_row.iloc[0]['V_max_slope']
            intercept = summary_row.iloc[0]['slope_intercept']
            r_squared = summary_row.iloc[0]['slope_r_squared']
            
            # Calculate fitted line
            time_fit = np.array([slope_start, slope_end])
            nadh_fit = slope * time_fit + intercept
            
            # Add linear fit line
            fig.add_trace(
                go.Scatter(
                    x=time_fit,
                    y=nadh_fit,
                    mode='lines',
                    line=dict(color='red', width=2),
                    name=f'V={slope:.4f}',
                    showlegend=False
                ),
                row=row, col=col
            )
            
            # Add annotation with slope value
            fig.add_annotation(
                x=slope_start,
                y=nadh_fit[0],
                text=f"V={slope:.4f} mM/s<br>RÂ²={r_squared:.3f}",
                showarrow=False,
                xanchor='left',
                yanchor='bottom',
                font=dict(size=8),
                row=row, col=col
            )

# Update axes
for i in range(1, n_rows + 1):
    for j in range(1, n_cols + 1):
        fig.update_xaxes(title_text="Time (s)" if i == n_rows else "", row=i, col=j)
        fig.update_yaxes(title_text="NADH (mM)" if j == 1 else "", row=i, col=j)

fig.update_layout(
    title='Individual Assay Plots with Maximum Slope',
    height=300 * n_rows,
    template='plotly_white',
    showlegend=False
)

fig.show()

ValueError: Vertical spacing cannot be greater than (1 / (rows - 1)) = 0.047619.
The resulting plot would have 22 rows (rows=22).

## Summary of processed data

In [None]:
# Display summary information
print("="*80)
print("TIMECOURSE DATAFRAME")
print("="*80)
print(f"Shape: {timecourse_df.shape}")
print(f"\nColumns: {list(timecourse_df.columns)}")
print(f"\nFirst few rows:")
print(timecourse_df.head())
print(f"\nData types:")
print(timecourse_df.dtypes)

print("\n" + "="*80)
print("SUMMARY DATAFRAME")
print("="*80)
print(f"Shape: {summary_df.shape}")
print(f"\nColumns: {list(summary_df.columns)}")
print(f"\nFirst few rows:")
print(summary_df.head())
print(f"\nData types:")
print(summary_df.dtypes)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"\nInitial pyruvate (measured):")
print(summary_df['initial_pyr_measured'].describe())
print(f"\nFinal pyruvate (measured):")
print(summary_df['final_pyr_measured'].describe())
print(f"\nMaximum slope (V):")
print(summary_df['V_max_slope'].describe())
print(f"\nPDC concentration (ug/ml):")
print(summary_df['Pdc_ug_ml'].describe())

## to-do 1-27-2026
* During testing, just process one dataset
* Do the plotting of NADH vs. time and V/E measurements in the processing loop
* Eventually the slope calculation function should be moved into the timecourse.py folder