<a href="https://colab.research.google.com/github/eoinleen/AKTA-Prime_final/blob/main/Final_SEC_AKTA_prime_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import AutoMinorLocator
import os
import warnings
from pathlib import Path

# ==============================================================================
# CONFIGURATION SECTION - MODIFY THESE VALUES AS NEEDED
# ==============================================================================
# Every value below except FILE_PATH seeds the default configuration built in
# analyze_sec_data() and can be overridden per call through its `config` dict.
#
# File path options:
# Option 1: Set to None to prompt for file upload
# Option 2: Set to a Google Drive path like '/content/drive/MyDrive/SEC-data/file.xls'
# Option 3: Set to a local path like './data/file.xls'
# If a path is set but does not exist, get_file_path() prints a warning and
# falls back to the upload prompt.
FILE_PATH = None  # Set to None to trigger file upload prompt

# Alternative: Uncomment and modify one of these examples:
# FILE_PATH = '/content/drive/MyDrive/SEC-data/20250326-human_mysm1.xls'  # Google Drive
# FILE_PATH = './local_data/my_sec_file.xls'  # Local file

# Fraction collection range (in mL). Both bounds are inclusive; rows whose
# fraction volume falls inside the range are drawn as red markers on the plot.
FRACTION_START = 60.65
FRACTION_END = 72.75

# Concentrated fractions range (subset of fractions to be concentrated).
# Both bounds are inclusive.
# Specific fractions 59-65 (volumes from the list provided)
CONCENTRATE_START = 62.85  # Starting point for concentration (Fraction 59)
CONCENTRATE_END = 69.45    # Ending point for concentration (Fraction 65)

# Display range on the x-axis (in mL)
X_AXIS_MIN = 35
X_AXIS_MAX = 95

# Direct column indices for your specific AKTA file format
# These are 0-based positional indices (first column = 0).
# Note: the volume values for the conductivity trace are read from
# column VOLUME_COLUMN + 2 (see analyze_sec_data).
VOLUME_COLUMN = 0        # ml (volume) column
ABSORBANCE_COLUMN = 1    # mAu (absorbance) column
CONDUCTIVITY_COLUMN = 3  # mS/cm (conductivity) column
FRACTION_VOL_COLUMN = 12  # ml for fractions
FRACTION_NUM_COLUMN = 13  # Fraction numbers

# Other settings
SHOW_CONDUCTIVITY = True      # Show conductivity trace on a secondary y-axis
SAVE_OUTPUT = True            # Master switch: when False the plot is neither saved nor downloaded
AUTO_DOWNLOAD = True          # Auto-download plot instead of saving to OUTPUT_PATH
OUTPUT_PATH = './sec_plot.png'  # Where to save the plot (used if AUTO_DOWNLOAD is False)
DOWNLOAD_FILENAME = 'sec_analysis_plot.png'  # Filename for auto-download
SHOW_LEGEND = False           # Whether to display the legend
SHOW_FRACTION_LABEL = False   # Whether to show the fraction range label
SHOW_CONCENTRATION = True     # Whether to show concentrated fractions
# ==============================================================================

def get_file_path():
    """
    Get file path either from configuration or by prompting user for upload.

    Resolution order:
      1. The module-level FILE_PATH, when it is set and exists on disk.
      2. Google Colab's upload dialog, when ``google.colab`` is importable.
      3. An ipywidgets ``FileUpload`` widget, when ipywidgets and IPython are
         importable. The widget is only displayed; its content is not read
         here, so this branch returns None.
      4. An interactive ``input()`` prompt asking for a path.

    Returns:
        str or None: Path to the data file, or None when only the upload
        widget was displayed and no path is available yet.

    Raises:
        ValueError: If the Colab upload dialog returns no file.
        FileNotFoundError: If the user declines to retry after entering a
            path that does not exist.
    """
    # FILE_PATH is only read, never rebound, so no `global` statement is needed.
    if FILE_PATH is not None:
        if os.path.exists(FILE_PATH):
            print(f"Using specified file: {FILE_PATH}")
            return FILE_PATH
        print(f"Warning: Specified file path does not exist: {FILE_PATH}")
        print("Falling back to file upload prompt...")

    # Prompt for file upload
    print("\n" + "="*60)
    print("SEC DATA ANALYZER - FILE UPLOAD")
    print("="*60)
    print("Please upload your AKTA SEC data file (.xls or .xlsx)")
    print("The file should contain columns for:")
    print("- Volume (ml)")
    print("- Absorbance (mAu)")
    print("- Conductivity (mS/cm)")
    print("- Fraction data")
    print("="*60)

    # Each try block guards only the import, so errors raised by the upload
    # itself are never mistaken for "environment not available".
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is not None:
        print("Google Colab detected - using Colab file upload...")
        uploaded = files.upload()
        if not uploaded:
            raise ValueError("No file was uploaded")

        # Get the first (and presumably only) uploaded file
        filename = next(iter(uploaded))
        print(f"File uploaded successfully: {filename}")
        return filename

    # Not in Colab: try the ipywidgets uploader for Jupyter.
    try:
        from ipywidgets import FileUpload
        from IPython.display import display
    except ImportError:
        # Neither notebook front end is available: ask for a path on stdin.
        return _prompt_for_existing_path()

    print("Jupyter environment detected - using widget upload...")
    uploader = FileUpload(
        accept='.xls,.xlsx',
        multiple=False,
        description='Upload SEC Data'
    )
    display(uploader)

    # The widget cannot be awaited from here, so the caller is told to re-run.
    print("Please use the upload widget above to select your file.")
    print("After uploading, run the analysis again.")
    return None


def _prompt_for_existing_path():
    """Ask on stdin for a file path until an existing one is given or the user quits."""
    print("Manual file path input mode...")
    print("Please ensure your file is accessible and enter the full path:")

    while True:
        file_path = input("Enter file path: ").strip()
        if not file_path:
            print("No path entered. Please try again.")
            continue

        if os.path.exists(file_path):
            print(f"File found: {file_path}")
            return file_path

        print(f"File not found: {file_path}")
        if input("Try again? (y/n): ").strip().lower() != 'y':
            raise FileNotFoundError("No valid file path provided")

def analyze_sec_data(file_path=None, config=None):
    """
    Analyze Size Exclusion Chromatography data from AKTA and create visualization

    Args:
        file_path (str, optional): Path to the XLS/XLSX file. If None,
            get_file_path() is used to resolve one.
        config (dict, optional): Overrides merged on top of the defaults built
            from the module-level configuration constants. Recognised keys:
            'header_row', 'fraction_range', 'concentrate_range',
            'x_axis_limits', 'show_conductivity', 'save_output',
            'auto_download', 'output_path', 'download_filename',
            'volume_column', 'absorbance_column', 'conductivity_column',
            'fraction_vol_column', 'fraction_num_column', 'show_legend',
            'show_fraction_label', 'show_concentration'.

    Returns:
        tuple: (DataFrame, dict, Figure) - Processed data, analysis results, and plot.
        The results dict has the keys 'max_absorbance' (value, volume),
        'fractions' (range, count) and 'concentrate' (range, count).
        (None, None, None) is returned when no file could be obtained or the
        file contains no row with both a numeric volume and absorbance.

    Raises:
        FileNotFoundError: If file_path does not exist.
        RuntimeError: If the file cannot be read or parsed (this includes an
            unsupported file extension). The original exception is chained.
    """
    # Get file path if not provided
    if file_path is None:
        file_path = get_file_path()
        if file_path is None:
            print("No file provided. Please upload a file and try again.")
            return None, None, None

    # Default configuration
    default_config = {
        'header_row': 2,  # Using row 2 (third row) which usually has ml, mAu etc.
        'fraction_range': (FRACTION_START, FRACTION_END),
        'concentrate_range': (CONCENTRATE_START, CONCENTRATE_END),
        'x_axis_limits': (X_AXIS_MIN, X_AXIS_MAX),
        'show_conductivity': SHOW_CONDUCTIVITY,
        'save_output': SAVE_OUTPUT,
        'auto_download': AUTO_DOWNLOAD,
        'output_path': OUTPUT_PATH,
        'download_filename': DOWNLOAD_FILENAME,
        'volume_column': VOLUME_COLUMN,
        'absorbance_column': ABSORBANCE_COLUMN,
        'conductivity_column': CONDUCTIVITY_COLUMN,
        'fraction_vol_column': FRACTION_VOL_COLUMN,
        'fraction_num_column': FRACTION_NUM_COLUMN,
        'show_legend': SHOW_LEGEND,
        'show_fraction_label': SHOW_FRACTION_LABEL,
        'show_concentration': SHOW_CONCENTRATION
    }

    # Caller-supplied values win; `conf` is a fresh dict, so neither the
    # defaults nor the caller's dict is mutated later on.
    if config is None:
        config = {}
    conf = {**default_config, **config}

    # Verify input file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Input file not found: {file_path}")

    # Load data from file
    print(f"\nLoading data from {file_path}...")
    try:
        df = _load_sec_dataframe(file_path, conf)
    except Exception as e:
        # Keep the RuntimeError contract, but chain the cause for debugging.
        raise RuntimeError(f"Error reading data file: {e}") from e

    if len(df) == 0:
        print("No valid data points found!")
        return None, None, None

    # Analyze data
    results = {}

    # Find the main peak
    max_absorbance = df['mAu'].max()
    max_absorbance_vol = df.loc[df['mAu'].idxmax(), 'ml1']
    results['max_absorbance'] = {
        'value': max_absorbance,
        'volume': max_absorbance_vol
    }
    print(f"\nMain peak detected: {max_absorbance:.2f} mAU at {max_absorbance_vol:.2f} ml")

    # Filter for fraction range (both bounds inclusive; NaN volumes never match)
    fraction_start, fraction_end = conf['fraction_range']
    fractions_df = df[df['ml4'].between(fraction_start, fraction_end)]
    results['fractions'] = {
        'range': conf['fraction_range'],
        'count': len(fractions_df)
    }
    print(f"Selected {len(fractions_df)} data points in fraction range {fraction_start:.2f}-{fraction_end:.2f} ml")

    # Filter for concentrated fractions
    concentrate_start, concentrate_end = conf['concentrate_range']
    concentrate_df = df[df['ml4'].between(concentrate_start, concentrate_end)]
    results['concentrate'] = {
        'range': conf['concentrate_range'],
        'count': len(concentrate_df)
    }
    print(f"Selected {len(concentrate_df)} data points in concentration range {concentrate_start:.2f}-{concentrate_end:.2f} ml")

    # Fraction information is for console display only (not for labeling)
    _print_fraction_table(df, conf['fraction_range'], conf['concentrate_range'])

    # Create visualization
    fig, ax1 = create_plot(df, fractions_df, concentrate_df, conf, results)

    # Handle output based on settings
    if conf['save_output']:
        if conf['auto_download']:
            # Auto-download the plot
            download_plot(fig, conf['download_filename'])
        else:
            # Save this figure explicitly rather than whichever one pyplot
            # currently considers active.
            save_path = conf['output_path']
            fig.savefig(save_path, dpi=300, bbox_inches='tight', transparent=True)
            print(f"\nPlot saved to {save_path}")

    # Return processed data and results
    return df, results, fig


def _load_sec_dataframe(file_path, conf):
    """Read the Excel export and return the cleaned analysis DataFrame.

    The returned frame has the columns 'ml1', 'mAu', 'ml2', 'mS/cm', 'ml4' and
    'Fractions'; rows lacking a numeric volume or absorbance are dropped.
    Sets conf['show_conductivity'] to False when the conductivity columns are
    not present in the file.
    """
    # Handle both .xls and .xlsx files
    file_extension = Path(file_path).suffix.lower()
    if file_extension not in ('.xls', '.xlsx'):
        raise ValueError(f"Unsupported file format: {file_extension}. Please use .xls or .xlsx files.")

    # Silence DtypeWarning only while reading, instead of changing the
    # process-wide warning filters permanently.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)
        df_raw = pd.read_excel(file_path, header=conf['header_row'])

    n_cols = len(df_raw.columns)
    print(f"Found {n_cols} columns in the data file")
    print(f"File contains {len(df_raw)} rows of data")

    # Extract data using the specified (positional) column indices
    df = pd.DataFrame()
    df['ml1'] = pd.to_numeric(df_raw.iloc[:, conf['volume_column']], errors='coerce')
    df['mAu'] = pd.to_numeric(df_raw.iloc[:, conf['absorbance_column']], errors='coerce')

    # Conductivity data; its own volume column is taken two columns to the
    # right of the main volume column.
    cond_col = conf['conductivity_column']
    cond_vol_col = conf['volume_column'] + 2
    if max(cond_col, cond_vol_col) < n_cols:
        df['ml2'] = pd.to_numeric(df_raw.iloc[:, cond_vol_col], errors='coerce')
        df['mS/cm'] = pd.to_numeric(df_raw.iloc[:, cond_col], errors='coerce')
    else:
        df['ml2'] = df['ml1']
        df['mS/cm'] = 0
        conf['show_conductivity'] = False
        print("Warning: Conductivity column not found, disabling conductivity display")

    # Fraction data
    frac_vol_col = conf['fraction_vol_column']
    frac_num_col = conf['fraction_num_column']
    if frac_vol_col < n_cols:
        df['ml4'] = pd.to_numeric(df_raw.iloc[:, frac_vol_col], errors='coerce')
        if frac_num_col < n_cols:
            raw_labels = df_raw.iloc[:, frac_num_col]
            # Empty cells must stay empty strings; a plain astype(str) would
            # turn them into the literal text 'nan'.
            df['Fractions'] = raw_labels.astype(str).where(raw_labels.notna(), "")
        else:
            df['Fractions'] = ""
            print("Warning: Fraction number column not found")
    else:
        df['ml4'] = df['ml1']
        df['Fractions'] = ""
        print("Warning: Fraction volume column not found")

    # Clean the data
    df = df.dropna(subset=['ml1', 'mAu'])

    # Print data ranges for verification
    print("\nData Summary:")
    print(f"Absorbance (mAu) range: {df['mAu'].min():.3f} to {df['mAu'].max():.3f}")
    print(f"Volume (ml1) range: {df['ml1'].min():.2f} to {df['ml1'].max():.2f}")
    if conf['show_conductivity']:
        print(f"Conductivity range: {df['mS/cm'].min():.3f} to {df['mS/cm'].max():.3f}")

    print(f"Loaded {len(df)} valid data points successfully")
    return df


def _extract_fraction_number(val):
    """Return the first run of digits in a fraction label, or the label itself if it has none."""
    import re

    if pd.isna(val) or val == '':
        return ''
    text = str(val)
    match = re.search(r'(\d+)', text)
    return match.group(1) if match else text


def _print_fraction_table(df, fraction_range, concentrate_range):
    """Print the labelled fractions within 10 ml of the fraction range, flagging concentrated ones."""
    fraction_start, fraction_end = fraction_range
    concentrate_start, concentrate_end = concentrate_range

    fraction_rows = df[df['Fractions'].str.strip() != ''].copy()
    if len(fraction_rows) == 0:
        print("\nNo fraction information found in the data")
        return

    fraction_rows['FractionNum'] = fraction_rows['Fractions'].apply(_extract_fraction_number)

    print("\nFraction Information (Volume vs. Fraction Number):")
    print("------------------------------------------------")

    # Filter to fractions near our range
    nearby = fraction_rows['ml4'].between(fraction_start - 10, fraction_end + 10)
    display_fractions = fraction_rows[nearby].sort_values(by='ml4')

    for volume, number in zip(display_fractions['ml4'], display_fractions['FractionNum']):
        is_concentrated = concentrate_start <= volume <= concentrate_end
        conc_indicator = "* CONCENTRATED *" if is_concentrated else ""
        print(f"Volume: {volume:.2f} ml - Fraction: {number} {conc_indicator}")

def download_plot(fig, filename):
    """
    Auto-download plot to browser downloads folder

    Three strategies are tried in order:
      1. Google Colab: the figure is written to ``filename`` as PNG, handed to
         ``google.colab.files.download`` and the file is removed afterwards.
      2. IPython/Jupyter: an HTML link carrying the PNG as a base64 data URI
         is displayed for the user to click.
      3. Otherwise the figure is saved to ``filename`` in the working
         directory.

    Args:
        fig: Matplotlib figure object
        filename (str): Desired filename for download
    """
    # Guard only the import, so an ImportError raised while downloading is
    # not misread as "not running in Colab".
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is not None:
        # Write the PNG straight to the temporary file used for the download.
        fig.savefig(filename, format='png', dpi=300, bbox_inches='tight', transparent=True)

        files.download(filename)
        print(f"\nPlot auto-downloaded as: {filename}")

        # Clean up temporary file; failing to delete it is harmless.
        try:
            os.remove(filename)
        except OSError:
            pass
        return

    # Not in Colab, try Jupyter/IPython method
    try:
        from IPython.display import HTML, display
    except ImportError:
        # Fallback to regular save
        print(f"\nAuto-download not available in this environment.")
        print(f"Saving plot as: {filename}")
        fig.savefig(filename, dpi=300, bbox_inches='tight', transparent=True)
        print(f"Plot saved successfully. Please manually download: {filename}")
        return

    import base64
    import html
    import io

    # Save figure to bytes buffer
    buf = io.BytesIO()
    fig.savefig(buf, format='png', dpi=300, bbox_inches='tight', transparent=True)

    # Create download link; the filename is escaped because it is interpolated
    # into both an HTML attribute and the link text.
    b64_data = base64.b64encode(buf.getvalue()).decode()
    safe_name = html.escape(str(filename), quote=True)
    download_link = f'''
            <a download="{safe_name}"
               href="data:image/png;base64,{b64_data}"
               style="background-color: #4CAF50; color: white; padding: 10px 20px;
                      text-decoration: none; border-radius: 5px; font-weight: bold;">
               📥 Download Plot ({safe_name})
            </a>
            '''

    print(f"\nClick the link below to download your plot:")
    display(HTML(download_link))

def create_plot(df, fractions_df, concentrate_df, conf, results):
    """
    Create visualization of SEC data

    Draws the absorbance trace on the primary axis, optionally the
    conductivity trace on a twin y-axis, red markers for the collected
    fractions and a green bar for the concentrated range.

    Note: this updates the global ``plt.rcParams`` font settings, which stay
    in effect for figures created afterwards.

    Args:
        df (DataFrame): Full SEC data; columns 'ml1', 'mAu', 'ml2' and 'mS/cm' are read
        fractions_df (DataFrame): Filtered data for fraction range (column 'ml4' is read)
        concentrate_df (DataFrame): Filtered data for concentration range
            (only checked for being non-empty)
        conf (dict): Configuration parameters; reads 'show_conductivity',
            'show_concentration', 'concentrate_range', 'show_fraction_label',
            'fraction_range', 'x_axis_limits' and 'show_legend'
        results (dict): Analysis results (currently not used by the plot)

    Returns:
        tuple: (Figure, Axis) - Matplotlib figure and primary axis
    """
    # Set font properties
    plt.rcParams.update({
        'font.family': 'DejaVu Sans',
        'font.size': 12,
        'axes.titlesize': 14,
        'axes.labelsize': 12,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10
    })

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(12, 7))

    # Plot mL versus mAU with black dashed line
    ax1.plot(df['ml1'], df['mAu'], linestyle='--', linewidth=1.5,
             color='black', label='UV 280nm')

    # Calculate appropriate y-axis limits with 10% headroom above the peak.
    # For a non-positive peak, multiplying by 1.1 would move the limit *below*
    # the peak, so the padding is applied towards zero instead.
    peak = df['mAu'].max()
    y_max = peak * 1.1 if peak > 0 else peak * 0.9
    y_min = min(0, df['mAu'].min())  # Include negative values if present
    if y_max <= y_min:
        # Completely flat trace at zero: give the axis a non-zero height.
        y_max = y_min + 1.0
    ax1.set_ylim(y_min, y_max)
    y_span = y_max - y_min

    # Add conductivity on secondary axis if enabled
    ax2 = None
    if conf['show_conductivity']:
        ax2 = ax1.twinx()
        ax2.plot(df['ml2'], df['mS/cm'], color='blue', alpha=0.6,
                 linestyle='-', linewidth=1, label='Conductivity')
        ax2.set_ylabel('Conductivity (mS/cm)', color='blue')
        ax2.tick_params(axis='y', colors='blue')
        ax2.spines['right'].set_color('blue')

    # Plot filtered fractions as red markers at the bottom
    if len(fractions_df) > 0:
        marker_y_pos = y_min + y_span * 0.01  # Just slightly above bottom

        # Use small squares instead of X markers
        ax1.scatter(fractions_df['ml4'], [marker_y_pos]*len(fractions_df),
                    marker='s', color='red', s=15, label='Fractions', zorder=3)

    # Add dark green line for concentrated fractions
    if conf['show_concentration'] and len(concentrate_df) > 0:
        concentrate_start, concentrate_end = conf['concentrate_range']

        # Same dark green for both the line and its label
        dark_green = '#006400'

        # Draw a thin line just above the fraction markers
        line_y_pos = y_min + y_span * 0.03
        ax1.plot([concentrate_start, concentrate_end], [line_y_pos, line_y_pos],
                 color=dark_green, linewidth=2, alpha=0.9, solid_capstyle='butt', zorder=4)

        # Add "Concentrated" label above the line
        text_y_pos = line_y_pos + y_span * 0.02
        ax1.text((concentrate_start + concentrate_end)/2, text_y_pos,
                'Concentrated', ha='center', va='bottom', color=dark_green,
                fontsize=10, fontweight='bold')

    # Add a simple label indicating the fraction range (optional)
    if conf['show_fraction_label']:
        fraction_start, fraction_end = conf['fraction_range']
        text_y_pos = y_min - y_span * 0.05  # Below the bottom of the axis
        ax1.text((fraction_start + fraction_end)/2, text_y_pos,
                f"Fractions: {fraction_start:.1f} - {fraction_end:.1f} ml",
                ha='center', va='top', color='red', fontsize=10)

    # Set the x-axis limits
    ax1.set_xlim(conf['x_axis_limits'])

    # Format plot appearance
    ax1.set_facecolor('none')    # Transparent background
    fig.patch.set_alpha(0.0)     # Transparent figure

    # Add minor ticks
    ax1.xaxis.set_minor_locator(AutoMinorLocator())
    ax1.yaxis.set_minor_locator(AutoMinorLocator())

    # Add light grid
    ax1.grid(True, linestyle=':', alpha=0.3)

    # Set title and labels on the primary axis explicitly; pyplot's implicit
    # "current axes" is the twin axis once conductivity has been added.
    ax1.set_title('AKTA Size Exclusion Chromatography Analysis')
    ax1.set_xlabel('Volume (mL)')
    ax1.set_ylabel('Absorbance (mAU)', color='black')

    # Create legend if requested
    if conf['show_legend']:
        if ax2 is not None:
            # Merge the entries of both axes into a single legend box
            lines1, labels1 = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
        else:
            ax1.legend(loc='upper left')

    fig.tight_layout()
    return fig, ax1

def run_analysis():
    """
    Run the whole workflow: analyze the data, then show the resulting plot.

    Any exception raised during the analysis is caught and reported on the
    console instead of being propagated.

    Returns:
        tuple: (DataFrame, dict, Figure) from analyze_sec_data() on success,
        or (None, None, None) when no figure was produced or an error occurred.
    """
    print("SEC Data Analyzer - Starting Analysis...")
    print("=" * 50)

    try:
        data, summary, figure = analyze_sec_data()
    except Exception as e:
        print(f"Error during analysis: {e}")
        return None, None, None

    # A missing figure means the analysis bailed out early.
    if figure is None:
        print("Analysis failed - no data to display")
        return None, None, None

    try:
        print("\nAnalysis completed successfully!")
        print("Displaying plot...")
        plt.show()
    except Exception as e:
        print(f"Error during analysis: {e}")
        return None, None, None

    return data, summary, figure

# Example usage
# Runs the full workflow when this file is executed directly (the guard is
# also true inside a notebook cell, but not when the module is imported).
if __name__ == "__main__":
    # Run the analysis; all three names are None if the analysis failed.
    df, results, fig = run_analysis()

    # Optionally, you can also call analyze_sec_data directly with a specific file:
    # df, results, fig = analyze_sec_data('/path/to/your/file.xls')