In [1]:
import os
import glob
import pandas as pd
import numpy as np
from io import StringIO


def detect_file_format(lines, debug=False):
    """
    Detects the file format from the header lines.
    
    Parameters:
      lines (list): List of header lines from the file.
      debug (bool): If True, prints debug info.
      
    Returns:
      str: Format descriptor string.
    """
    has_time_series = any("Time Series" in line for line in lines[:10])
    
    # Check sensor group line
    sensor_group_line = lines[3].strip()
    sensor_tokens = [token.strip() for token in sensor_group_line.split(',')]
    non_empty_tokens = [token for token in sensor_tokens if token]
    
    if len(non_empty_tokens) == 3 and "FDS" in sensor_group_line and "FCU" in sensor_group_line and "FCR" in sensor_group_line:
        return "FULL_FORMAT"
    elif len(non_empty_tokens) == 2 and "FCU" in sensor_group_line and "FCR" in sensor_group_line:
        if has_time_series:
            return "COMPACT_WITH_TIMESERIES"
        else:
            return "COMPACT_FORMAT"
    else:
        return "UNKNOWN_FORMAT"


def read_sensor_data_with_metadata(file_path, debug=False):
    """
    Enhanced reader that handles various EMG data formats:
    
    Format 1: 3 sensors (FDS, FCU, FCR) with ACC and GYRO data
    Example: 
        FDS (81770), , , , , , , FCU (81728), , , , , , , FCR (81745)
        sensor mode: 50, , , , , , , sensor mode: 50, , , , , , , sensor mode: 40
    
    Format 2: 2 sensors (FCU, FCR) with only EMG data, no time series
    Example:
        FCU (81770), FCR (81745)
        sensor mode: 40, sensor mode: 40
    
    Format 3: 2 sensors with time series columns
    Example:
        FCU (81770), , FCR (81745)
        sensor mode: 40, , sensor mode: 40
        EMG 1 Time Series (s), EMG 1 (mV), EMG 1 Time Series (s), EMG 1 (mV)
    
    It standardizes column names and fills in nulls for missing sensors/channels.
    
    Parameters:
      file_path (str): Path to the CSV file.
      debug (bool): If True, prints detailed debug output.
    
    Returns:
      df (pd.DataFrame): DataFrame with standardized columns.
      metadata (dict): Dictionary of parsed metadata.
    """
    with open(file_path, 'r') as f:
        all_lines = f.readlines()
    
    # Detect file format by scanning header
    file_format = detect_file_format(all_lines, debug=debug)
    
    metadata = {}
    # --- Parse first 3 lines (common for all formats) ---
    # Line 0: Application
    line = all_lines[0].strip()
    if ',' in line:
        key, value = line.split(',', 1)
        metadata[key.strip().rstrip(':')] = value.strip()
    else:
        metadata['Application'] = line

    # Line 1: Date/Time
    line = all_lines[1].strip()
    if ',' in line:
        key, value = line.split(',', 1)
        metadata[key.strip().rstrip(':')] = value.strip()
    else:
        metadata['Date/Time'] = line

    # Line 2: Collection Length (seconds)
    line = all_lines[2].strip()
    if ',' in line:
        key, value = line.split(',', 1)
        metadata[key.strip().rstrip(':')] = value.strip()
    else:
        metadata['Collection Length (seconds)'] = line

    # --- Determine if we have time series columns ---
    has_time_series = any("Time Series" in line for line in all_lines[:10])
    metadata['HasTimeSeries'] = has_time_series
    metadata['FileFormat'] = file_format
    
    # --- Parse sensor groups ---
    sensor_group_line = all_lines[3].strip()
    sensor_mode_line = all_lines[4].strip()
    
    if ',' in sensor_group_line and len(sensor_group_line.split(',')) > 1:
        # Multiple sensors format
        sensor_group_tokens = [token.strip() for token in sensor_group_line.split(',')]
        
        # For Format 3, filter out empty tokens
        if file_format == "COMPACT_WITH_TIMESERIES":
            non_empty_tokens = [token for token in sensor_group_tokens if token]
            sensor_group_tokens = non_empty_tokens
            
        # Propagate non-empty values forward
        sensor_groups = []
        last = None
        for token in sensor_group_tokens:
            if token:
                last = token
            sensor_groups.append(last if last is not None else "")
        
        # Similarly for sensor modes
        sensor_mode_tokens = [token.strip() for token in sensor_mode_line.split(',')]
        sensor_modes = []
        last_mode = None
        for token in sensor_mode_tokens:
            if token:
                last_mode = token
            sensor_modes.append(last_mode if last_mode is not None else "")
        
        # Store these in metadata
        metadata['SensorGroups'] = sensor_groups
        metadata['SensorModes'] = sensor_modes
        
        if debug:
            print(f"[read_sensor_data_with_metadata] Format detected: {file_format}")
            print(f"[read_sensor_data_with_metadata] SensorGroups: {sensor_groups}")
            print(f"[read_sensor_data_with_metadata] SensorModes: {sensor_modes}")
    else:
        # Legacy dataset: use line 3 and 4 as single values.
        metadata['Sensor'] = sensor_group_line
        metadata['Sensor Mode'] = sensor_mode_line

    # --- Header row for sensor data ---
    header_line = all_lines[5].strip()
    original_col_names = [col.strip() for col in header_line.split(',')]
    
    # Special handling for COMPACT_WITH_TIMESERIES format to avoid duplicate column names
    if file_format == "COMPACT_WITH_TIMESERIES" and len(sensor_groups) < len(original_col_names):
        if debug:
            print(f"[read_sensor_data_with_metadata] Time series format detected: {len(sensor_groups)} sensors, {len(original_col_names)} columns")
        
        # Get unique sensor names
        sensor_names = []
        for group in sensor_groups:
            sensor_name = group.split('(')[0].strip()
            sensor_names.append(sensor_name)
        
        # Create unique column names by alternating between sensors
        new_col_names = []
        for i, col in enumerate(original_col_names):
            # Calculate which sensor this column belongs to
            sensor_idx = i // (len(original_col_names) // len(sensor_names))
            if sensor_idx >= len(sensor_names):
                sensor_idx = len(sensor_names) - 1
            
            # Append sensor name to column for uniqueness
            sensor_name = sensor_names[sensor_idx]
            new_col_names.append(f"{col} - {sensor_name}")
        
        if debug:
            print(f"[read_sensor_data_with_metadata] Created unique column names for time series format: {new_col_names}")
    
    # Update column names by appending sensor group for other formats
    elif 'SensorGroups' in metadata:
        if len(metadata['SensorGroups']) >= len(original_col_names):
            new_col_names = []
            for i, col in enumerate(original_col_names):
                group = metadata['SensorGroups'][i]
                if group:
                    # Extract sensor name (e.g., "FCU" from "FCU (81770)")
                    sensor_name = group.split('(')[0].strip()
                    new_col_names.append(f"{col} - {sensor_name}")
                else:
                    new_col_names.append(col)
            if debug:
                print("[read_sensor_data_with_metadata] New column names set with sensor names.")
        else:
            # For other formats with insufficient sensor groups, make column names unique with index
            if debug:
                print(f"[read_sensor_data_with_metadata] Warning: Not enough sensor group entries ({len(metadata['SensorGroups'])}) for columns ({len(original_col_names)})")
            
            # Check for potential duplicates in original column names
            if len(set(original_col_names)) < len(original_col_names):
                if debug:
                    print("[read_sensor_data_with_metadata] Duplicate column names detected, making them unique")
                new_col_names = []
                for i, col in enumerate(original_col_names):
                    new_col_names.append(f"{col}_col{i}")
            else:
                new_col_names = original_col_names
    else:
        # For all other formats, ensure column names are unique
        if len(set(original_col_names)) < len(original_col_names):
            if debug:
                print("[read_sensor_data_with_metadata] Duplicate column names detected, making them unique")
            new_col_names = []
            for i, col in enumerate(original_col_names):
                new_col_names.append(f"{col}_col{i}")
        else:
            new_col_names = original_col_names

    if debug:
        print(f"[read_sensor_data_with_metadata] Using column names: {new_col_names}")

    # Determine sample rate line index (typically line 6)
    sample_rate_line_idx = 6
    
    # Read the sensor data (starting after sample rate line)
    data_str = ''.join(all_lines[sample_rate_line_idx + 1:])
    
    try:
        df = pd.read_csv(StringIO(data_str), header=None, names=new_col_names)
    except ValueError as e:
        if "Duplicate names" in str(e):
            if debug:
                print(f"[read_sensor_data_with_metadata] Error: {e}. Adding unique suffixes to column names.")
            
            # Add unique suffixes to ensure no duplicates
            unique_col_names = []
            for i, name in enumerate(new_col_names):
                unique_col_names.append(f"{name}_{i}")
            
            if debug:
                print(f"[read_sensor_data_with_metadata] Using modified column names: {unique_col_names}")
            
            df = pd.read_csv(StringIO(data_str), header=None, names=unique_col_names)
        else:
            raise

    # Handle timestamp creation based on format
    if has_time_series:
        # Find time series columns
        ts_cols = [col for col in df.columns if "Time Series" in col]
        if ts_cols:
            # Use the first time series column for timestamps
            ts_col = ts_cols[0]
            start_time = pd.to_datetime(metadata.get('Date/Time', None))
            df['Timestamp'] = start_time + pd.to_timedelta(df[ts_col], unit='s')
            if debug:
                print(f"[read_sensor_data_with_metadata] Created timestamps from {ts_col}")
        else:
            # Fall back to generated timestamps
            if debug:
                print("[read_sensor_data_with_metadata] No time series columns found despite HasTimeSeries=True")
            collection_length = float(metadata.get('Collection Length (seconds)', 0))
            start_time = pd.to_datetime(metadata.get('Date/Time', None))
            num_samples = len(df)
            time_offsets = np.linspace(0, collection_length, num_samples)
            df['Timestamp'] = start_time + pd.to_timedelta(time_offsets, unit='s')
    else:
        # Create a running Timestamp column from metadata
        collection_length = float(metadata.get('Collection Length (seconds)', 0))
        start_time = pd.to_datetime(metadata.get('Date/Time', None))
        num_samples = len(df)
        time_offsets = np.linspace(0, collection_length, num_samples)
        df['Timestamp'] = start_time + pd.to_timedelta(time_offsets, unit='s')

    # Add metadata columns to the DataFrame (except sensor group and mode lists)
    for key, value in metadata.items():
        if key not in ['SensorGroups', 'SensorModes']:
            df[key] = value
    
    # Check for required sensors and create placeholder columns if missing
    all_sensors = ['FDS', 'FCU', 'FCR']
    available_sensors = []
    
    # Identify which sensors are present
    for col in df.columns:
        for sensor in all_sensors:
            if f" - {sensor}" in col and sensor not in available_sensors:
                available_sensors.append(sensor)
    
    missing_sensors = [s for s in all_sensors if s not in available_sensors]
    
    # Add placeholders for missing sensors
    if missing_sensors:
        if debug:
            print(f"[read_sensor_data_with_metadata] Missing sensors: {missing_sensors}")
        
        for sensor in missing_sensors:
            # Add placeholder EMG column
            df[f'EMG 1 (mV) - {sensor}'] = np.nan
            
            # Add placeholder accelerometer and gyroscope columns
            df[f'ACC X (G) - {sensor}'] = np.nan
            df[f'ACC Y (G) - {sensor}'] = np.nan
            df[f'ACC Z (G) - {sensor}'] = np.nan
            df[f'GYRO X (deg/s) - {sensor}'] = np.nan
            df[f'GYRO Y (deg/s) - {sensor}'] = np.nan
            df[f'GYRO Z (deg/s) - {sensor}'] = np.nan
    
    # Check for missing ACC and GYRO columns for available sensors
    for sensor in available_sensors:
        # Check if ACC columns exist for this sensor
        if not any(col.startswith('ACC X') and f' - {sensor}' in col for col in df.columns):
            if debug:
                print(f"[read_sensor_data_with_metadata] Missing ACC columns for {sensor}")
            df[f'ACC X (G) - {sensor}'] = np.nan
            df[f'ACC Y (G) - {sensor}'] = np.nan
            df[f'ACC Z (G) - {sensor}'] = np.nan
        
        # Check if GYRO columns exist for this sensor
        if not any(col.startswith('GYRO X') and f' - {sensor}' in col for col in df.columns):
            if debug:
                print(f"[read_sensor_data_with_metadata] Missing GYRO columns for {sensor}")
            df[f'GYRO X (deg/s) - {sensor}'] = np.nan
            df[f'GYRO Y (deg/s) - {sensor}'] = np.nan
            df[f'GYRO Z (deg/s) - {sensor}'] = np.nan
    
    if debug:
        print(f"[read_sensor_data_with_metadata] Final DataFrame shape: {df.shape}")
        print(f"[read_sensor_data_with_metadata] Final column names: {df.columns.tolist()}")
    else:
        print("read_sensor_data_with_metadata completed.")
    
    return df, metadata


def process_file(file_path, debug=False):
    """
    Processes a single sensor CSV file:
      - Reads the file and its metadata.
      - Performs cleaning and type conversion.
    
    Parameters:
      file_path (str): Path to the CSV file.
      debug (bool): If True, prints detailed debug output.
    
    Returns:
      pd.DataFrame: Processed DataFrame.
    """
    if debug:
        print(f"\n[process_file] Processing file: {file_path}")
    else:
        print(f"Processing file: {os.path.basename(file_path)}")

    # Step 1: Read data and metadata.
    df, metadata = read_sensor_data_with_metadata(file_path, debug=debug)
    if debug:
        print(f"[process_file] Format: {metadata.get('FileFormat', 'Unknown')}")
        print(f"[process_file] DataFrame shape after reading: {df.shape}")
    else:
        print("Data read completed.")

    # Step 2: Display minimal summary if in debug mode.
    if debug:
        print(f"[process_file] Descriptive Statistics:\n{df.describe()}")
        print(f"[process_file] Data types:\n{df.dtypes}")
    else:
        print("Basic summary displayed.")

    # Step 3: Dynamically identify numeric sensor columns.
    base_names = ['ACC X (G)', 'ACC Y (G)', 'ACC Z (G)', 
                  'GYRO X (deg/s)', 'GYRO Y (deg/s)', 'GYRO Z (deg/s)',
                  'EMG 1 (mV)']
    numeric_cols = []
    for base in base_names:
        matches = [col for col in df.columns if col.startswith(base)]
        numeric_cols.extend(matches)
    if debug:
        print(f"[process_file] Identified numeric sensor columns: {numeric_cols}")

    # Clean data: Remove rows with blank numeric values (but keep rows with NaN).
    # Only check non-NaN columns to avoid removing all rows when a whole column is NaN
    non_nan_cols = [col for col in numeric_cols if not df[col].isna().all()]
    if non_nan_cols:
        mask = df[non_nan_cols].apply(lambda col: col.astype(str).str.strip() == '').any(axis=1)
        if debug:
            print(f"[process_file] Rows with blank numeric values: {mask.sum()}")
        df = df[~mask]
    else:
        if debug:
            print("[process_file] All numeric columns are NaN. Skipping blank value check.")

    # Convert identified numeric columns to numeric type.
    for col in numeric_cols:
        try:
            if not df[col].isna().all():  # Skip conversion if column is all NaN
                df[col] = pd.to_numeric(df[col], errors='raise')
        except Exception as e:
            print(f"[process_file] Error converting column {col}: {e}")
            if debug:
                raise
    if debug:
        print(f"[process_file] Data shape after cleaning: {df.shape}")

    # Remove the Time Series columns as they're redundant after timestamp creation
    ts_cols = [col for col in df.columns if "Time Series" in col]
    if ts_cols:
        df = df.drop(columns=ts_cols)
        if debug:
            print(f"[process_file] Removed {len(ts_cols)} time series columns")

    print("File processing completed.\n")
    return df


def main(debug=False, input_folder='./data/raw/', output_file='./data/processed/processed_emg_data.parquet'):
    """
    Processes all CSV files in the specified folder, stacks them into one DataFrame,
    and writes the output to a Parquet file.
    
    Parameters:
      debug (bool): If True, prints detailed debug information.
      input_folder (str): Folder containing the CSV files.
      output_file (str): Path for the output Parquet file.
    
    Returns:
      pd.DataFrame: Final processed DataFrame.
    """
    # Ensure input folder exists.
    if not os.path.isdir(input_folder):
        raise FileNotFoundError(f"Input folder '{input_folder}' does not exist.")
    
    # Find all CSV files in the folder.
    csv_files = glob.glob(os.path.join(input_folder, '*.csv'))
    if not csv_files:
        raise FileNotFoundError("No CSV files found in the input folder.")
    
    if debug:
        print(f"[main] Found {len(csv_files)} CSV files in '{input_folder}'.")
    else:
        print(f"Found {len(csv_files)} CSV file(s).")

    processed_dfs = []
    formats_found = {}
    
    for file in csv_files:
        df = process_file(file, debug=debug)
        
        # Keep track of which formats were found
        format_type = df['FileFormat'].iloc[0] if 'FileFormat' in df.columns else 'Unknown'
        formats_found[format_type] = formats_found.get(format_type, 0) + 1
        
        # Add a column to indicate source file.
        df['SourceFile'] = os.path.basename(file)
        processed_dfs.append(df)
    
    # Stack all DataFrames (row-wise).
    final_df = pd.concat(processed_dfs, ignore_index=True)
    
    if debug:
        print(f"[main] File formats detected: {formats_found}")
        print(f"[main] Final stacked DataFrame shape: {final_df.shape}")
        print(f"[main] Final columns: {final_df.columns.tolist()}")
    else:
        print(f"File formats detected: {formats_found}")
        print("All files processed and stacked.")

    # Save final DataFrame to Parquet.
    output_dir = os.path.dirname(output_file)
    os.makedirs(output_dir, exist_ok=True)
    final_df.to_parquet(output_file, index=False)
    print(f"Final processed data saved to: {output_file}")
    
    return final_df


# Run the module when executed as a script.
if __name__ == "__main__":
    # Set debug=True for detailed output, or False for minimal output.
    processed_df = main(
        debug=True,
        input_folder='../../data/raw/emg_data/',  # Specify your input folder path here.
        output_file='../../data/processed/combined_emg_data.parquet'  # Specify your output file path here.
    )

[main] Found 4 CSV files in '../../data/raw/emg_data/'.

[process_file] Processing file: ../../data/raw/emg_data\2-14-25_bullpen_1.csv
[read_sensor_data_with_metadata] Format detected: FULL_FORMAT
[read_sensor_data_with_metadata] SensorGroups: ['FDS (81770)', 'FDS (81770)', 'FDS (81770)', 'FDS (81770)', 'FDS (81770)', 'FDS (81770)', 'FDS (81770)', 'FCU (81728)', 'FCU (81728)', 'FCU (81728)', 'FCU (81728)', 'FCU (81728)', 'FCU (81728)', 'FCU (81728)', 'FCR (81745)']
[read_sensor_data_with_metadata] SensorModes: ['sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 50', 'sensor mode: 40']
[read_sensor_data_with_metadata] New column names set with sensor names.
[read_sensor_data_with_metadata] Using column names: ['EMG 1 (mV) - FDS', 'ACC X (G) - FDS', 'ACC Y (G) - FDS', 'ACC Z (G) - 

  df = pd.read_csv(StringIO(data_str), header=None, names=new_col_names)


[read_sensor_data_with_metadata] Missing ACC columns for FCR
[read_sensor_data_with_metadata] Missing GYRO columns for FCR
[read_sensor_data_with_metadata] Final DataFrame shape: (182675, 27)
[read_sensor_data_with_metadata] Final column names: ['EMG 1 (mV) - FDS', 'ACC X (G) - FDS', 'ACC Y (G) - FDS', 'ACC Z (G) - FDS', 'GYRO X (deg/s) - FDS', 'GYRO Y (deg/s) - FDS', 'GYRO Z (deg/s) - FDS', 'EMG 1 (mV) - FCU', 'ACC X (G) - FCU', 'ACC Y (G) - FCU', 'ACC Z (G) - FCU', 'GYRO X (deg/s) - FCU', 'GYRO Y (deg/s) - FCU', 'GYRO Z (deg/s) - FCU', 'EMG 1 (mV) - FCR', 'Timestamp', 'Application', 'Date/Time', 'Collection Length (seconds)', 'HasTimeSeries', 'FileFormat', 'ACC X (G) - FCR', 'ACC Y (G) - FCR', 'ACC Z (G) - FCR', 'GYRO X (deg/s) - FCR', 'GYRO Y (deg/s) - FCR', 'GYRO Z (deg/s) - FCR']
[process_file] Format: FULL_FORMAT
[process_file] DataFrame shape after reading: (182675, 27)
[process_file] Descriptive Statistics:
       EMG 1 (mV) - FDS  EMG 1 (mV) - FCU  EMG 1 (mV) - FCR  \
count   

  df = pd.read_csv(StringIO(data_str), header=None, names=new_col_names)


[read_sensor_data_with_metadata] Missing ACC columns for FCR
[read_sensor_data_with_metadata] Missing GYRO columns for FCR
[read_sensor_data_with_metadata] Final DataFrame shape: (2535557, 27)
[read_sensor_data_with_metadata] Final column names: ['EMG 1 (mV) - FDS', 'ACC X (G) - FDS', 'ACC Y (G) - FDS', 'ACC Z (G) - FDS', 'GYRO X (deg/s) - FDS', 'GYRO Y (deg/s) - FDS', 'GYRO Z (deg/s) - FDS', 'EMG 1 (mV) - FCU', 'ACC X (G) - FCU', 'ACC Y (G) - FCU', 'ACC Z (G) - FCU', 'GYRO X (deg/s) - FCU', 'GYRO Y (deg/s) - FCU', 'GYRO Z (deg/s) - FCU', 'EMG 1 (mV) - FCR', 'Timestamp', 'Application', 'Date/Time', 'Collection Length (seconds)', 'HasTimeSeries', 'FileFormat', 'ACC X (G) - FCR', 'ACC Y (G) - FCR', 'ACC Z (G) - FCR', 'GYRO X (deg/s) - FCR', 'GYRO Y (deg/s) - FCR', 'GYRO Z (deg/s) - FCR']
[process_file] Format: FULL_FORMAT
[process_file] DataFrame shape after reading: (2535557, 27)
[process_file] Descriptive Statistics:
       EMG 1 (mV) - FDS  EMG 1 (mV) - FCR                      Timest