# ROS Bag Data Visualization - Folder with Single Runs

This notebook provides comparative visualizations for multiple single runs stored in one folder (e.g., `rooms-generated-p1` with subfolders `0`, `1`, `2`).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import glob

# Input CSV files, one entry per run. The recursive glob below is kept for
# reference: enable it (and drop the explicit list) to analyse a whole folder.
#csv_file_paths = glob.glob('../downloaded_files/run-2025-09-17-172852/rooms-generated-p1/.cache/**/*.csv', recursive=True)
csv_file_paths = ["messages.csv"]
n_csv_files = len(csv_file_paths)
print(f'Loading data from {n_csv_files} csv files.')

In [None]:
# Load and combine data from all runs.
# Every CSV in csv_file_paths becomes one DataFrame tagged with a 'run_id'
# column; the frames are then stacked into combined_df for the cells below.
all_dataframes = []
run_names = []

# Stems of all inputs, used to spot files that would otherwise share a run name
# (e.g. several sub-folders that each contain a CSV with the same file name).
csv_stems = [Path(csv_file).stem for csv_file in csv_file_paths]

for csv_file in csv_file_paths:
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f'Error loading {csv_file}: {e}')
        continue

    source_path = Path(csv_file)
    run_name = source_path.stem
    # The stem alone is ambiguous when several inputs share it; prefix the
    # containing folder so each run keeps its own identity.
    if csv_stems.count(run_name) > 1 and source_path.parent.name:
        run_name = f'{source_path.parent.name}/{run_name}'
    # Last resort: append a counter until the name is unique.
    base_name, suffix = run_name, 2
    while run_name in run_names:
        run_name = f'{base_name}#{suffix}'
        suffix += 1

    df['run_id'] = run_name
    all_dataframes.append(df)
    run_names.append(run_name)
    print(f'Loaded {len(df)} rows from run: {run_name}')

if all_dataframes:
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print(f'\nCombined dataset: {len(combined_df)} total rows from {len(run_names)} runs')
else:
    print('No data could be loaded')


In [None]:
# Comparative visualization: overlay every run on shared axes, one subplot per
# numeric column (at most the first three).
if not all_dataframes or 'timestamp' not in combined_df.columns:
    print('No suitable data for comparative visualization')
else:
    numeric_cols = [
        col
        for col in combined_df.select_dtypes(include=[np.number]).columns
        if col != 'timestamp'
    ]

    if not numeric_cols:
        print('No numeric columns found for visualization')
    else:
        n_panels = min(len(numeric_cols), 3)
        fig, axes = plt.subplots(n_panels, 1, figsize=(14, 4 * n_panels))
        # With a single panel plt.subplots returns a bare Axes, not an array.
        if not isinstance(axes, np.ndarray):
            axes = [axes]

        for ax, col in zip(axes, numeric_cols[:3]):
            for run_name in run_names:
                run_data = combined_df.loc[combined_df['run_id'] == run_name]
                if run_data.empty or col not in run_data.columns:
                    continue
                ax.plot(
                    run_data['timestamp'],
                    run_data[col],
                    label=f'Run {run_name}',
                    alpha=0.7,
                    linewidth=1.5,
                )

            ax.set_title(f'{col} - Comparison Across Runs')
            ax.set_ylabel(col)
            ax.legend()
            ax.grid(True, alpha=0.3)

        # Only the bottom panel carries the x-axis label.
        axes[-1].set_xlabel('Timestamp')
        plt.tight_layout()
        plt.show()

# Real Time Factor Analysis

Real-time factor analysis shows how well the simulation performs compared to real-time. A value of 1.0 means the simulation runs at exactly real-time speed, values < 1.0 indicate the simulation is slower than real-time, and values > 1.0 indicate faster than real-time execution.

In [None]:
# Real Time Factor Analysis: overall and per-run summary statistics.
if all_dataframes and len(combined_df) > 0:
    # Work on a private copy. Without it the numeric coercion below either
    # writes into a filtered slice (SettingWithCopyWarning) or, when there is
    # no 'topic' column, straight into combined_df itself.
    if 'topic' in combined_df.columns:
        rtf_data = combined_df[combined_df['topic'] == '/gazebo/real_time_factor'].copy()
    else:
        rtf_data = combined_df.copy()

    if len(rtf_data) > 0:
        print(f"Found {len(rtf_data)} real_time_factor data points across all runs")

        # Pick the value column: 'value', then 'data', then the third column.
        value_col = None
        for col in ['value', 'data', rtf_data.columns[2] if len(rtf_data.columns) > 2 else None]:
            if col and col in rtf_data.columns:
                value_col = col
                break

        if value_col:
            # Non-numeric entries become NaN and are dropped.
            rtf_data[value_col] = pd.to_numeric(rtf_data[value_col], errors='coerce')
            rtf_data = rtf_data.dropna(subset=[value_col])

            print("Real-time factor statistics:")
            print(f"  Overall mean: {rtf_data[value_col].mean():.4f}")
            print(f"  Overall median: {rtf_data[value_col].median():.4f}")
            print(f"  Overall std: {rtf_data[value_col].std():.4f}")
            print(f"  Min: {rtf_data[value_col].min():.4f}")
            print(f"  Max: {rtf_data[value_col].max():.4f}")

            # Statistics per run
            print("\nReal-time factor by run:")
            rtf_by_run = rtf_data.groupby('run_id')[value_col].agg(['count', 'mean', 'median', 'std', 'min', 'max'])
            print(rtf_by_run)

        else:
            print("Could not find value column for real_time_factor data")
            print(f"Available columns: {list(rtf_data.columns)}")
    else:
        print("No real_time_factor data found")
        if 'topic' in combined_df.columns:
            unique_topics = combined_df['topic'].unique()
            print(f"Available topics: {list(unique_topics)}")
else:
    print("No data available for real_time_factor analysis")

In [None]:
# Real Time Factor Visualizations: time series, histogram, per-run box plot and
# a text summary, laid out on a 2x2 grid.
if all_dataframes and len(combined_df) > 0:
    # Copy so the numeric coercion below never writes into combined_df or into
    # a filtered slice of it (SettingWithCopyWarning).
    if 'topic' in combined_df.columns:
        rtf_data = combined_df[combined_df['topic'] == '/gazebo/real_time_factor'].copy()
    else:
        rtf_data = combined_df.copy()

    if len(rtf_data) > 0:
        # Pick the value column: 'value', then 'data', then the third column.
        value_col = None
        for col in ['value', 'data', rtf_data.columns[2] if len(rtf_data.columns) > 2 else None]:
            if col and col in rtf_data.columns:
                value_col = col
                break

        if value_col and 'timestamp' in rtf_data.columns:
            rtf_data[value_col] = pd.to_numeric(rtf_data[value_col], errors='coerce')
            rtf_data = rtf_data.dropna(subset=[value_col])

            # Create visualizations
            fig, axes = plt.subplots(2, 2, figsize=(15, 12))

            # Runs shown individually, in order of first appearance. "Run k" in
            # every panel below refers to position k in this array.
            unique_runs = rtf_data['run_id'].unique()[:10]  # Limit to first 10 runs for clarity
            total_runs = rtf_data['run_id'].nunique()
            colors = plt.cm.tab10(np.linspace(0, 1, len(unique_runs)))

            # 1. Time series plot for the displayed runs
            ax1 = axes[0, 0]
            for i, run_id in enumerate(unique_runs):
                run_rtf = rtf_data[rtf_data['run_id'] == run_id]
                if len(run_rtf) > 0:
                    ax1.plot(run_rtf['timestamp'], run_rtf[value_col],
                            alpha=0.7, linewidth=1, color=colors[i], label=f'Run {i+1}')

            ax1.axhline(y=1.0, color='red', linestyle='--', alpha=0.8, label='Real-time (1.0)')
            ax1.set_xlabel('Timestamp')
            ax1.set_ylabel('Real Time Factor')
            ax1.set_title('Real Time Factor Over Time (First 10 Runs)')
            ax1.grid(True, alpha=0.3)
            ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

            # 2. Distribution histogram
            ax2 = axes[0, 1]
            ax2.hist(rtf_data[value_col], bins=50, alpha=0.7, edgecolor='black')
            ax2.axvline(x=1.0, color='red', linestyle='--', alpha=0.8, label='Real-time (1.0)')
            ax2.axvline(x=rtf_data[value_col].mean(), color='green', linestyle='-', alpha=0.8, label=f'Mean ({rtf_data[value_col].mean():.3f})')
            ax2.set_xlabel('Real Time Factor')
            ax2.set_ylabel('Frequency')
            ax2.set_title('Real Time Factor Distribution')
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            # 3. Box plot by run (same runs as the time series)
            ax3 = axes[1, 0]
            rtf_by_run = []
            run_labels = []
            for i, run_id in enumerate(unique_runs):
                run_rtf = rtf_data[rtf_data['run_id'] == run_id][value_col]
                if len(run_rtf) > 0:
                    rtf_by_run.append(run_rtf)
                    run_labels.append(f'Run {i+1}')

            if rtf_by_run:
                # Tick labels are set separately: the boxplot `labels=` keyword
                # is deprecated in recent matplotlib releases.
                bp = ax3.boxplot(rtf_by_run, patch_artist=True)
                ax3.set_xticklabels(run_labels)
                for patch in bp['boxes']:
                    patch.set_facecolor('lightblue')
                    patch.set_alpha(0.7)
                ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.8, label='Real-time (1.0)')
                ax3.set_xlabel('Run')
                ax3.set_ylabel('Real Time Factor')
                ax3.set_title('Real Time Factor by Run (Box Plot)')
                ax3.grid(True, alpha=0.3)
                ax3.tick_params(axis='x', rotation=45)
                ax3.legend()

            # 4. Performance metrics summary
            ax4 = axes[1, 1]
            ax4.axis('off')

            # Per-run statistics, re-ordered to match unique_runs. groupby sorts
            # its keys, which would otherwise number the runs differently from
            # the plots above.
            rtf_stats = (
                rtf_data.groupby('run_id')[value_col]
                .agg(['mean', 'std', 'min', 'max'])
                .round(4)
                .reindex(unique_runs)
            )

            performance_text = f"Real Time Factor Summary ({len(rtf_data)} data points, {total_runs} runs):\n\n"
            performance_text += "Overall Statistics:\n"
            performance_text += f"  Mean: {rtf_data[value_col].mean():.4f}\n"
            performance_text += f"  Median: {rtf_data[value_col].median():.4f}\n"
            performance_text += f"  Std Dev: {rtf_data[value_col].std():.4f}\n"
            performance_text += f"  Min: {rtf_data[value_col].min():.4f}\n"
            performance_text += f"  Max: {rtf_data[value_col].max():.4f}\n\n"

            # Performance categories
            excellent = (rtf_data[value_col] >= 0.95).sum()
            good = ((rtf_data[value_col] >= 0.8) & (rtf_data[value_col] < 0.95)).sum()
            poor = (rtf_data[value_col] < 0.8).sum()

            performance_text += "Performance Distribution:\n"
            performance_text += f"  Excellent (≥0.95): {excellent} ({excellent/len(rtf_data)*100:.1f}%)\n"
            performance_text += f"  Good (0.8-0.95): {good} ({good/len(rtf_data)*100:.1f}%)\n"
            performance_text += f"  Poor (<0.8): {poor} ({poor/len(rtf_data)*100:.1f}%)\n\n"

            performance_text += "Run-wise Averages (first 10):\n"
            for i, (run_id, stats) in enumerate(rtf_stats.head(10).iterrows()):
                performance_text += f"  Run {i+1}: {stats['mean']:.4f} (±{stats['std']:.4f})\n"

            ax4.text(0.05, 0.95, performance_text, transform=ax4.transAxes, fontsize=10,
                    verticalalignment='top', fontfamily='monospace',
                    bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgray', alpha=0.8))

            plt.tight_layout()
            plt.show()

        else:
            print(f"Missing required columns for visualization. Available: {list(rtf_data.columns)}")
    else:
        print("No real_time_factor data available for visualization")

In [None]:
# Enhanced Real Time Factor Analysis: time-segment statistics, poor-performance
# periods, stability figures and a textual assessment.
if all_dataframes and len(combined_df) > 0:
    rtf_data = combined_df[combined_df['topic'] == '/gazebo/real_time_factor'].copy() if 'topic' in combined_df.columns else combined_df.copy()

    if len(rtf_data) > 0:
        # Pick the value column: 'value', then 'data', then the third column.
        value_col = None
        for col in ['value', 'data', rtf_data.columns[2] if len(rtf_data.columns) > 2 else None]:
            if col and col in rtf_data.columns:
                value_col = col
                break

        if value_col and 'timestamp' in rtf_data.columns:
            # Tag each row with its source run. The 'run_id' column written at
            # load time already holds this. Matching per-file row indices
            # against the concatenated frame's index would mislabel rows as
            # soon as more than one file is loaded.
            if 'run_id' in rtf_data.columns:
                rtf_data['file_path'] = rtf_data['run_id']
            else:
                rtf_data['file_path'] = ''

            # Convert to numeric
            rtf_data[value_col] = pd.to_numeric(rtf_data[value_col], errors='coerce')
            rtf_data = rtf_data.dropna(subset=[value_col])

            # Analyze performance degradation over time
            print("\n" + "="*80)
            print("DETAILED REAL TIME FACTOR ANALYSIS")
            print("="*80)

            # Time-based analysis - look for performance degradation
            rtf_data['timestamp_numeric'] = pd.to_numeric(rtf_data['timestamp'], errors='coerce')
            rtf_data = rtf_data.dropna(subset=['timestamp_numeric'])
            rtf_data = rtf_data.sort_values('timestamp_numeric')

            # Divide into equally sized row segments; the last segment absorbs
            # the remainder.
            time_segments = 10
            segment_size = len(rtf_data) // time_segments

            print(f"\nTime-based Performance Analysis (divided into {time_segments} segments):")
            print("-" * 60)
            for i in range(time_segments):
                start_idx = i * segment_size
                end_idx = (i + 1) * segment_size if i < time_segments - 1 else len(rtf_data)
                segment_data = rtf_data.iloc[start_idx:end_idx]

                if len(segment_data) > 0:
                    mean_rtf = segment_data[value_col].mean()
                    min_rtf = segment_data[value_col].min()
                    max_rtf = segment_data[value_col].max()
                    std_rtf = segment_data[value_col].std()

                    start_time = segment_data['timestamp_numeric'].min()
                    end_time = segment_data['timestamp_numeric'].max()

                    print(f"Segment {i+1:2d} (t={start_time:6.1f}-{end_time:6.1f}s): "
                          f"Mean={mean_rtf:.4f}, Min={min_rtf:.4f}, Max={max_rtf:.4f}, Std={std_rtf:.4f}")

            # Identify problematic periods
            poor_performance = rtf_data[rtf_data[value_col] < 0.5]
            if len(poor_performance) > 0:
                print(f"\nPoor Performance Periods (RTF < 0.5): {len(poor_performance)} data points")
                print(f"Worst RTF: {poor_performance[value_col].min():.4f} at timestamp {poor_performance.loc[poor_performance[value_col].idxmin(), 'timestamp_numeric']:.1f}s")

            # Performance stability analysis
            rolling_mean = rtf_data[value_col].rolling(window=100, min_periods=1).mean()
            rolling_std = rtf_data[value_col].rolling(window=100, min_periods=1).std()

            print("\nPerformance Stability Analysis:")
            print(f"Average rolling standard deviation (100-point window): {rolling_std.mean():.4f}")
            print(f"Performance coefficient of variation: {rtf_data[value_col].std() / rtf_data[value_col].mean():.4f}")

            # System performance recommendations
            print("\nSystem Performance Assessment:")
            excellent_pct = (rtf_data[value_col] >= 0.95).mean() * 100
            good_pct = ((rtf_data[value_col] >= 0.8) & (rtf_data[value_col] < 0.95)).mean() * 100
            fair_pct = ((rtf_data[value_col] >= 0.5) & (rtf_data[value_col] < 0.8)).mean() * 100
            poor_pct = (rtf_data[value_col] < 0.5).mean() * 100

            print(f"  Excellent (≥0.95): {excellent_pct:.1f}%")
            print(f"  Good (0.8-0.94):   {good_pct:.1f}%")
            print(f"  Fair (0.5-0.79):   {fair_pct:.1f}%")
            print(f"  Poor (<0.5):       {poor_pct:.1f}%")

            if excellent_pct > 70:
                print("\n✅ ASSESSMENT: System performance is EXCELLENT")
            elif excellent_pct + good_pct > 80:
                print("\n✅ ASSESSMENT: System performance is GOOD")
            elif poor_pct < 10:
                print("\n⚠️  ASSESSMENT: System performance is ACCEPTABLE but could be improved")
            else:
                print("\n❌ ASSESSMENT: System performance has SIGNIFICANT ISSUES")
                print("   Recommendations:")
                print("   - Consider hardware upgrades (CPU, RAM)")
                print("   - Optimize simulation parameters")
                print("   - Reduce simulation complexity")
                print("   - Check for resource bottlenecks")

        else:
            print(f"Missing required columns for enhanced analysis. Available: {list(rtf_data.columns)}")
    else:
        print("No real_time_factor data available for enhanced analysis")

In [None]:
# Real Time Factor Comparison Across Run Types: a three-panel summary figure
# (timeline with rolling average, colour-coded histogram, statistics text).
if all_dataframes and len(combined_df) > 0:
    rtf_data = combined_df[combined_df['topic'] == '/gazebo/real_time_factor'].copy() if 'topic' in combined_df.columns else combined_df.copy()

    if len(rtf_data) > 0:
        # Pick the value column: 'value', then 'data', then the third column.
        value_col = None
        for col in ['value', 'data', rtf_data.columns[2] if len(rtf_data.columns) > 2 else None]:
            if col and col in rtf_data.columns:
                value_col = col
                break

        if value_col:
            rtf_data[value_col] = pd.to_numeric(rtf_data[value_col], errors='coerce')
            rtf_data = rtf_data.dropna(subset=[value_col])

            # Extract run characteristics from file names.
            # NOTE(review): run_characteristics is not used further in this
            # cell; kept in case a later cell reads it.
            run_characteristics = []
            for csv_file in csv_file_paths:
                file_name = Path(csv_file).stem

                characteristics = {
                    'file': file_name,
                    'scenario_type': 'unknown',
                    'parameters': 'unknown'
                }

                # Scenario type: first matching keyword wins.
                if 'rooms' in file_name.lower():
                    characteristics['scenario_type'] = 'rooms'
                elif 'circle' in file_name.lower():
                    characteristics['scenario_type'] = 'circle'
                elif 'hallway' in file_name.lower():
                    characteristics['scenario_type'] = 'hallway'
                elif 'large-room' in file_name.lower():
                    characteristics['scenario_type'] = 'large-room'

                # Parameter set: matched on the '-pN-' marker in the file name.
                if '-p1-' in file_name:
                    characteristics['parameters'] = 'p1'
                elif '-p2-' in file_name:
                    characteristics['parameters'] = 'p2'
                elif '-p3-' in file_name:
                    characteristics['parameters'] = 'p3'

                run_characteristics.append(characteristics)

            # Create final summary visualization
            fig, axes = plt.subplots(1, 3, figsize=(18, 6))

            # The timeline and the time-span line both need a timestamp
            # column; remember whether it exists so neither raises KeyError.
            has_timestamp = 'timestamp' in rtf_data.columns

            # 1. Overall performance timeline with rolling average
            ax1 = axes[0]
            if has_timestamp:
                rtf_data['timestamp_numeric'] = pd.to_numeric(rtf_data['timestamp'], errors='coerce')
                rtf_data = rtf_data.sort_values('timestamp_numeric')

                # Plot raw data
                ax1.scatter(rtf_data['timestamp_numeric'], rtf_data[value_col],
                           alpha=0.1, s=1, color='lightblue', label='Individual measurements')

                # Plot rolling average
                window_size = max(100, len(rtf_data) // 100)
                rolling_mean = rtf_data[value_col].rolling(window=window_size, min_periods=1).mean()
                ax1.plot(rtf_data['timestamp_numeric'], rolling_mean,
                        color='darkblue', linewidth=2, label=f'Rolling average ({window_size} pts)')

                ax1.axhline(y=1.0, color='red', linestyle='--', alpha=0.8, label='Real-time (1.0)')
                ax1.axhline(y=0.8, color='orange', linestyle=':', alpha=0.8, label='Acceptable (0.8)')

                ax1.set_xlabel('Timestamp (seconds)')
                ax1.set_ylabel('Real Time Factor')
                ax1.set_title('Real Time Factor Performance Timeline')
                ax1.legend()
                ax1.grid(True, alpha=0.3)
            else:
                ax1.set_title('Real Time Factor Performance Timeline (no timestamp data)')

            # 2. Performance distribution with statistics
            ax2 = axes[1]
            n, bins, patches = ax2.hist(rtf_data[value_col], bins=50, alpha=0.7,
                                       edgecolor='black', color='skyblue')

            # Color code the histogram bars by the category of the bin centre;
            # bins between 0.5 and 0.8 keep the default colour.
            for i, patch in enumerate(patches):
                bin_center = (bins[i] + bins[i+1]) / 2
                if bin_center >= 0.95:
                    patch.set_facecolor('green')
                    patch.set_alpha(0.8)
                elif bin_center >= 0.8:
                    patch.set_facecolor('orange')
                    patch.set_alpha(0.8)
                elif bin_center < 0.5:
                    patch.set_facecolor('red')
                    patch.set_alpha(0.8)

            ax2.axvline(x=1.0, color='red', linestyle='--', alpha=0.8, linewidth=2, label='Real-time (1.0)')
            ax2.axvline(x=rtf_data[value_col].mean(), color='purple', linestyle='-',
                       alpha=0.8, linewidth=2, label=f'Mean ({rtf_data[value_col].mean():.3f})')
            ax2.axvline(x=rtf_data[value_col].median(), color='brown', linestyle='-',
                       alpha=0.8, linewidth=2, label=f'Median ({rtf_data[value_col].median():.3f})')

            ax2.set_xlabel('Real Time Factor')
            ax2.set_ylabel('Frequency')
            ax2.set_title('Real Time Factor Distribution')
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            # 3. Summary statistics table
            ax3 = axes[2]
            ax3.axis('off')

            stats_text = "REAL TIME FACTOR SUMMARY\n"
            stats_text += "=" * 30 + "\n\n"

            stats_text += "Dataset Overview:\n"
            stats_text += f"  Total measurements: {len(rtf_data):,}\n"
            stats_text += f"  Total runs: {len(csv_file_paths)}\n"
            if has_timestamp:
                stats_text += f"  Time span: {rtf_data['timestamp_numeric'].max() - rtf_data['timestamp_numeric'].min():.1f} seconds\n\n"
            else:
                stats_text += "  Time span: n/a\n\n"

            stats_text += "Performance Metrics:\n"
            stats_text += f"  Mean RTF: {rtf_data[value_col].mean():.4f}\n"
            stats_text += f"  Median RTF: {rtf_data[value_col].median():.4f}\n"
            stats_text += f"  Std Deviation: {rtf_data[value_col].std():.4f}\n"
            stats_text += f"  Min RTF: {rtf_data[value_col].min():.4f}\n"
            stats_text += f"  Max RTF: {rtf_data[value_col].max():.4f}\n\n"

            # Percentiles
            stats_text += "Performance Percentiles:\n"
            for p in [10, 25, 50, 75, 90, 95, 99]:
                val = np.percentile(rtf_data[value_col], p)
                stats_text += f"  {p:2d}th percentile: {val:.4f}\n"

            stats_text += "\nPerformance Categories:\n"
            excellent = (rtf_data[value_col] >= 0.95).sum()
            good = ((rtf_data[value_col] >= 0.8) & (rtf_data[value_col] < 0.95)).sum()
            fair = ((rtf_data[value_col] >= 0.5) & (rtf_data[value_col] < 0.8)).sum()
            poor = (rtf_data[value_col] < 0.5).sum()

            stats_text += f" Excellent (≥0.95): {excellent:,} ({excellent/len(rtf_data)*100:.1f}%)\n"
            stats_text += f" Good (0.8-0.95): {good:,} ({good/len(rtf_data)*100:.1f}%)\n"
            stats_text += f" Fair (0.5-0.8): {fair:,} ({fair/len(rtf_data)*100:.1f}%)\n"
            stats_text += f" Poor (<0.5): {poor:,} ({poor/len(rtf_data)*100:.1f}%)\n"

            # Final assessment, computed once and reused by the figure text and
            # the closing print below.
            if excellent/len(rtf_data) > 0.7:
                overall_rating = 'EXCELLENT'
            elif (excellent + good)/len(rtf_data) > 0.8:
                overall_rating = 'GOOD'
            elif poor/len(rtf_data) < 0.1:
                overall_rating = 'ACCEPTABLE'
            else:
                overall_rating = 'POOR'
            stats_text += f"\n Overall: {overall_rating} performance\n"

            ax3.text(0.05, 0.95, stats_text, transform=ax3.transAxes, fontsize=10,
                    verticalalignment='top', fontfamily='monospace',
                    bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgray', alpha=0.9))

            plt.tight_layout()
            plt.show()

            print("\n🎯 REAL TIME FACTOR ANALYSIS COMPLETE")
            print(f"   Analyzed {len(rtf_data):,} measurements from {len(csv_file_paths)} runs")
            print(f"   Overall system performance rated as: {overall_rating}")

        else:
            print("Could not find value column for comparison analysis")
    else:
        print("No real_time_factor data available for comparison analysis")

# Movement Duration Analysis

This section analyzes how long the robot was moving in each run. Speeds are estimated from the recorded data: linear speed from the change in x/y position and angular speed from the change in yaw. A sample counts as movement when the estimated speed exceeds a threshold of 0.1 units. The movement duration is the time from the first sample at which movement is detected to the last sample at which movement is detected.

In [None]:
# Movement Duration Analysis - Calculate movement periods for each run
import pandas as pd
import numpy as np
from pathlib import Path

def calculate_movement_duration(df, speed_threshold=0.1):
    """
    Calculate movement duration based on linear and angular velocity.

    Three detection strategies are tried on the rows of ``df``:

    1. rows of known pose topics, via ``detect_movement_from_pose_data``;
    2. rows of known velocity topics, via ``detect_movement_from_velocity_data``;
    3. only if 1 and 2 found nothing: derivatives of the numeric columns, via
       ``detect_movement_from_numeric_cols``.

    Args:
        df: DataFrame with a 'timestamp' column. A 'topic' column enables
            strategies 1 and 2.
        speed_threshold: value a speed estimate must exceed to count as movement.

    Returns:
        ``(start_time, end_time, duration)``, where start/end are the numeric
        timestamps of the first and last sample flagged as moving. Returns
        ``(None, None, 0.0)`` when the frame has fewer than two usable rows,
        has no 'timestamp' column, or no movement is detected.
    """
    if len(df) < 2 or 'timestamp' not in df.columns:
        return None, None, 0.0

    # Convert timestamps first and sort on the numeric value: sorting the raw
    # column would order string timestamps lexicographically ('10' < '9').
    df_sorted = df.copy()
    df_sorted['timestamp_numeric'] = pd.to_numeric(df_sorted['timestamp'], errors='coerce')
    df_sorted = (
        df_sorted.dropna(subset=['timestamp_numeric'])
        .sort_values('timestamp_numeric')
        .reset_index(drop=True)  # unique positional index for the merges below
    )

    if len(df_sorted) < 2:
        return None, None, 0.0

    has_topics = 'topic' in df_sorted.columns

    # Look for position data (x, y coordinates and yaw/orientation)
    pos_cols = [col for col in df_sorted.columns if any(pattern in col.lower() for pattern in ['pose', 'position', 'x', 'y', 'yaw', 'orientation', 'theta'])]

    # Give up only when there is neither a position-like column nor a 'topic'
    # column. Long-format data (topic/value rows) has no position-named
    # columns but is exactly what the topic-based strategies consume.
    if not pos_cols and not has_topics:
        print(f"No position columns found in data. Available columns: {list(df_sorted.columns)}")
        return None, None, 0.0

    movement_detected = pd.Series(False, index=df_sorted.index)

    if has_topics:
        # Method 1: pose/position topics. Method 2: velocity topics.
        pose_topics = ['/amcl_pose', '/robot_pose', '/base_link', '/odom', '/pose']
        vel_topics = ['/cmd_vel', '/base_controller/cmd_vel', '/mobile_base_controller/cmd_vel', '/velocity']
        strategies = (
            (pose_topics, detect_movement_from_pose_data),
            (vel_topics, detect_movement_from_velocity_data),
        )
        for topics, detector in strategies:
            for topic in topics:
                topic_data = df_sorted[df_sorted['topic'] == topic].copy()
                if len(topic_data) > 1:
                    detected = detector(topic_data, speed_threshold)
                    # The detector only covers this topic's rows; expand the
                    # result to the full index (other rows: no movement)
                    # before merging.
                    movement_detected |= detected.reindex(
                        df_sorted.index, fill_value=False
                    ).astype(bool)

    # Method 3: If we have numeric columns that might be coordinates
    if not movement_detected.any():
        numeric_cols = df_sorted.select_dtypes(include=[np.number]).columns
        potential_pos_cols = [col for col in numeric_cols if col not in ['timestamp', 'timestamp_numeric']]

        if len(potential_pos_cols) >= 2:  # At least x, y
            movement_detected = detect_movement_from_numeric_cols(df_sorted, potential_pos_cols, speed_threshold)

    if not movement_detected.any():
        return None, None, 0.0

    # First and last flagged sample, in time order.
    moving_indices = df_sorted.index[movement_detected]
    start_time = df_sorted.loc[moving_indices[0], 'timestamp_numeric']
    end_time = df_sorted.loc[moving_indices[-1], 'timestamp_numeric']
    duration = end_time - start_time

    return start_time, end_time, duration

def detect_movement_from_pose_data(pose_data, threshold):
    """Detect movement from pose/position data.

    The 'value' column is coerced to numbers and a sample is flagged as moving
    when the absolute change to the next sample exceeds ``threshold``. The last
    sample reuses the previous sample's change. The input frame is not modified.

    Args:
        pose_data: DataFrame for a single pose topic; only 'value' is read.
        threshold: minimum absolute step between consecutive values.

    Returns:
        Boolean Series indexed like ``pose_data``. All False when there is no
        'value' column or no numeric value.
    """
    no_movement = pd.Series(False, index=pose_data.index)

    if 'value' not in pose_data.columns:
        return no_movement

    try:
        values = pd.to_numeric(pose_data['value'], errors='coerce')
    except (TypeError, ValueError):
        # Values that cannot even be inspected (e.g. nested containers):
        # treat as "no usable pose data" rather than failing the whole run.
        return no_movement

    if not values.notna().any():
        return no_movement

    # Bridge gaps with the neighbouring reading. Replacing them with 0 would
    # register a jump to 0 and back as movement.
    filled = values.ffill().bfill().to_numpy(dtype=float)
    step = np.abs(np.diff(filled))
    step = np.append(step, step[-1] if len(step) > 0 else 0)

    return pd.Series(step > threshold, index=pose_data.index)

def detect_movement_from_velocity_data(vel_data, threshold):
    """Detect movement from velocity data.

    A sample is flagged as moving when the absolute numeric 'value' exceeds
    ``threshold``. Non-numeric or missing values count as zero velocity.
    The input frame is not modified.

    Args:
        vel_data: DataFrame for a single velocity topic; only 'value' is read.
        threshold: minimum absolute velocity that counts as movement.

    Returns:
        Boolean Series indexed like ``vel_data``. All False when there is no
        'value' column.
    """
    no_movement = pd.Series(False, index=vel_data.index)

    if 'value' not in vel_data.columns:
        return no_movement

    try:
        speeds = pd.to_numeric(vel_data['value'], errors='coerce')
    except (TypeError, ValueError):
        # Unparseable column: best effort, report no movement.
        return no_movement

    return pd.Series(speeds.fillna(0).abs() > threshold, index=vel_data.index)

def detect_movement_from_numeric_cols(df, cols, threshold):
    """Flag rows where any candidate column changes faster than ``threshold``.

    Only the first three entries of ``cols`` are inspected. For each one the
    absolute change between consecutive rows is divided by the change in
    ``df['timestamp_numeric']``; a row is flagged when that rate exceeds
    ``threshold`` for at least one column. Columns that fail to process are
    reported with a printed message and skipped.

    Returns a boolean ``pd.Series`` aligned to ``df.index``.
    """
    moving_mask = pd.Series([False] * len(df), index=df.index)

    for col in cols[:3]:  # Check up to 3 columns
        try:
            samples = df[col].fillna(0)
            if not samples.std() > 0:
                # Constant (or too short) column: nothing to differentiate
                continue

            # Time step between consecutive rows; zero steps are replaced by a
            # tiny value so the division below stays finite.
            time_steps = np.diff(df['timestamp_numeric'])
            time_steps = np.where(time_steps == 0, 1e-6, time_steps)

            rate = np.abs(np.diff(samples)) / time_steps
            # np.diff yields one element fewer than rows; repeat the last rate
            # so the mask lines up with df.index.
            padding = rate[-1] if len(rate) > 0 else 0
            rate = np.append(rate, padding)

            moving_mask |= rate > threshold
        except Exception as err:
            print(f"Error processing column {col}: {err}")

    return moving_mask

# Analyze movement duration for all runs
# Runs calculate_movement_duration on every loaded run, collects one record per
# run in movement_results, and summarises them in movement_df.
print("Analyzing movement duration for all runs...")
print("="*60)

# The loading cell skips CSV files that fail to parse, so all_dataframes can be
# shorter than csv_file_paths. Pairing the two by position would then label runs
# with the wrong file, so paths are only used when the lengths agree.
paths_aligned = len(all_dataframes) == len(csv_file_paths)
if not paths_aligned:
    print(f"  ⚠️ Loaded {len(all_dataframes)} run(s) but {len(csv_file_paths)} CSV path(s) are listed; "
          "run names are taken from each run's 'run_id' column and csv_path is left empty")

movement_results = []

for i, df in enumerate(all_dataframes):
    if paths_aligned:
        csv_file = csv_file_paths[i]
        file_name = Path(csv_file).stem
    else:
        csv_file = None
        # The loading cell stores the file stem in df['run_id']
        if 'run_id' in df.columns and len(df) > 0:
            file_name = str(df['run_id'].iloc[0])
        else:
            file_name = f"run_{i + 1}"

    start_time, end_time, duration = calculate_movement_duration(df)
    has_movement = bool(duration > 0)

    movement_results.append({
        'run_id': i + 1,
        'file_name': file_name,
        'csv_path': csv_file,
        'start_time': start_time,
        'end_time': end_time,
        'duration_seconds': duration,
        'duration_minutes': duration / 60.0 if has_movement else 0.0,
        'total_data_points': len(df),
        'has_movement': has_movement
    })

    if not has_movement:
        # Name the run so the message is actionable when many runs are analysed
        print(f"  ❌ No movement detected in run {i + 1} ({file_name})")

# Create summary DataFrame
# Explicit columns keep the frame usable (and 'has_movement' present) even when
# no run was loaded; otherwise the summary below would raise KeyError.
result_columns = ['run_id', 'file_name', 'csv_path', 'start_time', 'end_time',
                  'duration_seconds', 'duration_minutes', 'total_data_points', 'has_movement']
movement_df = pd.DataFrame(movement_results, columns=result_columns)
movement_df['has_movement'] = movement_df['has_movement'].astype(bool)

print("\n" + "="*60)
print("MOVEMENT DURATION SUMMARY")
print("="*60)
print(f"Total runs analyzed: {len(movement_results)}")
print(f"Runs with detected movement: {movement_df['has_movement'].sum()}")
print(f"Runs without detected movement: {(~movement_df['has_movement']).sum()}")

if movement_df['has_movement'].any():
    moving_runs = movement_df[movement_df['has_movement']]
    print("\nMovement Statistics:")
    print(f"  Average duration: {moving_runs['duration_seconds'].mean():.2f} seconds ({moving_runs['duration_minutes'].mean():.2f} minutes)")
    print(f"  Median duration: {moving_runs['duration_seconds'].median():.2f} seconds ({moving_runs['duration_minutes'].median():.2f} minutes)")
    print(f"  Min duration: {moving_runs['duration_seconds'].min():.2f} seconds")
    print(f"  Max duration: {moving_runs['duration_seconds'].max():.2f} seconds")
    print(f"  Std deviation: {moving_runs['duration_seconds'].std():.2f} seconds")

movement_df

In [None]:
# Movement Duration Visualizations
# Draws a 2x2 summary figure from movement_df (bar chart, histogram, pie chart,
# text panel) and then prints a run-by-run listing.
if len(movement_df) > 0:
    moving_runs = movement_df[movement_df['has_movement']]
    
    if len(moving_runs) > 0:
        # Create comprehensive visualization
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        
        # 1. Bar chart of movement durations
        ax1 = axes[0, 0]
        run_indices = range(len(moving_runs))
        bars = ax1.bar(run_indices, moving_runs['duration_minutes'], 
                      color='skyblue', alpha=0.7, edgecolor='navy')
        
        # Add value labels on bars
        for i, (idx, row) in enumerate(moving_runs.iterrows()):
            ax1.text(i, row['duration_minutes'] + 0.1, f"{row['duration_minutes']:.1f}m",
                    ha='center', va='bottom', fontsize=8)
        
        ax1.set_xlabel('Run Number')
        ax1.set_ylabel('Movement Duration (minutes)')
        ax1.set_title(f'Movement Duration by Run ({len(moving_runs)} runs with movement)')
        ax1.grid(True, alpha=0.3)
        
        # Set x-tick labels to show run IDs; thin the ticks to roughly ten so
        # the labels stay readable with many runs
        tick_step = max(1, len(moving_runs) // 10)
        tick_positions = list(run_indices[::tick_step])
        ax1.set_xticks(tick_positions)
        ax1.set_xticklabels([f"R{moving_runs.iloc[i]['run_id']}" for i in tick_positions], rotation=45)
        
        # 2. Histogram of movement durations
        ax2 = axes[0, 1]
        n_bins = min(20, len(moving_runs) // 2 + 1)
        n, bins, patches = ax2.hist(moving_runs['duration_minutes'], bins=n_bins, 
                                   alpha=0.7, color='lightgreen', edgecolor='darkgreen')
        
        # Color code histogram bars: bins more than 20% above the mean in
        # orange, more than 20% below in light blue
        mean_duration = moving_runs['duration_minutes'].mean()
        for i, patch in enumerate(patches):
            bin_center = (bins[i] + bins[i+1]) / 2
            if bin_center > mean_duration * 1.2:
                patch.set_facecolor('orange')
            elif bin_center < mean_duration * 0.8:
                patch.set_facecolor('lightblue')
        
        ax2.axvline(x=mean_duration, color='red', linestyle='--', 
                   label=f'Mean: {mean_duration:.1f} min')
        ax2.axvline(x=moving_runs['duration_minutes'].median(), color='purple', linestyle=':', 
                   label=f'Median: {moving_runs["duration_minutes"].median():.1f} min')
        
        ax2.set_xlabel('Movement Duration (minutes)')
        ax2.set_ylabel('Number of Runs')
        ax2.set_title('Distribution of Movement Durations')
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        
        # 3. Movement success rate and statistics
        ax3 = axes[1, 0]
        success_rate = movement_df['has_movement'].mean() * 100
        
        # Pie chart of movement detection
        labels = ['Movement Detected', 'No Movement']
        sizes = [movement_df['has_movement'].sum(), (~movement_df['has_movement']).sum()]
        colors = ['lightgreen', 'lightcoral']
        explode = (0.05, 0)  # explode the first slice
        
        wedges, texts, autotexts = ax3.pie(sizes, labels=labels, colors=colors, explode=explode,
                                          autopct='%1.1f%%', shadow=True, startangle=90)
        # A real newline puts the run count on its own title line (an escaped
        # backslash here would render the characters "\n" literally)
        ax3.set_title(f'Movement Detection Success Rate\n({len(movement_df)} total runs)')
        
        # 4. Detailed statistics table
        ax4 = axes[1, 1]
        ax4.axis('off')
        
        stats_text = "MOVEMENT DURATION ANALYSIS\n"
        stats_text += "=" * 35 + "\n\n"
        
        stats_text += "Dataset Overview:\n"
        stats_text += f"  Total runs: {len(movement_df)}\n"
        stats_text += f"  Runs with movement: {len(moving_runs)} ({success_rate:.1f}%)\n"
        stats_text += f"  Runs without movement: {len(movement_df) - len(moving_runs)}\n\n"
        
        # moving_runs is known to be non-empty in this branch
        stats_text += "Duration Statistics (moving runs only):\n"
        stats_text += f"  Mean: {moving_runs['duration_minutes'].mean():.2f} minutes\n"
        stats_text += f"  Median: {moving_runs['duration_minutes'].median():.2f} minutes\n"
        stats_text += f"  Std Dev: {moving_runs['duration_minutes'].std():.2f} minutes\n"
        stats_text += f"  Min: {moving_runs['duration_minutes'].min():.2f} minutes\n"
        stats_text += f"  Max: {moving_runs['duration_minutes'].max():.2f} minutes\n\n"
        
        stats_text += "Duration Categories:\n"
        short_runs = (moving_runs['duration_minutes'] < 2).sum()
        medium_runs = ((moving_runs['duration_minutes'] >= 2) & 
                      (moving_runs['duration_minutes'] < 5)).sum()
        long_runs = (moving_runs['duration_minutes'] >= 5).sum()
        
        stats_text += f"  Short (<2 min): {short_runs} runs\n"
        stats_text += f"  Medium (2-5 min): {medium_runs} runs\n"
        stats_text += f"  Long (≥5 min): {long_runs} runs\n\n"
        
        # Find interesting cases (only meaningful with at least two runs)
        if len(moving_runs) > 1:
            shortest_run = moving_runs.loc[moving_runs['duration_minutes'].idxmin()]
            longest_run = moving_runs.loc[moving_runs['duration_minutes'].idxmax()]
            
            stats_text += "Extreme Cases:\n"
            stats_text += f"  Shortest: Run {shortest_run['run_id']} ({shortest_run['duration_minutes']:.2f} min)\n"
            stats_text += f"  Longest: Run {longest_run['run_id']} ({longest_run['duration_minutes']:.2f} min)\n"
        
        ax4.text(0.05, 0.95, stats_text, transform=ax4.transAxes, fontsize=11,
                verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle='round,pad=0.5', facecolor='lightyellow', alpha=0.9))
        
        plt.tight_layout()
        plt.show()
        
        # Print detailed run-by-run results
        print("\n📊 DETAILED MOVEMENT DURATION RESULTS")
        print("=" * 70)
        
        print("\nRuns with detected movement:")
        print("-" * 70)
        for _, run in moving_runs.iterrows():
            print(f"Run {run['run_id']:3d}: {run['duration_minutes']:6.2f} min ({run['duration_seconds']:7.1f}s) - {run['file_name']}")
        
        no_movement_runs = movement_df[~movement_df['has_movement']]
        if len(no_movement_runs) > 0:
            print("\nRuns without detected movement:")
            print("-" * 70)
            for _, run in no_movement_runs.iterrows():
                print(f"Run {run['run_id']:3d}: No movement detected - {run['file_name']}")
        
        print(f"\n✅ Analysis complete! {success_rate:.1f}% of runs had detectable movement.")
        
    else:
        print("❌ No runs with detectable movement found.")
        print("\nPossible reasons:")
        print("- No position/velocity data in the CSV files")
        print("- Movement speed below threshold (0.1 units/s)")
        print("- Data format not recognized")
        print("\nTry checking the data structure and topics available.")
        
else:
    print("❌ No movement analysis data available.")

In [None]:
# Create detailed comparison table and export results
# Ranks the runs with detected movement by duration, prints a formatted table,
# summary statistics, a category breakdown and IQR-based outliers, and leaves
# the tidy result in movement_comparison_results.
def _duration_category(minutes):
    """Return the display label for a movement duration given in minutes."""
    if minutes < 0.5:
        return 'Short (< 0.5 min)'
    if minutes < 0.7:
        return 'Quick (0.5-0.7 min)'
    if minutes < 1.0:
        return 'Medium (0.7-1.0 min)'
    return 'Long (≥ 1.0 min)'


if len(movement_df) == 0 or not movement_df['has_movement'].any():
    print("❌ No movement data available for detailed comparison")
else:
    print("📋 MOVEMENT DURATION COMPARISON TABLE")
    print("=" * 80)

    # Runs with movement, shortest first, so the positional rank below
    # matches the sort order
    comparison_df = movement_df[movement_df['has_movement']].sort_values('duration_seconds').copy()
    comparison_df['rank'] = list(range(1, len(comparison_df) + 1))
    comparison_df['performance_category'] = comparison_df['duration_minutes'].map(_duration_category)

    # Display copy with human-readable column titles and rounded durations
    display_cols = ['rank', 'run_id', 'file_name', 'duration_minutes', 'duration_seconds', 'performance_category']
    display_titles = ['Rank', 'Run ID', 'File Name', 'Duration (min)', 'Duration (sec)', 'Category']
    display_df = comparison_df[display_cols].rename(columns=dict(zip(display_cols, display_titles)))
    display_df['Duration (min)'] = display_df['Duration (min)'].round(2)
    display_df['Duration (sec)'] = display_df['Duration (sec)'].round(1)

    print("\nMovement Duration Rankings (shortest to longest):")
    print("-" * 80)

    # Fixed-width table: header cells use the same widths as the row cells
    header = " ".join([
        f"{'Rank':>4}",
        f"{'Run':>4}",
        f"{'Duration':>12}",
        f"{'Duration':>12}",
        f"{'Category':>20}",
        f"{'File Name':>15}",
    ])
    print(header)
    print("-" * 80)

    for _, row in display_df.iterrows():
        timing = f"{row['Duration (min)']:8.2f} min {row['Duration (sec)']:8.1f} sec"
        print(f"{row['Rank']:4d} {row['Run ID']:4d} {timing} {row['Category']:>20} {row['File Name'][:15]:>15}")

    # Statistical summary
    print("\n📈 STATISTICAL SUMMARY")
    print("=" * 40)

    durations = comparison_df['duration_seconds']
    shortest_s = durations.min()
    longest_s = durations.max()
    print(f"Number of runs with movement: {len(comparison_df)}")
    print(f"Average duration: {durations.mean():.2f} ± {durations.std():.2f} seconds")
    print(f"Median duration: {durations.median():.2f} seconds")
    print(f"Range: {shortest_s:.1f} - {longest_s:.1f} seconds")
    print(f"Duration span: {longest_s - shortest_s:.1f} seconds")

    # Category breakdown
    print("\n🏷️  CATEGORY BREAKDOWN")
    print("-" * 30)
    category_counts = comparison_df['performance_category'].value_counts()
    for category, count in category_counts.items():
        percentage = (count / len(comparison_df)) * 100
        print(f"{category:>20}: {count:2d} runs ({percentage:5.1f}%)")

    # Identify potential outliers with the 1.5 * IQR rule
    q1, q3 = durations.quantile(0.25), durations.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outlier_mask = (durations < lower_bound) | (durations > upper_bound)
    outliers = comparison_df[outlier_mask]

    if len(outliers) == 0:
        print("\n✅ No significant outliers detected")
    else:
        print(f"\n⚠️  POTENTIAL OUTLIERS ({len(outliers)} runs)")
        print("-" * 40)
        for _, outlier in outliers.iterrows():
            if outlier['duration_seconds'] < lower_bound:
                outlier_type = "Short"
            else:
                outlier_type = "Long"
            print(f"Run {outlier['run_id']:3d}: {outlier['duration_seconds']:6.1f}s ({outlier_type}) - {outlier['file_name']}")

    # Tidy copy of the results; nothing is written to disk here
    export_cols = ['run_id', 'file_name', 'start_time', 'end_time',
                   'duration_seconds', 'duration_minutes', 'performance_category']
    export_df = comparison_df[export_cols].copy()

    print(f"\n💾 Results summary prepared for export ({len(export_df)} runs)")
    print("   Available data: run_id, file_name, start_time, end_time, duration_seconds, duration_minutes, category")

    # Show final summary. comparison_df is sorted by duration, so the first and
    # last rows are the extremes and the middle row is the (upper) median run;
    # with a single run the middle index is 0.
    fastest_run = comparison_df.iloc[0]
    slowest_run = comparison_df.iloc[-1]
    most_consistent = comparison_df.iloc[len(comparison_df) // 2]

    variation_pct = ((slowest_run['duration_seconds'] - fastest_run['duration_seconds']) / fastest_run['duration_seconds'] * 100)

    print("\n🎯 KEY FINDINGS")
    print("=" * 40)
    print(f"Fastest run: Run {fastest_run['run_id']} ({fastest_run['duration_seconds']:.1f}s)")
    print(f"Slowest run: Run {slowest_run['run_id']} ({slowest_run['duration_seconds']:.1f}s)")
    print(f"Most typical: Run {most_consistent['run_id']} ({most_consistent['duration_seconds']:.1f}s)")
    print(f"Time variation: {variation_pct:.1f}% difference between fastest and slowest")

    # Store results in variable for potential further analysis
    movement_comparison_results = export_df