In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from pathlib import Path

def _extract_size_rows(df, prefix, label):
    """Return one row per measured exuvia of a single size class.

    Keeps the rows of *df* whose ``<prefix>_total_length`` is present, renames
    the ``<prefix>_*`` measurement columns to size-agnostic names and tags
    every row with *label* in a ``size`` column.
    """
    column_map = {
        f'{prefix}_total_length': 'total_length',
        f'{prefix}_carapace_length': 'carapace_length',
        f'{prefix}_eye_x': 'eye_x',
        f'{prefix}_eye_y': 'eye_y',
    }
    has_measurement = df[f'{prefix}_total_length'].notna()
    rows = df.loc[has_measurement, ['image_name', *column_map]]
    return rows.rename(columns=column_map).assign(size=label)


def analyze_length_measurements(csv_path=None, image_width=5312, image_height=2988):
    """
    Analyze the length measurements data, separating by pond type and analyzing
    spatial distributions.

    The input CSV holds one row per image with ``big_*`` and ``small_*``
    measurement columns. It is reshaped to one row per measured exuvia,
    enriched with derived columns (``pond_type``, ``length_ratio``,
    ``norm_x``, ``norm_y``, ``center_dist``), summarized on stdout and passed
    to ``create_visualizations``.

    Args:
        csv_path: Path to the length analysis CSV. Defaults to the
            ``predict57`` run output used by this notebook.
        image_width: Pixel width used to normalize ``eye_x`` to the 0-1 range.
        image_height: Pixel height used to normalize ``eye_y`` to the 0-1 range.

    Returns:
        None. Results are printed and plots are written by
        ``create_visualizations``. The function returns early (after printing
        a message) when the CSV is missing or contains no measurements.
    """
    if csv_path is None:
        csv_path = "/Users/gilbenor/Documents/code projects/msc/counting_research_algorithms/runs/pose/predict57/length_analysis.csv"
    csv_path = Path(csv_path)
    if not csv_path.exists():
        print(f"Error: Could not find {csv_path}")
        return

    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} images from {csv_path}")

    # Each image row may carry a big measurement, a small one, or both;
    # reshape to one exuvia per row so both sizes share the same columns.
    big_df = _extract_size_rows(df, 'big', 'BIG')
    small_df = _extract_size_rows(df, 'small', 'SMALL')

    # Without any measurement the derived columns and the plots cannot be
    # built, so stop here instead of failing on a column-less frame.
    populated = [part for part in (big_df, small_df) if not part.empty]
    if not populated:
        print("No length measurements found; nothing to analyze")
        return
    analysis_df = pd.concat(populated, ignore_index=True)

    # Pond type is encoded in the image name: names containing 'GX010191'
    # are labelled 'Circle', everything else 'Square'.
    is_circle = analysis_df['image_name'].str.contains('GX010191', regex=False, na=False)
    analysis_df['pond_type'] = np.where(is_circle, 'Circle', 'Square')

    # Ratio of total length to carapace length
    analysis_df['length_ratio'] = analysis_df['total_length'] / analysis_df['carapace_length']

    # Normalized eye position (0-1 range for both axes)
    analysis_df['norm_x'] = analysis_df['eye_x'] / image_width
    analysis_df['norm_y'] = analysis_df['eye_y'] / image_height

    # Distance from the image center, measured in normalized coordinates
    # (each axis is scaled by its own image dimension).
    analysis_df['center_dist'] = np.sqrt(
        (analysis_df['norm_x'] - 0.5)**2 + (analysis_df['norm_y'] - 0.5)**2)

    # Print basic statistics
    print("\n===== Basic Statistics =====")
    print(f"Total detections: {len(analysis_df)}")
    print(f"Big exuviae: {len(big_df)}")
    print(f"Small exuviae: {len(small_df)}")

    # Statistics by pond type
    pond_stats = analysis_df.groupby(['pond_type', 'size']).agg({
        'total_length': ['count', 'mean', 'std', 'min', 'max'],
        'carapace_length': ['mean', 'std'],
        'length_ratio': ['mean', 'std']
    })

    print("\n===== Statistics by Pond Type =====")
    print(pond_stats)

    # Create visualizations
    create_visualizations(analysis_df)

def create_visualizations(df, output_dir=None):
    """Create interactive Plotly visualizations for the data.

    Writes one standalone HTML file per figure into *output_dir*:
    length distributions, length ratios, spatial distribution, length vs.
    distance from center, one position heatmap per pond type, a correlation
    matrix and a 2x2 summary dashboard.

    Args:
        df: One row per detection with the columns ``image_name``,
            ``total_length``, ``carapace_length``, ``length_ratio``,
            ``norm_x``, ``norm_y``, ``center_dist``, ``size``
            ('BIG'/'SMALL') and ``pond_type`` ('Circle'/'Square').
        output_dir: Directory for the HTML files. Defaults to the
            ``analysis_plots`` folder of the ``predict57`` run. It is created
            (including missing parents) if it does not exist.

    Returns:
        None.
    """
    if output_dir is None:
        output_dir = "/Users/gilbenor/Documents/code projects/msc/counting_research_algorithms/runs/pose/predict57/analysis_plots"
    output_dir = Path(output_dir)
    # parents=True so a missing run directory does not abort the whole export
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Length distributions by pond type and size
    fig_dist = px.histogram(
        df, x='total_length', color='size', facet_row='pond_type',
        marginal='box', opacity=0.7, barmode='overlay',
        title='Length Distributions by Pond Type and Size Category',
        labels={'total_length': 'Total Length (mm)', 'size': 'Size Category'}
    )
    fig_dist.update_layout(height=800)
    fig_dist.write_html(output_dir / "length_distributions.html")

    # 2. Total to carapace length ratio distributions
    # (title matches the plotted quantity: total_length / carapace_length)
    fig_ratio = px.box(
        df, x='pond_type', y='length_ratio', color='size',
        title='Total to Carapace Length Ratio by Pond Type',
        labels={'length_ratio': 'Total Length / Carapace Length',
                'pond_type': 'Pond Type',
                'size': 'Size Category'}
    )
    fig_ratio.write_html(output_dir / "length_ratios.html")

    # 3. Spatial distribution of detections
    fig_spatial = px.scatter(
        df, x='norm_x', y='norm_y', color='size',
        symbol='pond_type', size='total_length',
        hover_data=['image_name', 'total_length', 'carapace_length'],
        title='Spatial Distribution of Detections',
        labels={'norm_x': 'Normalized X Position',
                'norm_y': 'Normalized Y Position',
                'size': 'Size Category',
                'pond_type': 'Pond Type'}
    )
    # Add image frame for reference (unit square in normalized coordinates)
    fig_spatial.add_shape(type="rect",
        x0=0, y0=0, x1=1, y1=1,
        line=dict(color="Black", width=2),
        fillcolor="rgba(0,0,0,0)")

    fig_spatial.update_layout(
        xaxis=dict(range=[-0.05, 1.05]),
        yaxis=dict(range=[-0.05, 1.05], scaleanchor="x", scaleratio=1)
    )
    fig_spatial.write_html(output_dir / "spatial_distribution.html")

    # 4. Length vs. distance from center
    # NOTE(review): trendline='ols' is expected to need statsmodels installed
    # alongside plotly -- confirm in the target environment.
    fig_center = px.scatter(
        df, x='center_dist', y='total_length', color='size',
        facet_row='pond_type', trendline='ols',
        hover_data=['image_name'],
        title='Length vs. Distance from Center',
        labels={'center_dist': 'Distance from Image Center',
                'total_length': 'Total Length (mm)',
                'size': 'Size Category'}
    )
    fig_center.write_html(output_dir / "center_distance.html")

    # 5. Heatmap of measurements across the image (2D histogram), one per pond
    for pond in ('Circle', 'Square'):
        fig_heat = px.density_heatmap(
            df[df['pond_type'] == pond], x='norm_x', y='norm_y', z='total_length',
            histfunc='avg', nbinsx=10, nbinsy=10,
            title=f'Average Total Length by Position ({pond} Pond)',
            labels={'norm_x': 'Normalized X Position',
                    'norm_y': 'Normalized Y Position',
                    'total_length': 'Avg Total Length (mm)'}
        )
        fig_heat.write_html(output_dir / f"heatmap_{pond.lower()}.html")

    # 6. Correlation matrix
    corr_columns = ['total_length', 'carapace_length', 'length_ratio',
                    'norm_x', 'norm_y', 'center_dist']
    corr = df[corr_columns].corr()

    fig_corr = px.imshow(
        corr, text_auto=True, color_continuous_scale='RdBu_r',
        title='Correlation Matrix',
        labels={'x': 'Variable', 'y': 'Variable', 'color': 'Correlation'}
    )
    fig_corr.write_html(output_dir / "correlation.html")

    # 7. Summary dashboard with key findings
    summary_fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Size Distribution by Pond Type',
            'Length Ratio by Pond Type',
            'Total Length Distribution',
            'Spatial Distribution'
        )
    )

    # Size counts
    size_counts = df.groupby(['pond_type', 'size']).size().reset_index(name='count')
    summary_fig.add_trace(
        go.Bar(
            x=size_counts['pond_type'], y=size_counts['count'],
            marker_color=['royalblue' if s == 'BIG' else 'firebrick' for s in size_counts['size']],
            text=size_counts['size'],
            name=''
        ),
        row=1, col=1
    )

    # Length ratios
    for size in ('BIG', 'SMALL'):
        color = 'royalblue' if size == 'BIG' else 'firebrick'
        size_df = df[df['size'] == size]
        summary_fig.add_trace(
            go.Box(
                x=size_df['pond_type'], y=size_df['length_ratio'],
                name=size, marker_color=color
            ),
            row=1, col=2
        )

    # Length distributions
    for pond, color in zip(['Circle', 'Square'], ['royalblue', 'firebrick']):
        pond_df = df[df['pond_type'] == pond]
        summary_fig.add_trace(
            go.Histogram(
                x=pond_df['total_length'], name=pond, marker_color=color, opacity=0.7
            ),
            row=2, col=1
        )

    # Spatial distribution (simplified)
    for size, color in zip(['BIG', 'SMALL'], ['royalblue', 'firebrick']):
        size_df = df[df['size'] == size]
        summary_fig.add_trace(
            go.Scatter(
                x=size_df['norm_x'], y=size_df['norm_y'],
                mode='markers', marker_color=color, name=size,
                marker=dict(size=5)
            ),
            row=2, col=2
        )

    # Add reference frame for spatial plot
    summary_fig.add_shape(
        type="rect", x0=0, y0=0, x1=1, y1=1,
        line=dict(color="Black", width=1),
        fillcolor="rgba(0,0,0,0)",
        row=2, col=2
    )

    summary_fig.update_layout(
        height=800, width=1000,
        title_text="Exuviae Measurement Analysis Summary",
        showlegend=False
    )
    summary_fig.update_xaxes(range=[-0.05, 1.05], row=2, col=2)
    summary_fig.update_yaxes(range=[-0.05, 1.05], row=2, col=2)

    summary_fig.write_html(output_dir / "summary_dashboard.html")

    print(f"\nVisualizations saved to {output_dir}")
    print("Open the HTML files in a browser to interact with the plots")


# Entry point for this cell: load the CSV, print the summary statistics
# and write the HTML plots.
analyze_length_measurements()

Loaded 95 images from /Users/gilbenor/Documents/code projects/msc/counting_research_algorithms/runs/pose/predict57/length_analysis.csv

===== Basic Statistics =====
Total detections: 125
Big exuviae: 55
Small exuviae: 70

===== Statistics by Pond Type =====
                total_length                                       \
                       count        mean        std    min    max   
pond_type size                                                      
Circle    BIG             49  194.036735  10.974966  175.8  213.7   
          SMALL           35  152.982857  11.140136  127.6  173.6   
Square    BIG              6  192.216667  13.387519  175.6  209.4   
          SMALL           35  152.142857  11.944339  117.7  173.5   

                carapace_length           length_ratio            
                           mean       std         mean       std  
pond_type size                                                    
Circle    BIG         45.665306  3.142819     4.261868  0