In [None]:
# Show which `python` executable the notebook's shell resolves on PATH
!which python

In [None]:
import glob

import multiqc
import polars as pl

# Find raw data directories (MultiQC needs raw logs, not parquet).
# NOTE(review): absolute, machine-specific path — adjust for your own checkout.
test_data_root = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData"

# glob.glob() returns matches in arbitrary (filesystem) order, so sort before
# slicing; otherwise the "first two" directories can differ between runs/machines.
fastqc_dirs = sorted(glob.glob(f"{test_data_root}/data/modules/fastqc/v*"))[:2]
fastp_dirs = sorted(glob.glob(f"{test_data_root}/data/modules/fastp/*"))[:1]

print(f"Found {len(fastqc_dirs)} FastQC and {len(fastp_dirs)} fastp directories")

In [None]:
# Read one MultiQC parquet export into a Polars DataFrame
# (the directory name suggests this is the fastp v1.31.0 run).
df_single = pl.read_parquet(
    "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastp_v1_31_0/multiqc_data/multiqc.parquet"
)
# TODO: drop NA columns for easier viewing (not implemented in this cell)
# TODO: remove duplicates (not implemented in this cell)
# Raise the Polars table display limit to 100 rows (global setting for the session)
pl.Config.set_tbl_rows(100)
# Display only rows whose `anchor` is "general_stats_table", ordered by `sample`
df_single.filter(pl.col("anchor") == "general_stats_table").sort("sample")

In [None]:
# NOTE(review): `df_combined` is not defined in any cell visible here, so this
# raises NameError on a fresh kernel — define it in an earlier cell or remove this one.
df_combined[[]]

In [None]:
import multiqc

# Method 3: Load existing parquet files directly (YOUR USE CASE!)
# reset() is called first — presumably to clear anything loaded by earlier cells.
multiqc.reset()

# Load individual parquet files
fastqc_parquet = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_31_0_single/multiqc_data/multiqc.parquet"
# NOTE(review): this variable is named `fastp_parquet`, but the path points at a
# FastQC output directory (`multiqc_output_fastqc_v1_31_0_barcode01`) — confirm
# whether the path or the name (and the "fastp" messages below) is wrong.
fastp_parquet = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_31_0_barcode01/multiqc_data/multiqc.parquet"

# Parse each parquet file; the sample count is printed after each load so the
# effect of adding the second file is visible.
multiqc.parse_logs(fastqc_parquet)
print(f"After loading FastQC: {len(multiqc.list_samples())} samples")

multiqc.parse_logs(fastp_parquet)
print(f"After adding fastp: {len(multiqc.list_samples())} samples total")
# Summarise what the combined session now exposes
print(f"Combined modules: {multiqc.list_modules()}")
print(f"Available plots: {list(multiqc.list_plots().keys())}")
print(f"Available samples: {multiqc.list_samples()}")

In [None]:
# Plotly Express: Programmatically show/hide samples
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Build a synthetic long-format table: 50 noisy sine-wave points per sample,
# with each sample shifted upwards by its position in `samples`.
np.random.seed(42)
samples = ["Sample_A", "Sample_B", "Sample_C", "Sample_D", "Sample_E"]
data = []
for offset, sample in enumerate(samples):
    for i in range(50):
        y_value = np.random.normal(0, 1) + np.sin(i / 5) + offset
        data.append(dict(x=i, y=y_value, sample=sample))
df = pd.DataFrame(data)

# Method 1: Set visibility on creation
fig = px.line(df, x="x", y="y", color="sample", title="Method 1: Initial visibility")

# Samples B and D start out hidden
initially_hidden = ("Sample_B", "Sample_D")
for trace in fig.data:
    if trace.name in initially_hidden:
        trace.visible = False  # or 'legendonly'

fig.show()

In [None]:
# Method 2: Update visibility after creation
# Start from a figure in which every sample trace has its default visibility.
fig2 = px.line(df, x="x", y="y", color="sample", title="Method 2: Update visibility")


# Function to show/hide samples
def toggle_samples(fig, samples_to_hide):
    """Show or hide every trace of ``fig`` according to its name.

    Args:
        fig: Figure-like object exposing ``for_each_trace``; each trace must
            have a ``name`` attribute and an ``update`` method.
        samples_to_hide: Container of trace names that should be hidden.
            Every other trace is explicitly made visible.

    Returns:
        The same ``fig`` object, modified in place.
    """

    def _apply_visibility(trace):
        # A trace is visible exactly when its name is absent from the hide list
        return trace.update(visible=trace.name not in samples_to_hide)

    fig.for_each_trace(_apply_visibility)
    return fig


# Hide Sample_C and Sample_E (all other traces are set to visible)
fig2 = toggle_samples(fig2, ["Sample_C", "Sample_E"])
fig2.show()

In [None]:
# Method 3: Using buttons for interactive show/hide
fig3 = px.line(df, x="x", y="y", color="sample", title="Method 3: Interactive buttons")

# NOTE: the buttons address traces by index 0..len(samples)-1, i.e. this assumes
# the figure holds one trace per entry of `samples`, in the same order.
n_traces = len(samples)


def _restyle_button(label, visible):
    """Build one dropdown entry that restyles `visible` on all sample traces."""
    return dict(label=label, method="restyle", args=[{"visible": visible}, list(range(n_traces))])


# Bulk toggles first, then one "solo" entry per sample
buttons = [
    _restyle_button("Show All", True),
    _restyle_button("Hide All", False),
]
for i, sample in enumerate(samples):
    # True only at this sample's own position
    visible_list = [j == i for j in range(n_traces)]
    buttons.append(_restyle_button(f"Show only {sample}", visible_list))

dropdown = dict(type="dropdown", direction="down", x=1.0, y=1.0, showactive=True, buttons=buttons)
fig3.update_layout(updatemenus=[dropdown])

fig3.show()

In [None]:
# Method 4: legendonly - samples appear in legend but not in plot
fig4 = px.line(df, x="x", y="y", color="sample", title="Method 4: Legend-only visibility")

# These traces keep their legend entry but are not drawn until clicked
legend_only_names = ("Sample_A", "Sample_C")
for trace in fig4.data:
    if trace.name in legend_only_names:
        trace.visible = "legendonly"

fig4.show()

# Recap of the three values `visible` can take
for summary_line in (
    "Summary of visibility options:",
    "- visible=True: Show trace",
    "- visible=False: Hide trace completely",
    "- visible='legendonly': Show in legend, hidden in plot (click to toggle)",
):
    print(summary_line)

In [None]:
# Test programmatic show/hide on MultiQC plots
# Relies on the MultiQC session populated by an earlier cell (parse_logs).
print("Current MultiQC samples:", multiqc.list_samples())
print("Available plots:", list(multiqc.list_plots().keys()))

# Get a MultiQC plot
# Best-effort probe: any failure is reported together with the current session
# state instead of stopping the notebook.
try:
    # Try fastqc plot first
    # NOTE: if no "fastqc" entry exists, nothing below runs and nothing is printed.
    if "fastqc" in multiqc.list_plots():
        fastqc_plots = multiqc.list_plots()["fastqc"]
        print(f"FastQC plots available: {fastqc_plots[:3]}...")

        # Get a specific plot
        plot = multiqc.get_plot("fastqc", "Sequence Counts")
        print(f"Got plot type: {type(plot)}")

        # Show the original plot
        plot.show()

except Exception as e:
    print(f"Error getting plot: {e}")
    print("Let's check what's in the current MultiQC state:")
    print(f"Samples: {multiqc.list_samples()}")
    print(f"Modules: {multiqc.list_modules()}")

In [None]:
# import multiqc

# # Test programmatic show/hide on MultiQC FastQC plots
# multiqc.reset()

# # Load FastQC data
# fastqc_parquet = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_31_0/multiqc_data/multiqc.parquet"
# multiqc.parse_logs(fastqc_parquet)

# print("FastQC samples:", multiqc.list_samples()[:5])  # First 5 samples
# print("Available FastQC plots:", multiqc.list_plots()["fastqc"][:3])

# Get a FastQC plot and access its figure
# NOTE: the reset/load code above is commented out, so this depends on the
# MultiQC session left behind by an earlier cell.
plot = multiqc.get_plot("fastqc", "Sequence Counts")
fig = plot.get_figure(dataset_id="")  # Use empty string for default dataset

print(f"\nPlotly figure type: {type(fig)}")
print(f"Number of traces: {len(fig.data)}")

# Show trace names (these are the data series, not individual samples in this plot type)
for i, trace in enumerate(fig.data):
    print(f"  Trace {i}: {trace.name}")

# Show original plot
fig.show()

In [None]:
# Method 1: Hide specific data series in FastQC plot

# Fetch a fresh figure for the "Sequence Counts" plot of the fastqc module
plot = multiqc.get_plot("fastqc", "Sequence Counts")
fig = plot.get_figure(dataset_id="")

print("Original data series in plot:")
for series in fig.data:
    print(f"  - {series.name}")

# These names refer to data series (data types), not to samples
series_to_hide = ["Duplicate Reads", "Unique Reads"]  # Hide these series
print(f"\nHiding data series: {series_to_hide}")

# Every trace gets an explicit visibility, and the decision is logged per trace
for trace in fig.data:
    is_hidden = trace.name in series_to_hide
    trace.visible = not is_hidden
    if is_hidden:
        print(f"  Hidden: {trace.name}")
    else:
        print(f"  Visible: {trace.name}")

fig.update_layout(title="FastQC Plot - Some data series hidden")
fig.show()

print("\n✅ Note: This plot shows data series (Total/Unique/Duplicate), not individual samples")

In [None]:
# Method 2: Try a different FastQC plot that shows individual samples
plot = multiqc.get_plot("fastqc", "Per Sequence Quality Scores")
fig = plot.get_figure(dataset_id="")

print("Per Sequence Quality Scores plot:")
print(f"Number of traces: {len(fig.data)}")

# Check if this plot has individual sample traces
sample_traces = []
for i, trace in enumerate(fig.data[:10]):  # Check first 10 traces
    name = getattr(trace, "name", f"Trace_{i}")
    sample_traces.append(name)
    print(f"  Trace {i}: {name}")

# If we have sample-level traces, demonstrate hiding specific samples
if len(fig.data) > 3:  # More than just summary traces
    print(f"\n✅ This plot has {len(fig.data)} traces - likely individual samples!")

    # Hide first 5 samples
    samples_to_hide = sample_traces[:5]
    print(f"Hiding first 5 samples: {samples_to_hide}")

    # Hide exactly the traces announced above, i.e. the FIRST five.
    # (The earlier slice `fig.data[5:]` hid every trace except those five,
    # contradicting both the printed message and the figure title.)
    for trace in fig.data[:5]:
        trace.visible = False

    fig.update_layout(title="FastQC Per-Sample Plot - First 5 samples hidden")
    fig.show()
else:
    print("This plot also shows data series, not individual samples")
    fig.show()

In [None]:
# Option 5 FIXED: Logo outside plot area with proper margins
def add_header_logo_fixed(fig, logo_source, logo_size_px=50):
    """Attach a logo image above the top-left corner of the plot area.

    The figure's margins are replaced with fixed values so there is room for
    an image positioned outside the plot area, then the image is added in
    paper coordinates.

    Args:
        fig: Figure-like object with ``layout.width`` / ``layout.height``,
            ``update_layout`` and ``add_layout_image``.
        logo_source: Value passed through unchanged as the image ``source``.
        logo_size_px: Requested logo size; divided by the figure width/height
            to obtain ``sizex`` / ``sizey``.

    Returns:
        The same ``fig`` object, modified in place.
    """
    # Fall back to 700x450 when the figure has no explicit size set
    fig_width = fig.layout.width or 700
    fig_height = fig.layout.height or 450

    # Margins must be set so that an image outside the plot area has space
    fig.update_layout(margin=dict(l=80, r=50, t=80, b=50))

    logo_image = dict(
        source=logo_source,
        xref="paper",
        yref="paper",
        # Slightly left of and above the plot area's top-left corner
        x=-0.08,
        y=1.06,
        sizex=logo_size_px / fig_width,
        sizey=logo_size_px / fig_height,
        xanchor="left",
        yanchor="bottom",
        opacity=0.8,
        layer="above",
    )
    fig.add_layout_image(logo_image)

    return fig

# Test the FIXED version
# Depends on the MultiQC session loaded by an earlier cell.
print("Testing FIXED header logo (with margins):")
plot = multiqc.get_plot("fastqc", "Sequence Counts")
fig = plot.get_figure(dataset_id="")

# NOTE(review): `logo_base64` is not defined in any cell visible here; it must
# exist in the kernel before this cell runs, otherwise this raises NameError.
fig = add_header_logo_fixed(fig, logo_base64, logo_size_px=40)
fig.update_layout(title="FIXED: Logo outside plot area")
fig.show()

print("✅ Logo should now be visible outside the plot area!")

In [None]:
# Display the plots MultiQC currently reports (inspected as a dict elsewhere in this notebook)
multiqc.list_plots()

In [None]:
# Alternative: Logo just inside plot area (simpler, no margin changes needed)
def add_corner_logo_simple(fig, logo_source, logo_size_px=45):
    """Overlay a logo just inside the top-right corner of the plot area.

    Unlike a header logo placed outside the plot area, this needs no margin
    changes: the image is anchored by its top-right corner at paper
    coordinates (0.95, 0.95), so it extends inward from that point.

    Args:
        fig: Figure-like object with ``layout.width`` / ``layout.height`` and
            ``add_layout_image``.
        logo_source: Value passed through unchanged as the image ``source``.
        logo_size_px: Requested logo size; divided by the figure width/height
            (700x450 when unset) to obtain ``sizex`` / ``sizey``.

    Returns:
        The same ``fig`` object, modified in place.
    """
    width = fig.layout.width or 700
    height = fig.layout.height or 450
    sizex = logo_size_px / width
    sizey = logo_size_px / height

    fig.add_layout_image(
        dict(
            source=logo_source,
            xref="paper", yref="paper",
            x=0.95, y=0.95,  # Just inside the top-right corner
            sizex=sizex, sizey=sizey,
            # Anchor on the image's right edge: with xanchor="left" the logo
            # started at x=0.95 and grew rightwards, past the plot area.
            xanchor="right", yanchor="top",
            opacity=0.6,
            layer="above",
        )
    )
    return fig

# Test simple corner approach
# Depends on the MultiQC session loaded by an earlier cell.
plot = multiqc.get_plot("fastqc", "Per Base Sequence Content")
fig = plot.get_figure(dataset_id="Counts")
# NOTE(review): `logo_base64` is not defined in any cell visible here; it must
# exist in the kernel before this cell runs, otherwise this raises NameError.
fig = add_corner_logo_simple(fig, logo_base64, logo_size_px=80)
fig.update_layout(title="Simple: Logo in corner (no margin changes)")
fig.show()

print("✅ Simple corner approach - always works!")

In [None]:
# Display the tables available in the current MultiQC session
multiqc.list_tables()

In [None]:
  # Test MultiQC violin plot functionality
  # NOTE(review): every line of this cell is indented by two spaces. IPython
  # strips a cell's common leading indentation, but the cell is not valid as
  # plain Python (e.g. after exporting the notebook to a .py script).
  import multiqc
  import plotly.graph_objects as go

  # Load FastQC data
  # reset() + parse_logs() replaces whatever earlier cells loaded, so cells run
  # after this one see only this parquet file's data.
  multiqc.reset()
  fastqc_parquet ="/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_31_0/multiqc_data/multiqc.parquet"
  multiqc.parse_logs(fastqc_parquet)

  print("Available FastQC plots:")
  plots = multiqc.list_plots()["fastqc"]
  # NOTE: the loop variable rebinds the notebook-level name `plot` (used by
  # other cells for plot objects) to the last listed entry.
  for i, plot in enumerate(plots):
      print(f"  {i}: {plot}")

  # Get the violin plot
  # NOTE(review): assumes "Top overrepresented sequences" is rendered as a
  # violin plot — the printed type below is the way to confirm.
  print("\n=== Getting MultiQC Violin Plot ===")
  violin_plot = multiqc.get_plot("fastqc", "Top overrepresented sequences")
  print(f"Violin plot type: {type(violin_plot)}")

  # Get the figure and show it
  fig = violin_plot.get_figure(dataset_id="")
  print(f"Figure type: {type(fig)}")
  print(f"Number of traces: {len(fig.data)}")

  # Show the plot
  fig.show()

In [None]:
# Build the frame in this cell so that displaying it does not depend on a later
# cell having been run first (on a fresh kernel the bare name raised NameError).
# Same derivation as the plotting cells below: general-stats rows of `df_single`,
# ordered by sample, converted to pandas, rows without `dt_anchor` dropped.
general_stats_df = (
    df_single.filter(pl.col("anchor") == "general_stats_table")
    .sort("sample")
    .to_pandas()
    .dropna(subset=["dt_anchor"])
)
general_stats_df

In [None]:
# Convert to violin plot using Plotly directly
import plotly.express as px
import plotly.graph_objects as go

# Pull the general-stats rows out of the Polars frame (ordered by sample), hand
# them to pandas, and discard rows that have no `dt_anchor`.
general_stats_df = (
    df_single.filter(pl.col("anchor") == "general_stats_table")
    .sort("sample")
    .to_pandas()
    .dropna(subset=["dt_anchor"])
)
print(general_stats_df.columns)

# One horizontal violin (with an inner box plot) per metric
violin_options = dict(y="metric", x="val_raw", box=True, title="General Statistics Distribution")
fig = px.violin(general_stats_df, **violin_options)
fig.show()

In [None]:
# Create MultiQC-style violin plot from general stats data - EXACT MATCH
import plotly.graph_objects as go
import json

# Get the general stats data: general-stats rows of `df_single`, ordered by
# sample, as a pandas frame without rows lacking `dt_anchor`.
general_stats_df = df_single.filter(pl.col("anchor") == "general_stats_table").sort("sample").to_pandas()
general_stats_df = general_stats_df.dropna(subset=["dt_anchor"])

# Get unique metrics (one violin row per metric)
metrics = general_stats_df['metric'].unique()
print(f"Found metrics: {metrics}")

# Constants from MultiQC (values as stated by the original author)
VIOLIN_HEIGHT = 70  # Height contributed by each violin row
EXTRA_HEIGHT = 63  # Fixed extra height added once per figure


def _axis_style():
    """Return a fresh dict with the font/grid colours shared by every axis."""
    return {
        'tickfont': {'size': 9, 'color': 'rgba(0,0,0,0.5)'},  # Grey small font
        'gridcolor': 'rgba(0,0,0,0.1)',
        'zerolinecolor': 'rgba(0,0,0,0.1)',
    }


# Create layout: a single-column grid with one independent subplot per metric
layout = go.Layout(
    title="General Statistics",
    showlegend=False,
    template="plotly_white",  # Clean white background
    margin=dict(pad=0, b=40, t=50, l=100, r=20),  # MultiQC margins
    grid={
        'rows': len(metrics),
        'columns': 1,
        'roworder': 'top to bottom',
        'pattern': 'independent',
        'ygap': 0.4,  # Reduced padding between violins
        # First subplot uses the unsuffixed axes ("xy"), later ones "x2y2", "x3y3", ...
        'subplots': [[(f"x{i + 1}y{i + 1}" if i > 0 else "xy")] for i in range(len(metrics))]
    },
    height=VIOLIN_HEIGHT * len(metrics) + EXTRA_HEIGHT,
    xaxis={'automargin': False, **_axis_style()},
    yaxis=_axis_style(),
    violingap=0,  # No gap between violins
)

# Create figure
fig = go.Figure(layout=layout)

# Process each metric
for metric_idx, metric in enumerate(metrics):
    # Get values for this metric
    metric_data = general_stats_df[general_stats_df['metric'] == metric]
    values = metric_data['val_raw'].tolist()
    # Deliberately NOT called `samples`: that notebook-level name holds the
    # synthetic sample list that the Plotly Express button cell indexes by.
    sample_names = metric_data['sample'].tolist()

    # Get metric title and check if it's percentage
    metric_title = metric
    is_percentage = False
    try:
        column_meta = json.loads(metric_data.iloc[0]['column_meta'])
        metric_title = column_meta.get('title', metric)
        # Check if it's a percentage metric
        if 'percent' in metric.lower() or column_meta.get('suffix') == '%':
            is_percentage = True
    except (TypeError, ValueError, KeyError, AttributeError):
        # Best effort: missing column (KeyError), non-string or unparsable JSON
        # (TypeError / ValueError incl. JSONDecodeError) or JSON that is not an
        # object (AttributeError) — keep the raw metric name and treat the
        # metric as non-percentage. Anything else is a real error and propagates.
        pass

    # Axis id suffix for this metric: "" for the first subplot, "2", "3", ... after
    axis_key = "" if metric_idx == 0 else str(metric_idx + 1)

    # Configure X-axis with proper limits
    x_axis_config = {
        'automargin': False,
        **_axis_style(),
        'title': "",
        'hoverformat': '.2f'
    }

    # Set range based on metric type
    if is_percentage:
        x_axis_config['range'] = [0, 100]  # 0-100% for percentage distributions
        x_axis_config['ticksuffix'] = '%'
    else:
        # Lower bound is 0 unless the data contains negative values
        min_val = min(0, min(values))
        max_val = max(values)
        # Add 5% of the span on each side
        range_pad = (max_val - min_val) * 0.05
        x_axis_config['range'] = [min_val - range_pad, max_val + range_pad]

    layout[f"xaxis{metric_idx + 1}"] = x_axis_config

    # Configure Y-axis with metric label: a single tick at the violin's
    # position, labelled with the (space-padded) metric title
    label_pad = "  "  # MultiQC style padding
    layout[f"yaxis{metric_idx + 1}"] = {
        'automargin': True,
        **_axis_style(),
        'tickmode': 'array',
        'tickvals': [metric_idx],
        'ticktext': [f"{label_pad}{metric_title}{label_pad}"],
        'range': [metric_idx - 0.4, metric_idx + 0.4]
    }

    # Add violin trace - grey color as per MultiQC
    fig.add_trace(
        go.Violin(
            x=values,
            y=[metric_idx] * len(values),
            name=metric_title,
            text=sample_names,
            xaxis=f"x{axis_key}",
            yaxis=f"y{axis_key}",
            orientation='h',
            side='both',
            box={'visible': True},  # Show box plot
            meanline={'visible': True},  # Show mean line
            fillcolor='#b5b5b5',  # Grey color - EXACT MultiQC
            line={'width': 2, 'color': '#b5b5b5'},  # Grey line
            opacity=0.5,
            points=False,  # Don't show points from violin
            hoveron='points',  # Only hover on scatter points
            showlegend=False
        )
    )

    # Add scatter points - blue color as per MultiQC when all violins are grey
    fig.add_trace(
        go.Scatter(
            x=values,
            y=[metric_idx] * len(values),
            mode='markers',
            text=sample_names,
            xaxis=f"x{axis_key}",
            yaxis=f"y{axis_key}",
            marker={
                'size': 4,
                'color': '#0b79e6',  # Blue - EXACT MultiQC color when violins are grey
            },
            showlegend=False,
            hovertemplate='<b>%{text}</b><br>%{x}<extra></extra>',
            hoverlabel={'bgcolor': 'white'}
        )
    )

# Update final layout: the per-metric axes were added to `layout` after the
# figure was created, so push them into the figure now
fig.update_layout(layout)

# Show the plot
fig.show()

print(f"✅ Created exact MultiQC-style violin plot with {len(metrics)} metrics and {len(general_stats_df['sample'].unique())} samples")
print("✅ Styling: plotly_white template, grey violins (#b5b5b5), blue points (#0b79e6), small grey fonts")
print("✅ Ranges: 0-100% for percentages, 0-based for other metrics")
