In [None]:
# Show which Python interpreter this kernel is actually running.
# `!which python` only reports the first `python` on the subshell's PATH, which can
# differ from the kernel's interpreter (e.g. a kernel registered from another env).
import sys

sys.executable

In [None]:
import glob

import multiqc
import polars as pl

# Find raw data directories (MultiQC needs raw logs, not parquet)
# NOTE(review): machine-specific absolute path — adjust to your own checkout.
test_data_root = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData"

# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so sort
# before slicing; otherwise the selected directories can change between runs.
fastqc_dirs = sorted(glob.glob(f"{test_data_root}/data/modules/fastqc/v*"))[:2]
fastp_dirs = sorted(glob.glob(f"{test_data_root}/data/modules/fastp/*"))[:1]

print(f"Found {len(fastqc_dirs)} FastQC and {len(fastp_dirs)} fastp directories")

In [None]:
# Allow up to 100 rows when a frame is rendered
pl.Config.set_tbl_rows(100)

# Load one MultiQC parquet export
df_single = pl.read_parquet(
    "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_complete_v1_30_0/multiqc_data/BETA-multiqc.parquet"
)

# General-stats rows only: keep four descriptive columns, drop rows that contain
# nulls, remove duplicate rows, and order the result by sample name.
(
    df_single.select(["creation_date", "sample", "anchor", "type"])
    .filter(pl.col("anchor") == "general_stats_table")
    .drop_nulls()
    .unique()
    .sort("sample")
)

In [None]:
# NOTE(review): `df_combined` is not assigned by any earlier cell; it is first created
# further down (read from "combined_from_parquet_data/multiqc.parquet"), so this cell
# raises NameError on a fresh top-to-bottom run.
df_combined[[]]

In [None]:
# Examine parquet structure
# NOTE(review): `df_fastqc` and `df_fastp` are not assigned in any cell visible in
# this notebook; presumably they were loaded in a cell that has since been removed —
# confirm and restore the load before running top to bottom.
print("FastQC columns:", df_fastqc.columns)
print("FastQC shape:", df_fastqc.shape)
print("\nFastp columns:", df_fastp.columns)
print("Fastp shape:", df_fastp.shape)

# Check unique types
print("\nFastQC unique types:", df_fastqc["type"].unique().to_list())
print("Fastp unique types:", df_fastp["type"].unique().to_list())

In [None]:
# Inspect the two pre-generated MultiQC output folders before trying parse_logs on them
import os

# Output folders of earlier MultiQC runs (one per tool)
fastqc_dir = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_31_0"
fastp_dir = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastp_v1_31_0"

# Peek at up to five entries per folder to see whether any log files are present
fastqc_listing = os.listdir(fastqc_dir)
print("FastQC dir contents:", fastqc_listing[:5])
fastp_listing = os.listdir(fastp_dir)
print("\nFastp dir contents:", fastp_listing[:5])

In [None]:
# Look for raw data to parse - MultiQC needs raw log files, not parquet
# Let's search for the actual test data
# NOTE(review): machine-specific absolute path — adjust to your own checkout.
test_data_root = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData"

# Find directories with raw data
import glob

# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so sort
# before slicing; otherwise the selected directories can change between runs.
fastqc_raw = sorted(glob.glob(f"{test_data_root}/data/modules/fastqc/v*"))[:2]
fastp_raw = sorted(glob.glob(f"{test_data_root}/data/modules/fastp/*"))[:2]

print("FastQC raw data dirs:", fastqc_raw)
print("Fastp raw data dirs:", fastp_raw)

In [None]:
# Parse multiple directories with raw data
# Starts from a reset MultiQC state, parses the first FastQC directory, then the
# first fastp directory, printing the sample and module lists after each step.
multiqc.reset()  # Reset the state

# Parse FastQC data
if fastqc_raw:
    multiqc.parse_logs(fastqc_raw[0])
    print(f"Parsed {fastqc_raw[0]}")
    print("Samples:", multiqc.list_samples())
    print("Modules:", multiqc.list_modules())

# Now try to add fastp data to the same report
# (no reset in between, so compare the two printouts to see what was accumulated)
if fastp_raw:
    multiqc.parse_logs(fastp_raw[0])
    print(f"\nAdded {fastp_raw[0]}")
    print("Samples after adding fastp:", multiqc.list_samples())
    print("Modules after adding fastp:", multiqc.list_modules())

In [None]:
# Export combined data to parquet
# Get the report object to export
# NOTE(review): confirm that `multiqc.get_report` and `multiqc.write_parquet` exist in
# the installed MultiQC version; other cells in this notebook obtain the parquet file
# through `multiqc.write_report(...)` instead.
report = multiqc.get_report()

# Export to parquet
if report:
    # Relative path: the file is written to the kernel's current working directory
    output_file = "combined_multiqc_report.parquet"
    multiqc.write_parquet(output_file)
    print(f"Exported combined report to {output_file}")

    # Verify the structure
    df_combined_new = pl.read_parquet(output_file)
    print(f"\nNew combined parquet shape: {df_combined_new.shape}")
    # Reads the `modules` column of rows whose `type` is "run_metadata"
    # (assumes both columns exist in the exported schema — TODO confirm)
    print(
        f"Unique modules in combined: {df_combined_new.filter(pl.col('type') == 'run_metadata')['modules'].to_list()}"
    )

In [None]:
# Alternative approach: parse multiple directories at once
multiqc.reset()

# Collect all directories to parse: up to two FastQC directories and one fastp
# directory. Slicing an empty list yields an empty list, so no guards are needed.
dirs_to_parse = [*fastqc_raw[:2], *fastp_raw[:1]]

print("Directories to parse:", dirs_to_parse)

# Parse all at once
if dirs_to_parse:
    # parse_logs takes each directory as its own positional argument (as in the
    # summary below and the `parse_logs(*(...))` call later in this notebook),
    # so the list has to be unpacked rather than passed as a single argument.
    multiqc.parse_logs(*dirs_to_parse)

    print("\nAfter parsing all directories:")
    print(f"Samples: {len(multiqc.list_samples())} samples")
    print(f"Modules: {multiqc.list_modules()}")
    print(f"Available plots: {list(multiqc.list_plots().keys())[:5]}...")  # Show first 5 plots

## Summary: How to combine MultiQC reports

### ✅ Method 1: Parse multiple raw data directories 
```python
multiqc.parse_logs(fastqc_dir1, fastqc_dir2, fastp_dir1)
multiqc.write_report()
```

### ✅ Method 2: Load existing parquet files (YOUR USE CASE!)
```python
multiqc.reset()
multiqc.parse_logs("report1/multiqc.parquet")
multiqc.parse_logs("report2/multiqc.parquet") 
multiqc.write_report()  # Creates combined report
```

### Method 3: Manual parquet concatenation (not recommended)
```python
df1 = pl.read_parquet("report1/multiqc.parquet") 
df2 = pl.read_parquet("report2/multiqc.parquet")
pl.concat([df1, df2]).write_parquet("combined.parquet")
```

**Method 2 is the one to use when you only have parquet files.** MultiQC loads them directly with `parse_logs`, so samples, modules and plots stay available and a combined report can be written as usual.

In [None]:
import multiqc

# Method 2 (as numbered in the summary above): load existing parquet files directly
# (YOUR USE CASE!)
multiqc.reset()

# Load individual parquet files
# NOTE(review): the variable names do not match the paths — `fastqc_parquet` points at
# the "complete_v1_30_0" output and `fastp_parquet` at a "fastqc_v1_31_0_barcode01"
# output. Another cell reads "BETA-multiqc.parquet" from the same v1_30_0 folder, so
# confirm that "multiqc.parquet" exists there.
fastqc_parquet = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_complete_v1_30_0/multiqc_data/multiqc.parquet"
fastp_parquet = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_31_0_barcode01/multiqc_data/multiqc.parquet"

# Parse each parquet file
multiqc.parse_logs(fastqc_parquet)
print(f"After loading FastQC: {len(multiqc.list_samples())} samples")

# NOTE(review): the second load is disabled, so nothing is added here and the
# "After adding fastp" line below reports the same state as the line above.
# multiqc.parse_logs(fastp_parquet)
print(f"After adding fastp: {len(multiqc.list_samples())} samples total")
print(f"Combined modules: {multiqc.list_modules()}")
print(f"Available plots: {list(multiqc.list_plots().keys())}")
print(f"Available samples: {multiqc.list_samples()}")

In [None]:
# Export the combined parquet-based report.
# force=True overwrites the output of a previous run. Without it MultiQC does not
# overwrite existing output, so on a re-run the fixed path read below could still be
# the stale file from the earlier run.
multiqc.write_report(output_dir=".", filename="combined_from_parquet.html", force=True)

# Check the new combined parquet (written into the report's "<name>_data" directory)
df_combined = pl.read_parquet("combined_from_parquet_data/multiqc.parquet")
print("\nCombined parquet from individual parquet files:")
print(f"Shape: {df_combined.shape}")
print(f"Unique samples: {len(df_combined['sample'].unique())}")

# Show difference between methods
# NOTE(review): the "87 rows, 9 samples" figures are hard-coded, presumably copied
# from an earlier run of the raw-logs cells; they are not recomputed here.
print("\nComparison:")
print("- Raw logs method: 87 rows, 9 samples")
print(
    f"- Parquet files method: {df_combined.shape[0]} rows, {len(df_combined['sample'].unique())} samples"
)

In [None]:
# Plotly Express: Programmatically show/hide samples
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Synthetic demo data: five samples with 50 points each. Every point is a sine
# wave plus unit Gaussian noise, shifted upwards by the sample's position in the list.
np.random.seed(42)
samples = ["Sample_A", "Sample_B", "Sample_C", "Sample_D", "Sample_E"]

# Sample-major / step-minor iteration keeps the random draws in a fixed order.
data = [
    {
        "x": step,
        "y": np.random.normal(0, 1) + np.sin(step / 5) + offset,
        "sample": name,
    }
    for offset, name in enumerate(samples)
    for step in range(50)
]
df = pd.DataFrame(data)

# Method 1: Set visibility on creation
fig = px.line(df, x="x", y="y", color="sample", title="Method 1: Initial visibility")

# Switch off the traces for samples B and D before the figure is first shown
initially_hidden = {"Sample_B", "Sample_D"}
for line in fig.data:
    if line.name in initially_hidden:
        line.visible = False  # "legendonly" is the alternative value

fig.show()

In [None]:
# Method 2: Update visibility after creation
# Builds a fresh line figure from `df`, coloured by the "sample" column; its trace
# visibility is changed afterwards by the helper defined below.
fig2 = px.line(df, x="x", y="y", color="sample", title="Method 2: Update visibility")


# Function to show/hide samples
def toggle_samples(fig, samples_to_hide):
    """Hide the traces named in ``samples_to_hide`` and show every other trace.

    Args:
        fig: Figure-like object exposing ``for_each_trace``; each trace must
            provide ``name`` and ``update(visible=...)``.
        samples_to_hide: Container of trace names to hide.

    Returns:
        The same ``fig`` object that was passed in, after its traces were updated.
    """

    def _set_visibility(trace):
        # Every trace gets an explicit boolean: False when listed, True otherwise.
        return trace.update(visible=trace.name not in samples_to_hide)

    fig.for_each_trace(_set_visibility)
    return fig


# Hide Sample_C and Sample_E
# toggle_samples returns the figure it was given, so rebinding fig2 keeps the same object.
fig2 = toggle_samples(fig2, ["Sample_C", "Sample_E"])
fig2.show()

In [None]:
# Method 3: Using buttons for interactive show/hide
fig3 = px.line(df, x="x", y="y", color="sample", title="Method 3: Interactive buttons")


def _restyle_button(label, visible):
    """Build one dropdown entry that restyles `visible` on all sample trace indices."""
    return {
        "label": label,
        "method": "restyle",
        "args": [{"visible": visible}, list(range(len(samples)))],
    }


# Dropdown entries: show everything, hide everything, then one "solo" entry per sample
buttons = [
    _restyle_button("Show All", True),
    _restyle_button("Hide All", False),
    *(
        # Boolean mask that is True only at this sample's own position
        _restyle_button(f"Show only {name}", [pos == shown for pos in range(len(samples))])
        for shown, name in enumerate(samples)
    ),
]

# Attach the entries as a single dropdown menu on the figure
fig3.update_layout(
    updatemenus=[
        {
            "type": "dropdown",
            "direction": "down",
            "x": 1.0,
            "y": 1.0,
            "showactive": True,
            "buttons": buttons,
        }
    ]
)

fig3.show()

In [None]:
# Method 4: legendonly - samples appear in legend but not in plot
fig4 = px.line(df, x="x", y="y", color="sample", title="Method 4: Legend-only visibility")

# Samples A and C stay listed in the legend but start out undrawn;
# clicking their legend entry toggles them.
legend_only_names = ("Sample_A", "Sample_C")
for line in (t for t in fig4.data if t.name in legend_only_names):
    line.visible = "legendonly"

fig4.show()

# Recap of the three values accepted by a trace's `visible` attribute
for summary_line in (
    "Summary of visibility options:",
    "- visible=True: Show trace",
    "- visible=False: Hide trace completely",
    "- visible='legendonly': Show in legend, hidden in plot (click to toggle)",
):
    print(summary_line)

In [None]:
# Test programmatic show/hide on MultiQC plots
# NOTE: this cell neither resets nor parses anything, so its output reflects whatever
# MultiQC state the previously executed cells left behind.
print("Current MultiQC samples:", multiqc.list_samples())
print("Available plots:", list(multiqc.list_plots().keys()))

# Get a MultiQC plot
try:
    # Try fastqc plot first
    if "fastqc" in multiqc.list_plots():
        fastqc_plots = multiqc.list_plots()["fastqc"]
        print(f"FastQC plots available: {fastqc_plots[:3]}...")

        # Get a specific plot
        plot = multiqc.get_plot("fastqc", "Sequence Counts")
        print(f"Got plot type: {type(plot)}")

        # Show the original plot
        plot.show()

# Deliberately broad: any failure above is reported together with the current
# samples/modules instead of stopping the notebook.
except Exception as e:
    print(f"Error getting plot: {e}")
    print("Let's check what's in the current MultiQC state:")
    print(f"Samples: {multiqc.list_samples()}")
    print(f"Modules: {multiqc.list_modules()}")

In [None]:
import multiqc

# Test programmatic show/hide on MultiQC FastQC plots
# Resets MultiQC, loads a single FastQC parquet export and inspects the figure
# behind the "Sequence Counts" plot.
multiqc.reset()

# Load FastQC data
# NOTE(review): machine-specific absolute path — adjust to your own checkout.
fastqc_parquet = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_31_0/multiqc_data/multiqc.parquet"
multiqc.parse_logs(fastqc_parquet)

print("FastQC samples:", multiqc.list_samples()[:5])  # First 5 samples
print("Available FastQC plots:", multiqc.list_plots()["fastqc"][:3])

# Get a FastQC plot and access its figure
plot = multiqc.get_plot("fastqc", "Sequence Counts")
fig = plot.get_figure(dataset_id="")  # Use empty string for default dataset

print(f"\nPlotly figure type: {type(fig)}")
print(f"Number of traces: {len(fig.data)}")

# Show trace names (these are the data series, not individual samples in this plot type)
for i, trace in enumerate(fig.data):
    print(f"  Trace {i}: {trace.name}")

# Show original plot
fig.show()

In [None]:
# Method 1: Hide specific data series in FastQC plot
# Fetches the figure again and sets `visible` on each trace according to its name.

plot = multiqc.get_plot("fastqc", "Sequence Counts")
fig = plot.get_figure(dataset_id="")

print("Original data series in plot:")
for trace in fig.data:
    print(f"  - {trace.name}")

# Hide specific data series (not samples, but data types)
# NOTE(review): these names must match the trace names printed above exactly;
# a name that does not occur in the figure is silently ignored.
series_to_hide = ["Duplicate Reads", "Unique Reads"]  # Hide these series
print(f"\nHiding data series: {series_to_hide}")

# Every trace gets an explicit value, so non-listed traces are forced visible
for trace in fig.data:
    if trace.name in series_to_hide:
        trace.visible = False
        print(f"  Hidden: {trace.name}")
    else:
        trace.visible = True
        print(f"  Visible: {trace.name}")

fig.update_layout(title="FastQC Plot - Some data series hidden")
fig.show()

print("\n✅ Note: This plot shows data series (Total/Unique/Duplicate), not individual samples")

In [None]:
# Alternative: Create new figure without modifying original
# Only traces whose name is listed are added to a fresh figure; the layout is
# then applied from the original figure.
# Relies on `go` (plotly.graph_objects) imported in the earlier Plotly Express cell.

plot = multiqc.get_plot("fastqc", "Sequence Counts")
original_fig = plot.get_figure(dataset_id="")

# Create new figure with selective traces
new_fig = go.Figure()

# NOTE(review): the name must match a trace name of the original figure exactly;
# if nothing matches, the new figure ends up with no traces.
series_to_show = ["Total Sequences"]  # Only show this series
print(f"Creating new figure showing only: {series_to_show}")

for trace in original_fig.data:
    if trace.name in series_to_show:
        new_fig.add_trace(trace)
        print(f"  Added: {trace.name}")

# Copy layout from original
new_fig.update_layout(original_fig.layout)
# Applied second, so this title replaces the one taken over from the original layout
new_fig.update_layout(title="FastQC Plot - Only Total Sequences shown")

new_fig.show()
print("\n✅ Created new figure without modifying original")

In [None]:
# Method 2: Try a different FastQC plot that shows individual samples
plot = multiqc.get_plot("fastqc", "Per Sequence Quality Scores")
fig = plot.get_figure(dataset_id="")

print("Per Sequence Quality Scores plot:")
print(f"Number of traces: {len(fig.data)}")

# Check if this plot has individual sample traces
sample_traces = []
for i, trace in enumerate(fig.data[:10]):  # Check first 10 traces
    name = getattr(trace, "name", f"Trace_{i}")
    sample_traces.append(name)
    print(f"  Trace {i}: {name}")

# If we have sample-level traces, demonstrate hiding specific samples
# (heuristic: more than three traces is taken to mean one trace per sample)
if len(fig.data) > 3:  # More than just summary traces
    print(f"\n✅ This plot has {len(fig.data)} traces - likely individual samples!")

    # Hide first 5 samples
    samples_to_hide = sample_traces[:5]
    print(f"Hiding first 5 samples: {samples_to_hide}")

    # Must be the leading slice `[:5]`: `[5:]` would hide every trace except the
    # first five, the opposite of what the message above and the title announce.
    for trace in fig.data[:5]:
        trace.visible = False

    fig.update_layout(title="FastQC Per-Sample Plot - First 5 samples hidden")
    fig.show()
else:
    print("This plot also shows data series, not individual samples")
    fig.show()

In [None]:
# Start from a reset state, then parse the raw FastQC and fastp directories
# (raw log files, not parquet) in a single call, one positional argument per directory
multiqc.reset()
multiqc.parse_logs(*fastqc_dirs, *fastp_dirs)

# Report how many samples and which modules were picked up
print(f"Parsed {len(multiqc.list_samples())} samples")
print(f"Found modules: {multiqc.list_modules()}")

In [None]:
# List all samples
# NOTE: this rebinds `samples`, which the Plotly Express demo cells use for their
# list of synthetic sample names; re-run the demo data cell before re-running those.
samples = multiqc.list_samples()
print(f"Samples ({len(samples)}): {samples}")

In [None]:
# List modules that were found
# Reflects the MultiQC state left by the most recent parse_logs call.
modules = multiqc.list_modules()
print(f"Modules: {modules}")

In [None]:
# List available plots
# `plots` is iterated as a mapping from module name to that module's plot list.
plots = multiqc.list_plots()
print(f"Available plots ({len(plots)}):")
for module, plot_list in plots.items():
    print(f"  {module}: {plot_list[:3]}...")  # Show first 3 plots per module

In [None]:
# Write the combined report (creates parquet automatically).
# force=True overwrites the output of a previous run. Without it MultiQC does not
# overwrite existing output, so on a re-run the fixed path read below could still be
# the stale file from the earlier run.
multiqc.write_report(output_dir=".", filename="combined_report.html", force=True)

# The parquet file is created at combined_report_data/multiqc.parquet
# NOTE: this rebinds `df` (a pandas frame in the Plotly Express demo cells) to a
# polars frame.
df = pl.read_parquet("combined_report_data/multiqc.parquet")
print(f"Parquet shape: {df.shape}")
print(f"Columns: {df.columns[:5]}...")

# Row count per distinct value of the `type` column
print("\nUnique types in parquet:")
for t in df["type"].unique().to_list():
    print(f"  - {t}: {len(df.filter(pl.col('type') == t))} rows")

In [None]:
# Render the FastQC "Per Base N Content" plot from the currently loaded MultiQC state
multiqc.get_plot("fastqc", "Per Base N Content").show()

In [None]:
# Render the FastQC "Per Sequence GC Content" plot, selecting its "Counts" dataset
multiqc.get_plot("fastqc", "Per Sequence GC Content").show(dataset_id="Counts")

In [None]:
# Render the FastQC "Sequence Counts" plot from the currently loaded MultiQC state
multiqc.get_plot("fastqc", "Sequence Counts").show()