In [None]:
# RECOMMENDED: export the sample list restricted to general_stats_table rows.
# NOTE: this cell reads `df` and `pl`, which are created in the
# "Setup and Data Loading" cell -- run that cell first.
def _is_real_sample(name):
    """Return True unless `name` is None, the literal string "null", or empty."""
    return not (name is None or name == "null" or name == "")


if not ('sample' in df.columns and 'anchor' in df.columns):
    print("❌ Required columns 'sample' or 'anchor' not found!")
else:
    print("=== RECOMMENDED APPROACH: Filter by general_stats_table ===")

    # Same filtering the dashboard uses: keep general_stats_table rows only,
    # then reduce them to the distinct, sorted, non-null sample names.
    general_stats_rows = df.filter(pl.col("anchor") == "general_stats_table")
    sample_series = (
        general_stats_rows.select("sample").drop_nulls().unique().sort("sample")
    )

    # Drop None, "null" strings, and empty strings
    clean_samples = list(filter(_is_real_sample, sample_series["sample"].to_list()))

    print(f"✅ Filtered samples from general_stats_table: {len(clean_samples)}")
    print("\nAll samples:")
    for position, name in enumerate(clean_samples, start=1):
        print(f"{position:2d}. '{name}'")

    print("\n=== COMPARISON WITH UNFILTERED ===")
    # Same clean-up applied to every sample, regardless of anchor
    all_samples = list(filter(_is_real_sample, df['sample'].unique().sort()))
    n_all, n_general = len(all_samples), len(clean_samples)
    print(f"All samples (any anchor): {n_all}")
    print(f"General stats only: {n_general}")
    print(f"Samples filtered out: {n_all - n_general}")

    print("\n=== PYTHON FORMAT FOR DASHBOARD ===")
    print("# Use this in your dashboard code:")
    print(f"AVAILABLE_SAMPLES = {clean_samples}")

    # Save to JSON
    import json
    output_file = 'multiqc_samples_general_stats.json'
    with open(output_file, 'w') as f:
        json.dump(clean_samples, f, indent=2)

    print(f"\n📁 Saved to '{output_file}'")

# MultiQC Parquet Data Exploration

This notebook explores the MultiQC parquet file to understand the data structure and extract useful information for the dashboard automation.

## Setup and Data Loading

In [None]:
import os
from pathlib import Path

import pandas as pd
import polars as pl

# File path
# Defaults to the original test-data location. Set the MULTIQC_PARQUET_PATH
# environment variable to point the notebook at another MultiQC parquet export
# without editing this cell (an empty value falls back to the default).
parquet_path = os.environ.get("MULTIQC_PARQUET_PATH") or (
    "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_fastqc_v1_30_0/multiqc_data/BETA-multiqc.parquet"
)

# Load with Polars (fast)
df = pl.read_parquet(parquet_path)

print(f"✅ Loaded parquet file: {Path(parquet_path).name}")
print(f"📊 Shape: {df.shape}")
print(f"🗂️  Columns: {len(df.columns)}")

## Data Structure Overview

In [None]:
# Basic info: numbered column names, followed by each column's dtype
print("=== COLUMN NAMES ===")
for position, name in enumerate(df.columns, start=1):
    print("{:2d}. {}".format(position, name))

print("\n=== DATA TYPES ===")
for name, dtype in zip(df.columns, df.dtypes):
    # Names are left-aligned and padded to 20 characters so the dtypes line up
    print("{:<20} {}".format(name, dtype))

In [None]:
# Peek at the top of the table; the trailing bare expression is what the notebook renders
print("=== FIRST 5 ROWS ===")
df.head(n=5)

## Sample Analysis

In [None]:
# Sample column analysis: distinct sample names, how many are missing, and
# the spread of name lengths.
if 'sample' in df.columns:
    print("=== SAMPLE COLUMN ANALYSIS ===")
    
    # Unique samples
    unique_samples = df.select('sample').unique().sort('sample')
    sample_list = unique_samples['sample'].to_list()
    
    print(f"Total unique samples: {len(sample_list)}")
    print(f"Samples with None values: {sample_list.count(None)}")
    
    # Filter out None values
    valid_samples = [s for s in sample_list if s is not None]
    print(f"Valid samples (no None): {len(valid_samples)}")
    
    print("\n=== FIRST 10 VALID SAMPLES ===")
    for i, sample in enumerate(valid_samples[:10], 1):
        print(f"{i:2d}. {sample}")
    
    print("\n=== SAMPLE LENGTH DISTRIBUTION ===")
    sample_lengths = [len(str(s)) for s in valid_samples]
    if sample_lengths:
        print(f"Min length: {min(sample_lengths)}")
        print(f"Max length: {max(sample_lengths)}")
        print(f"Average length: {sum(sample_lengths) / len(sample_lengths):.1f}")
    else:
        # min()/max() raise ValueError on an empty list and the average would
        # divide by zero, so report the empty case instead of crashing.
        print("No valid samples - length statistics unavailable")
    
else:
    print("❌ No 'sample' column found!")

In [None]:
# Sample patterns analysis: bucket sample names into numeric, underscore-containing,
# long ATCG-only, and everything else.
def _preview(name, limit=30):
    """Shorten `name` to `limit` characters, adding '...' only when something was cut."""
    return name[:limit] + '...' if len(name) > limit else name


if 'sample' in df.columns:
    print("=== SAMPLE PATTERN ANALYSIS ===")
    
    valid_samples = [s for s in df['sample'].unique() if s is not None]
    
    # Categorize samples by pattern
    numeric_samples = [s for s in valid_samples if s.isdigit()]
    alphanumeric_samples = [s for s in valid_samples if not s.isdigit() and '_' in s]
    dna_like_samples = [s for s in valid_samples if all(c in 'ATCG' for c in s) and len(s) > 20]
    # One set built up front gives O(1) membership tests; the previous form
    # re-concatenated the three lists for every sample.
    categorized = set(numeric_samples) | set(alphanumeric_samples) | set(dna_like_samples)
    other_samples = [s for s in valid_samples if s not in categorized]
    
    print(f"Numeric samples: {len(numeric_samples)}")
    print(f"  Examples: {numeric_samples[:3]}")
    
    print(f"\nAlphanumeric samples (with underscore): {len(alphanumeric_samples)}")
    print(f"  Examples: {alphanumeric_samples[:3]}")
    
    print(f"\nDNA-like samples (long ATCG sequences): {len(dna_like_samples)}")
    # DNA-like names can be 21-30 characters long, so the ellipsis is only
    # added when the name really was truncated (same rule as "Other" below).
    print(f"  Examples: {[_preview(s) for s in dna_like_samples[:3]]}")
    
    print(f"\nOther samples: {len(other_samples)}")
    print(f"  Examples: {[_preview(s) for s in other_samples[:3]]}")

## Data Content Exploration

In [None]:
# Explore key columns: for each one present in `df`, list up to `max_shown`
# distinct values and say how many more were left out.
key_columns = ['anchor', 'type', 'plot_type', 'section_key', 'metric']
max_shown = 10

for col in key_columns:
    if col not in df.columns:
        continue

    unique_vals = df[col].unique().to_list()
    hidden = len(unique_vals) - max_shown

    print(f"\n=== {col.upper()} (unique values: {len(unique_vals)}) ===")
    for val in unique_vals[:max_shown]:
        print(f"  - {val}")
    if hidden > 0:
        print(f"  ... and {hidden} more")

In [None]:
# Value columns analysis: per column, the non-null/null counts and a few distinct values
value_columns = ['val_raw', 'val_mod', 'val_fmt']

for col in value_columns:
    if col not in df.columns:
        continue

    print(f"\n=== {col.upper()} ===")
    non_null_count = df[col].count()
    print(f"Non-null count: {non_null_count}")
    print(f"Null count: {df.shape[0] - non_null_count}")

    # Sample values: up to five distinct entries taken from the non-null rows
    rows_with_value = df.filter(pl.col(col).is_not_null())
    non_null_values = rows_with_value[col].unique().to_list()[:5]
    print(f"Sample values: {non_null_values}")

## Sample-Specific Data

In [None]:
# Filter data for our test sample
test_sample = '00050101'

# Holds the preview frame so it can be the cell's final top-level expression:
# a bare expression nested inside an `if` block is evaluated but never rendered
# by the notebook. Stays None (renders nothing) when there is nothing to show.
sample_preview = None

if 'sample' in df.columns:
    sample_data = df.filter(pl.col('sample') == test_sample)
    
    print(f"=== DATA FOR SAMPLE: {test_sample} ===")
    print(f"Rows for this sample: {sample_data.shape[0]}")
    
    if sample_data.shape[0] > 0:
        # 'metric' is optional, like the other columns this notebook probes
        if 'metric' in sample_data.columns:
            print("\n=== METRICS FOR THIS SAMPLE ===")
            metrics = sample_data['metric'].unique().to_list()
            for metric in metrics[:10]:
                print(f"  - {metric}")
        
        print("\n=== SAMPLE DATA PREVIEW ===")
        # Select only the preview columns that exist, so a missing one does not raise
        preview_columns = [
            c for c in ['anchor', 'metric', 'val_raw', 'val_fmt']
            if c in sample_data.columns
        ]
        sample_preview = sample_data.select(preview_columns).head(5)
    else:
        print(f"❌ No data found for sample {test_sample}")

sample_preview

## Export Sample List for Dashboard

In [None]:
# Export the sorted, None-free sample list for use in the dashboard
if 'sample' in df.columns:
    # Distinct sample names in sorted order, with missing entries removed
    sorted_unique = df['sample'].unique().sort()
    clean_samples = [name for name in sorted_unique if name is not None]
    sample_total = len(clean_samples)

    print("=== FINAL SAMPLE LIST FOR DASHBOARD ===")
    print(f"Total samples: {sample_total}")
    print("\nFirst 10 samples:")
    for rank, name in enumerate(clean_samples[:10], start=1):
        print(f"{rank:2d}. '{name}'")

    print("\n=== PYTHON LIST FORMAT ===")
    print("# Copy this for your dashboard:")
    print(f"SAMPLES = {clean_samples[:20]}  # First 20 samples")

    # Save the full list (not just the 20 printed above) to file
    import json
    output_file = 'multiqc_samples.json'
    with open(output_file, 'w') as f:
        json.dump(clean_samples, f, indent=2)

    print(f"\n✅ Saved {sample_total} samples to '{output_file}'")

## Advanced Analysis

In [None]:
# Convert to Pandas for more detailed exploration if needed
df_pandas = df.to_pandas()

print("=== PANDAS DATAFRAME INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() added a stray "None" line after the report.
df_pandas.info()

In [None]:
# Sample correlation with metrics
# Holds the table to render. It is the cell's final top-level expression because
# a bare expression nested inside the `if` block is never displayed by the
# notebook. Stays None (renders nothing) when the required columns are absent.
top_sample_metric_counts = None

if 'sample' in df.columns and 'metric' in df.columns:
    print("=== SAMPLE-METRIC RELATIONSHIPS ===")
    
    # Cross-tabulation: number of rows per (sample, metric) pair, largest first.
    # NOTE(review): recent polars releases appear to deprecate GroupBy.count() in
    # favour of .len() (which names the column 'len') -- confirm against the
    # installed version before switching.
    sample_metric_counts = df.group_by(['sample', 'metric']).count().sort('count', descending=True)
    
    print("Top sample-metric combinations:")
    top_sample_metric_counts = sample_metric_counts.head(10)

top_sample_metric_counts

In [None]:
# Summary statistics
def _distinct_count(column):
    """Number of distinct values in `column`, or 'N/A' when `df` lacks that column."""
    if column in df.columns:
        return df[column].n_unique()
    return 'N/A'


n_rows, n_cols = df.shape

print("=== SUMMARY STATISTICS ===")
print(f"📊 Total rows: {n_rows:,}")
print(f"🗂️  Total columns: {n_cols}")
print(f"🏷️  Unique samples: {_distinct_count('sample')}")
print(f"📈 Unique metrics: {_distinct_count('metric')}")
print(f"🎯 Unique plot types: {_distinct_count('plot_type')}")

# Memory usage: converted from bytes to MiB when the frame exposes estimated_size()
if hasattr(df, 'estimated_size'):
    memory_mb = df.estimated_size() / (1024 * 1024)
else:
    memory_mb = 'Unknown'

if isinstance(memory_mb, float):
    print(f"💾 Estimated memory: {memory_mb:.2f} MB")
else:
    print(f"💾 Memory: {memory_mb}")