In [None]:
# Test if previously unsuccessful anchors are now working
print("🔍 Testing previously unsuccessful anchors with updated extractor:")
print("="*60)

# Test the first few anchors that were in our unsuccessful list
test_anchors_to_check = ['librarian-library-type-plot', 'somalier_relatedness_plot', 'somalier_ancestry_pca_plot']

# Anchors for which extraction yields at least one sample and one data point.
# Collected so the closing summary reports what actually happened instead of
# asserting success unconditionally.
now_working_anchors = []

for anchor in test_anchors_to_check:
    print(f"\n📊 Testing: {anchor}")

    # Get the stored payload for this anchor
    anchor_df = plot_input_df.filter(pl.col("anchor") == anchor)
    if anchor_df.shape[0] == 0:
        print(f"❌ No data found for {anchor}")
        continue

    plot_data_raw = anchor_df.select("plot_input_data").head(1).to_series().to_list()[0]
    if not plot_data_raw:
        # Report the empty payload explicitly so the anchor does not vanish
        # from the output without explanation.
        print(f"❌ No plot_input_data for {anchor}")
        continue

    # Broad catch is deliberate: this is a best-effort diagnostic loop and one
    # failing anchor must not stop the remaining checks.
    try:
        raw_json = json.loads(plot_data_raw)

        # Test with updated extractor
        data_type, structure_info = extractor._analyze_json_structure(raw_json)
        samples, data_points = extractor._extract_samples_from_json(raw_json, data_type)

        if len(samples) > 0 and len(data_points) > 0:
            now_working_anchors.append(anchor)
            print(f"✅ NOW WORKING! Type: {data_type.value}")
            print(f"   Samples: {len(samples)}, Data points: {len(data_points)}")
            print(f"   Structure: {structure_info.get('structure', 'unknown')}")

            # Show a few sample data points
            print(f"   Sample data points:")
            for i, point in enumerate(data_points[:3]):
                print(f"     {i+1}. {point}")
        else:
            print(f"❌ Still not working - Type: {data_type.value}")
            print(f"   Samples: {len(samples)}, Data points: {len(data_points)}")

    except Exception as e:
        print(f"❌ Error: {e}")

print(f"\n🎯 Summary: {len(now_working_anchors)}/{len(test_anchors_to_check)} tested anchors now extract successfully.")
if now_working_anchors:
    print(f"   Now working: {now_working_anchors}")

In [None]:
# SIMPLE EXTRACTION TEST - Can we extract data with basic patterns?
def test_basic_extraction(anchor_name):
    """Flatten an anchor's data structure into a list of data-point dicts.

    The structure is obtained from ``get_data_structure(anchor_name)`` and
    matched against two shapes:

    * a dict keyed by sample: dict values become one ``'table_cell'`` point per
      inner key, int/float values become a single ``'sample_value'`` point,
      and list values become one ``'array_item'`` point per element (values of
      any other type are skipped);
    * a list of records: every dict record contributes one ``'record_field'``
      point per key other than ``'sample'``, ``'name'`` and ``'id'``; the
      sample label is the first truthy value among those three keys, falling
      back to ``'unknown'``. Non-dict records are skipped.

    Returns:
        The list of extracted point dicts (possibly empty), or ``None`` when
        ``get_data_structure`` returns ``None``.
    """
    payload = get_data_structure(anchor_name)
    if payload is None:
        return None

    identifier_fields = ('sample', 'name', 'id')
    points = []

    if isinstance(payload, dict):
        # Shape 1: sample name -> content
        for sample_name, content in payload.items():
            if isinstance(content, dict):
                points.extend(
                    {
                        'sample': sample_name,
                        'metric': metric_name,
                        'value': cell,
                        'type': 'table_cell',
                    }
                    for metric_name, cell in content.items()
                )
            elif isinstance(content, (int, float)):
                points.append({
                    'sample': sample_name,
                    'value': content,
                    'type': 'sample_value',
                })
            elif isinstance(content, list):
                points.extend(
                    {
                        'sample': sample_name,
                        'index': position,
                        'value': element,
                        'type': 'array_item',
                    }
                    for position, element in enumerate(content)
                )

    elif isinstance(payload, list):
        # Shape 2: list of record dicts
        for entry in payload:
            if not isinstance(entry, dict):
                continue

            owner = entry.get('sample') or entry.get('name') or entry.get('id') or 'unknown'
            points.extend(
                {
                    'sample': owner,
                    'metric': field_name,
                    'value': field_value,
                    'type': 'record_field',
                }
                for field_name, field_value in entry.items()
                if field_name not in identifier_fields
            )

    return points

# Test extraction on unsuccessful anchors
print("BASIC EXTRACTION TEST:")
print("="*30)

# anchor -> list of extracted points (or None when nothing could be loaded)
extraction_results = {}

for anchor in test_anchors:
    extracted = test_basic_extraction(anchor)
    extraction_results[anchor] = extracted

    if extracted:
        print(f"\n✅ {anchor}: Extracted {len(extracted)} data points")
        # Show first few points
        for point in extracted[:3]:
            print(f"   {point}")
    else:
        print(f"\n❌ {anchor}: No data extracted")

# Summary
# A truthy result already means "not None and not empty".
successful_extractions = [anchor for anchor, data in extraction_results.items() if data]
total_tested = len(test_anchors)
print(f"\n📊 BASIC EXTRACTION RESULTS:")
print(f"   Successfully extracted: {len(successful_extractions)}/{total_tested}")
if total_tested:
    print(f"   Success rate: {len(successful_extractions)/total_tested*100:.1f}%")
else:
    # Avoid a ZeroDivisionError when there was nothing to test.
    print("   Success rate: n/a (no anchors tested)")

if successful_extractions:
    print(f"\n✅ Successfully extracted from:")
    for anchor in successful_extractions:
        count = len(extraction_results[anchor])
        print(f"   - {anchor} ({count} points)")

In [None]:
# SHOW ACTUAL DATA SAMPLES - What can we extract?
def show_extractable_data(anchor_name):
    """Print a short preview of the data structure stored for an anchor.

    The structure comes from ``get_data_structure(anchor_name)``. For a dict
    the first three entries are described (type plus a hint of the content);
    for a list the first item is described; anything else is printed as a
    string truncated to 100 characters. Nothing is returned.
    """
    payload = get_data_structure(anchor_name)
    print(f"\n=== {anchor_name} ===")

    if payload is None:
        print("No data found")
        return

    if isinstance(payload, dict):
        print(f"Dict with {len(payload)} keys")

        # Preview at most three entries
        for entry_name in list(payload)[:3]:
            entry = payload[entry_name]
            print(f"  '{entry_name}': {type(entry).__name__}")

            if isinstance(entry, dict):
                inner_names = list(entry)[:3]
                print(f"    -> dict keys: {inner_names}")
            elif isinstance(entry, list):
                print(f"    -> list with {len(entry)} items")
                if len(entry) > 0:
                    print(f"    -> first item: {type(entry[0]).__name__}")
            elif isinstance(entry, (int, float)):
                print(f"    -> value: {entry}")
            elif isinstance(entry, str):
                # Long text is cut to 30 characters and marked with an ellipsis
                if len(entry) > 30:
                    print(f"    -> text: '{entry[:30]}...' ")
                else:
                    print(f"    -> text: '{entry}'")
        return

    if isinstance(payload, list):
        print(f"List with {len(payload)} items")
        if len(payload) == 0:
            return

        head_item = payload[0]
        print(f"  First item type: {type(head_item).__name__}")

        if isinstance(head_item, dict):
            shown_keys = list(head_item)[:5]
            print(f"  Item keys: {shown_keys}")
            # Show the values of the first two keys only
            for field_name in shown_keys[:2]:
                print(f"    '{field_name}': {head_item[field_name]}")
        return

    print(f"Direct {type(payload).__name__}: {str(payload)[:100]}")

# Preview the data structure of the first five unsuccessful anchors.
# `test_anchors` is bound at module level here and is also read by the
# basic-extraction cell, so this cell has to run before that one.
test_anchors = unsuccessful_anchors[:5]  # Define test_anchors here
print("EXTRACTABLE DATA ANALYSIS:")
print("="*40)

# One preview report per anchor (printed by show_extractable_data)
for anchor in test_anchors:
    show_extractable_data(anchor)

In [None]:
# SIMPLE PATTERN ANALYSIS - Focus on extractable data structures
import json

def get_data_structure(anchor_name):
    """Return the core data payload stored for an anchor.

    Reads the first ``plot_input_data`` value for ``anchor_name`` from the
    module-level ``plot_input_df`` and parses it as JSON. When the parsed
    value is a dict, its ``'data'`` entry is returned if present, otherwise
    its ``'datasets'`` entry, otherwise the dict itself. Non-dict JSON is
    returned unchanged.

    Returns:
        The payload described above, or ``None`` when the anchor has no rows,
        the stored value is empty, or it is not valid JSON.
    """
    matching_rows = plot_input_df.filter(pl.col("anchor") == anchor_name)
    if matching_rows.shape[0] == 0:
        return None

    serialized = matching_rows.select("plot_input_data").head(1).to_series().to_list()[0]
    if not serialized:
        return None

    try:
        parsed = json.loads(serialized)
    except json.JSONDecodeError:
        return None

    if not isinstance(parsed, dict):
        return parsed

    # Unwrap the actual data part; 'data' takes precedence over 'datasets'
    for wrapper_key in ('data', 'datasets'):
        if wrapper_key in parsed:
            return parsed[wrapper_key]
    return parsed

# Check 10 unsuccessful anchors for their core data patterns
print("CORE DATA PATTERNS IN UNSUCCESSFUL ANCHORS:")
print("="*50)

# pattern label -> anchors whose data structure matches it
patterns_found = {}

for anchor in unsuccessful_anchors[:10]:
    data_structure = get_data_structure(anchor)
    if data_structure is None:
        continue

    # Classify the pattern. Every branch assigns pattern_type, so an empty
    # container can neither raise NameError nor silently reuse the label
    # computed for the previous anchor.
    if isinstance(data_structure, dict):
        if not data_structure:
            pattern_type = "empty dict"
        else:
            # The first value decides between sample-based and matrix-like
            first_value = next(iter(data_structure.values()))
            if isinstance(first_value, dict):
                pattern_type = "dict -> dict (table-like)"
            elif isinstance(first_value, list):
                pattern_type = "dict -> list (series data)"
            elif isinstance(first_value, (int, float)):
                pattern_type = "dict -> number (sample values)"
            else:
                pattern_type = f"dict -> {type(first_value).__name__}"

    elif isinstance(data_structure, list):
        if not data_structure:
            pattern_type = "empty list"
        else:
            first_item = data_structure[0]
            if isinstance(first_item, dict):
                item_keys = list(first_item.keys())
                pattern_type = f"list of dicts (keys: {item_keys[:3]})"
            else:
                pattern_type = f"list of {type(first_item).__name__}"
    else:
        pattern_type = f"direct {type(data_structure).__name__}"

    patterns_found.setdefault(pattern_type, []).append(anchor)

    print(f"{anchor}: {pattern_type}")

print(f"\nPATTERN SUMMARY:")
for pattern, anchors in patterns_found.items():
    print(f"  {pattern}: {len(anchors)} anchors")
    print(f"    Examples: {anchors[:3]}")
    print()

In [None]:
# Quick analysis tool: check specific anchor by name
def quick_analyze(anchor_name):
    """Quick analysis of a specific anchor.

    If ``anchor_name`` is not in the module-level ``unsuccessful_anchors``
    list, a warning is printed and the successful-extractions CSV is read; when
    the anchor is listed there, that is reported and the function stops.
    In every other case the anchor is passed to
    ``analyze_unsuccessful_anchor``, which prints the detailed report.

    Args:
        anchor_name: Value of the ``anchor`` column to look at.

    Returns:
        None. All results are printed.
    """
    if anchor_name not in unsuccessful_anchors:
        print(f"⚠️ {anchor_name} is not in unsuccessful list - it may have been successfully extracted!")
        # Check if it's in successful ones
        # NOTE(review): absolute, machine-specific path; the read fails if the
        # report has not been generated at this location.
        successful_file = "/Users/tweber/Gits/workspaces/depictio-workspace/depictio/dev/dev-multiqc-parquet/detailed_extraction_reports/successful_extractions/successful_extractions_list.csv"
        successful_df = pl.read_csv(successful_file)
        successful_anchors = successful_df.select("anchor").to_series().to_list()
        if anchor_name in successful_anchors:
            print(f"✅ Found {anchor_name} in SUCCESSFUL extractions!")
            return

    analyze_unsuccessful_anchor(anchor_name)

# Interactive analysis - uncomment and modify anchor name as needed
# quick_analyze("somalier_relatedness_plot")

print("Ready for interactive analysis!")
print("Use: quick_analyze('anchor_name') to analyze any specific anchor")
print(f"\nAvailable unsuccessful anchors ({len(unsuccessful_anchors)}):")
for i, anchor in enumerate(unsuccessful_anchors[:10]):
    print(f"{i+1:2d}. {anchor}")

# Only mention a remainder when the list is longer than the ten entries shown;
# otherwise the count would be zero or negative.
remaining_count = len(unsuccessful_anchors) - 10
if remaining_count > 0:
    print("... (and", remaining_count, "more)")

In [None]:
# Deep dive analysis of different anchor types: the first anchor of each
# non-empty category is passed to analyze_unsuccessful_anchor.

# 1. Analyze a table anchor
if table_anchors:
    print("🔍 ANALYZING TABLE ANCHOR:")
    analyze_unsuccessful_anchor(table_anchors[0])

# Separator: blank line, rule, blank line
print("\n" + "="*80 + "\n")

# 2. Analyze a plot anchor
if plot_anchors:
    print("🔍 ANALYZING PLOT ANCHOR:")
    analyze_unsuccessful_anchor(plot_anchors[0])

print("\n" + "="*80 + "\n")

# 3. Analyze a complex viz anchor
if complex_viz_anchors:
    print("🔍 ANALYZING COMPLEX VIZ ANCHOR:")
    analyze_unsuccessful_anchor(complex_viz_anchors[0])

In [None]:
# Let's look at some specific types of unsuccessful anchors.
# Categories are assigned by case-insensitive substring match on the anchor
# name; up to five anchors of each category are listed.

# 1. Check table-like anchors (likely need table extraction patterns)
table_anchors = [a for a in unsuccessful_anchors if 'table' in a.lower()]
print(f"Table-like anchors ({len(table_anchors)}):")
for anchor in table_anchors[:5]:
    print(f"- {anchor}")

print("\n" + "-"*50)

# 2. Check plot-like anchors (names containing 'plot' but not 'table')
plot_anchors = [a for a in unsuccessful_anchors if 'plot' in a.lower() and 'table' not in a.lower()]
print(f"\nPlot-like anchors ({len(plot_anchors)}):")
for anchor in plot_anchors[:5]:
    print(f"- {anchor}")

print("\n" + "-"*50)

# 3. Check heatmap/PCA anchors (likely complex visualization patterns)
# This category is not exclusive: an anchor can also appear in the lists above.
complex_viz_anchors = [a for a in unsuccessful_anchors if any(keyword in a.lower() for keyword in ['heatmap', 'pca', 'scatter'])]
print(f"\nComplex visualization anchors ({len(complex_viz_anchors)}):")
for anchor in complex_viz_anchors[:5]:
    print(f"- {anchor}")

print("\n" + "="*50)
print("Let's deep dive into one of each type...")

In [None]:
# Function to analyze structure of unsuccessful anchors
def analyze_unsuccessful_anchor(anchor_name):
    """Print a structural report for an anchor's stored JSON and retry extraction.

    Reads the first ``plot_input_data`` value for ``anchor_name`` from the
    module-level ``plot_input_df``, parses it as JSON and prints what the
    payload looks like (top-level keys, ``data`` / ``datasets`` content, other
    keys). It then runs the module-level ``extractor``'s structure analysis
    and sample extraction on the parsed JSON and prints the outcome.

    Args:
        anchor_name: Value of the ``anchor`` column to inspect.

    Returns:
        A dict with the keys ``anchor``, ``json_type``, ``structure_info``,
        ``detected_type``, ``samples_found``, ``data_points_found`` and
        ``raw_json_preview``; ``None`` when the anchor has no rows, the stored
        value is empty, or it is not valid JSON.
    """
    # Get the raw JSON data for this anchor
    anchor_df = plot_input_df.filter(pl.col("anchor") == anchor_name)

    if anchor_df.shape[0] == 0:
        print(f"❌ No data found for anchor: {anchor_name}")
        return None

    plot_data_raw = anchor_df.select("plot_input_data").head(1).to_series().to_list()[0]

    if not plot_data_raw:
        print(f"❌ No plot_input_data for anchor: {anchor_name}")
        return None

    # Only the parse itself is guarded, so the "JSON Parse Error" message is
    # tied to the stored string and nothing else.
    try:
        raw_json = json.loads(plot_data_raw)
    except json.JSONDecodeError as e:
        print(f"❌ JSON Parse Error for {anchor_name}: {e}")
        return None

    print(f"\n📊 ANALYZING: {anchor_name}")
    print("=" * 50)

    # Basic structure analysis
    print(f"JSON Type: {type(raw_json).__name__}")

    if isinstance(raw_json, dict):
        print(f"Top-level keys: {list(raw_json.keys())}")

        # Check for common MultiQC patterns
        has_data_key = 'data' in raw_json
        has_datasets_key = 'datasets' in raw_json
        has_anchor_key = 'anchor' in raw_json

        print(f"Has 'data' key: {has_data_key}")
        print(f"Has 'datasets' key: {has_datasets_key}")
        print(f"Has 'anchor' key: {has_anchor_key}")

        # Analyze the main data structure
        if has_data_key:
            data_content = raw_json['data']
            print(f"\n'data' content type: {type(data_content).__name__}")

            if isinstance(data_content, dict):
                print(f"Data keys: {list(data_content.keys())}")
                # Show the first entry as a sample
                if len(data_content) > 0:
                    first_key = next(iter(data_content))
                    print(f"Sample data['{first_key}']: {data_content[first_key]}")

            elif isinstance(data_content, list):
                print(f"Data list length: {len(data_content)}")
                if len(data_content) > 0:
                    print(f"First item type: {type(data_content[0]).__name__}")
                    if isinstance(data_content[0], dict):
                        print(f"First item keys: {list(data_content[0].keys())}")
                    print(f"First item sample: {data_content[0]}")

        if has_datasets_key:
            datasets_content = raw_json['datasets']
            print(f"\n'datasets' content type: {type(datasets_content).__name__}")

            if isinstance(datasets_content, dict):
                print(f"Datasets keys (first 5): {list(datasets_content.keys())[:5]}")
                if len(datasets_content) > 0:
                    first_key = next(iter(datasets_content))
                    sample_value = datasets_content[first_key]
                    print(f"Sample datasets['{first_key}']: {sample_value}")

        # Check for other interesting keys (everything not already reported
        # above or treated as plain metadata)
        known_keys = ['data', 'datasets', 'anchor', 'creation_date', 'plot_type', 'pconfig']
        other_keys = [k for k in raw_json.keys() if k not in known_keys]
        if other_keys:
            print(f"\nOther keys: {other_keys}")
            for key in other_keys[:2]:  # Show first 2 other keys
                print(f"  {key}: {raw_json[key]}")

    elif isinstance(raw_json, list):
        print(f"List length: {len(raw_json)}")
        if len(raw_json) > 0:
            print(f"First item type: {type(raw_json[0]).__name__}")
            print(f"First item sample: {raw_json[0]}")

    # Try to detect why it failed using our extractor logic
    data_type, structure_info = extractor._analyze_json_structure(raw_json)
    print(f"\n🔍 EXTRACTOR ANALYSIS:")
    print(f"Detected type: {data_type.value}")
    print(f"Structure info: {structure_info}")

    # Try extraction
    samples, data_points = extractor._extract_samples_from_json(raw_json, data_type)
    print(f"\n📤 EXTRACTION ATTEMPT:")
    print(f"Samples found: {len(samples)}")
    print(f"Data points: {len(data_points)}")
    if len(samples) > 0:
        print(f"Sample names (first 5): {samples[:5]}")
    if len(data_points) > 0:
        print(f"First data point: {data_points[0]}")

    # Preview: scalars and short payloads are kept as-is; anything whose
    # string form exceeds 200 characters is replaced by a truncated string.
    if isinstance(raw_json, (str, int, float, bool)):
        raw_json_preview = raw_json
    else:
        raw_json_text = str(raw_json)
        if len(raw_json_text) > 200:
            raw_json_preview = raw_json_text[:200] + "..."
        else:
            raw_json_preview = raw_json

    return {
        'anchor': anchor_name,
        'json_type': type(raw_json).__name__,
        'structure_info': structure_info,
        'detected_type': data_type.value,
        'samples_found': len(samples),
        'data_points_found': len(data_points),
        'raw_json_preview': raw_json_preview,
    }

# Let's analyze a few unsuccessful anchors
print("Analyzing first 3 unsuccessful anchors...")

# Report dicts returned by analyze_unsuccessful_anchor; anchors for which it
# returns None are left out.
analyses = []

for anchor in unsuccessful_anchors[:3]:
    result = analyze_unsuccessful_anchor(anchor)
    if result:
        analyses.append(result)
    # Separator between reports: blank line, rule, blank line
    print("\n" + "="*80 + "\n")

In [None]:
# Load the list of anchors whose extraction failed from the detailed report
unsuccessful_file = "/Users/tweber/Gits/workspaces/depictio-workspace/depictio/dev/dev-multiqc-parquet/detailed_extraction_reports/unsuccessful_extractions/unsuccessful_extractions_list.csv"
unsuccessful_df = pl.read_csv(unsuccessful_file)

# Plain Python list of the anchor names, used by the analysis cells
unsuccessful_anchors = unsuccessful_df.get_column("anchor").to_list()

print(f"Total unsuccessful anchors: {unsuccessful_df.height}")
print("\nFirst 10 unsuccessful anchors:")
print(unsuccessful_df.head(10))

print(f"\nSample of unsuccessful anchors:")
for anchor in unsuccessful_anchors[:5]:
    print(f"- {anchor}")

In [None]:
import polars as pl
import json
from multiqc_extractor import MultiQCExtractor, DataType
from pathlib import Path

pl.Config.set_tbl_rows(300)

# Single definition of the report location, so the DataFrame and the extractor
# below are guaranteed to read the same file.
# NOTE(review): absolute, machine-specific path.
multiqc_parquet_path = "/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_complete_v1_30_0/multiqc_data/BETA-multiqc.parquet"

# Load the full dataset
df = pl.read_parquet(multiqc_parquet_path)

# Initialize extractor
extractor = MultiQCExtractor(multiqc_parquet_path)

print(f"Total rows in dataset: {df.shape[0]}")
print(f"Columns: {df.columns}")

# Get plot_input data only
plot_input_df = df.filter(pl.col("type") == "plot_input")
print(f"\nplot_input rows: {plot_input_df.shape[0]}")

# Get unique anchors
unique_anchors = plot_input_df.select("anchor").unique().to_series().to_list()
print(f"Unique plot_input anchors: {len(unique_anchors)}")

# Last expression of the cell: shown as the cell output in the notebook
df.head()

In [None]:
import json 
# Print the plot_type recorded in each unsuccessful anchor's stored JSON.
for unsuccessful_anchor in unsuccessful_anchors:
    # Restrict to plot_input rows, as the other cells do, so a row of another
    # type for the same anchor is never picked up by head(1).
    anchor_rows = df.filter(pl.col("anchor") == unsuccessful_anchor)
    anchor_rows = anchor_rows.filter(pl.col("type") == "plot_input")
    stored_values = anchor_rows.select("plot_input_data").head(1).to_series().to_list()

    # No matching row, or an empty payload: report it and keep going instead
    # of failing on the index or inside json.loads.
    content = stored_values[0] if stored_values else None
    if not content:
        print(f"unsuccessful_anchor : {unsuccessful_anchor}, plot_type : <no plot_input_data>")
        continue

    parsed_content = json.loads(content)
    # plot_type is only meaningful for dict payloads; a missing key yields None
    plot_type = parsed_content.get("plot_type") if isinstance(parsed_content, dict) else None
    print(f"unsuccessful_anchor : {unsuccessful_anchor}, plot_type : {plot_type}")

In [None]:
# Fetch the first stored plot_input_data value for the first unsuccessful anchor.
# NOTE(review): rows are not restricted to type == "plot_input" here and an
# anchor without rows would make the [0] index fail - confirm this is intended.
content = df.filter(pl.col("anchor") == unsuccessful_anchors[0]).select("plot_input_data").head(1).to_series().to_list()[0]
# Last expression of the cell: the parsed JSON is shown as the cell output
json.loads(content)


In [None]:
# Test if previously unsuccessful anchors now yield data with the basic extraction patterns
test_anchors = ['librarian-library-type-plot', 'somalier_relatedness_plot', 'somalier_ancestry_pca_plot']

print("Testing previously unsuccessful anchors:")
for anchor in test_anchors:
    # test_basic_extraction returns a list of data-point dicts (or None when
    # nothing could be loaded for the anchor), not a (success, result) pair,
    # so success is derived from that list here.
    extracted = test_basic_extraction(anchor)
    success = bool(extracted)
    print(f"\n{anchor}: {'✅ SUCCESS' if success else '❌ FAILED'}")
    if success:
        # Every extracted point carries 'sample' and 'type' keys; str() keeps
        # the set construction safe for non-hashable sample labels.
        sample_names = {str(point['sample']) for point in extracted}
        point_types = sorted({point['type'] for point in extracted})
        print(f"  - Samples: {len(sample_names)}")
        print(f"  - Data points: {len(extracted)}")
        print(f"  - Data type: {', '.join(point_types)}")
    else:
        if extracted is None:
            reason = "no data could be loaded for this anchor"
        else:
            reason = "data loaded but no data points matched the basic patterns"
        print(f"  - Reason: {reason}")

In [None]:
# Updated extractor with heatmap pattern support
from multiqc_extractor import MultiQCExtractor

# Build a fresh extractor so the latest code changes are picked up
extractor = MultiQCExtractor("/Users/tweber/Gits/workspaces/MultiQC-MegaQC/MultiQC_TestData/multiqc_output_complete_v1_30_0/multiqc_data/BETA-multiqc.parquet")

# Run the full extraction and pick out the heatmap anchor
anchor = "librarian-library-type-plot"
test_data = extractor.extract_plot_input_data()

# First extraction result whose anchor matches; stays None when there is none
librarian_data = None
for data in test_data:
    if data.anchor != anchor:
        continue
    librarian_data = data
    break

if not librarian_data:
    print("❌ librarian-library-type-plot not found in extraction results")
else:
    print(f"✅ librarian-library-type-plot extraction:")
    print(f"  - Data type: {librarian_data.data_type.value}")
    print(f"  - Samples found: {len(librarian_data.sample_names)}")
    print(f"  - Sample names: {librarian_data.sample_names}")
    print(f"  - Data points: {len(librarian_data.data_points)}")
    print(f"  - First few data points:")
    # Limit the listing to five data points
    for dp in librarian_data.data_points[:5]:
        print(f"    {dp}")

In [None]:
# Inspect the raw JSON stored for the librarian anchor and run the
# extractor's structure analysis on it
anchor = "librarian-library-type-plot"
plot_df = extractor.df.filter((pl.col("anchor") == anchor) & (pl.col("type") == "plot_input"))

if plot_df.shape[0] == 0:
    print("No plot_input data found for this anchor")
else:
    plot_data_raw = plot_df.select("plot_input_data").head(1).to_series().to_list()[0]

    if plot_data_raw:
        import json
        raw_json = json.loads(plot_data_raw)

        print("Raw JSON structure for librarian-library-type-plot:")
        print(f"Top level keys: {list(raw_json.keys())}")
        # Presence check for the three keys examined below
        for category_key in ('rows', 'xcats', 'ycats'):
            print(f"Has '{category_key}': {category_key in raw_json}")

        if 'rows' in raw_json:
            rows_value = raw_json['rows']
            print(f"Rows type: {type(rows_value)}, length: {len(rows_value)}")
            print(f"First row: {rows_value[0] if rows_value else 'None'}")

        if 'xcats' in raw_json:
            # Only the first five entries are shown
            print(f"Xcats: {raw_json['xcats'][:5]}...")

        if 'ycats' in raw_json:
            print(f"Ycats: {raw_json['ycats']}")

        # Call the structure analysis directly on the parsed JSON
        from multiqc_extractor import DataType
        data_type, metadata = extractor._analyze_json_structure(raw_json)
        print(f"\nAnalyzed data type: {data_type.value}")
        print(f"Metadata: {metadata}")

In [None]:
# Re-check previously unsuccessful anchors against the updated extractor
print("🔍 Testing previously unsuccessful anchors with updated extractor:")
print("="*60)

# Anchors taken from the head of the unsuccessful list
test_anchors_to_check = ['librarian-library-type-plot', 'somalier_relatedness_plot', 'somalier_ancestry_pca_plot']

for anchor in test_anchors_to_check:
    print(f"\n📊 Testing: {anchor}")

    # Rows stored for this anchor
    anchor_df = plot_input_df.filter(pl.col("anchor") == anchor)
    if anchor_df.shape[0] == 0:
        print(f"❌ No data found for {anchor}")
        continue

    plot_data_raw = anchor_df.select("plot_input_data").head(1).to_series().to_list()[0]
    if not plot_data_raw:
        # Empty payload: nothing is printed for this anchor
        continue

    try:
        raw_json = json.loads(plot_data_raw)

        # Run the extractor's analysis and extraction on the parsed JSON
        data_type, structure_info = extractor._analyze_json_structure(raw_json)
        samples, data_points = extractor._extract_samples_from_json(raw_json, data_type)

        if len(samples) == 0 or len(data_points) == 0:
            print(f"❌ Still not working - Type: {data_type.value}")
            print(f"   Samples: {len(samples)}, Data points: {len(data_points)}")
        else:
            print(f"✅ NOW WORKING! Type: {data_type.value}")
            print(f"   Samples: {len(samples)}, Data points: {len(data_points)}")
            print(f"   Structure: {structure_info.get('structure', 'unknown')}")

            # List the first two data points
            print(f"   Sample data points:")
            for i, point in enumerate(data_points[:2]):
                print(f"     {i+1}. {point}")

    except Exception as e:
        print(f"❌ Error: {e}")

print(f"\n🎯 Summary: Testing if the extractor improvements are working!")