# Identifying Missing Data

This notebook scans a data folder and identifies missing parameter combinations, assuming that every combination of the expected parameter values should be present.

## Parsing filenames

In [36]:

import re
import os

# no_silence_parsed_counts = 0 # Keeping track of which pattern was matched
# silence_parsed_counts = 0 # Keeping track of which pattern was matched

# Regex for EndSim result filenames. The trailing silence block is optional so
# that both the old format (no silence parameters) and the new format parse.
pattern = (
    # Groups 1-2: epsM, epsSD
    r"EndSim_epsM(-?[\d\.]+)_epsSD(-?[\d\.]+)"
    # Groups 3-5: OpD, OpM, OpSD
    r"___OpD([a-zA-Z-]+)_OpM(-?[\d\.]+)_OpSD(-?[\d\.]+)"
    # Groups 6-8: Net, NAgents, RS
    r"___Net([a-zA-Z-]+)___NAgents(\d+)___RS(\d+)"
    # Groups 9-13: MedInfF, MedD, MedN, MedM, MedSD
    r"__MedInfF([\d\.]+)___MedD([a-zA-Z-]+)_MedN(\d+)_MedM(-?[\d\.]+)_MedSD(-?[\d\.]+)"
    # Group 14: the whole optional silence block; groups 15-18: SA, ST, SDO, SBB
    r"(_SA(-?[\d\.]+)_ST(-?[\d\.]+)_SDO(-?[\d\.]+)_SBB([a-zA-Z]+))?"
)


def parse_filename(filename):
    """Extract the parameters encoded in an EndSim result filename.

    The file extension is stripped, then ``pattern`` is searched for in the
    remaining name.

    Args:
        filename: File name (not a path), e.g. ``"EndSim_epsM0.01_....csv"``.

    Returns:
        A dict with keys ``epsM``, ``epsSD``, ``OpD``, ``OpM``, ``OpSD``,
        ``NetworkType``, ``NAgents``, ``RandomSeed``, ``MedInfF``, ``MedD``,
        ``MedN``, ``MedM``, ``MedSD``, ``Silence_Alpha``, ``Silence_Tau``,
        ``Silence_Delta0`` and ``SilenceByBoundary``. The four silence values
        are ``None`` when the filename has no silence block.
        Returns ``None`` when the filename does not match ``pattern`` or when
        a matched numeric token is not a valid number.
    """
    # Remove the extension (e.g. ".csv") before parsing
    base_name = os.path.splitext(filename)[0]
    match = re.search(pattern, base_name)
    if match is None:
        return None

    try:
        result = {
            'epsM': float(match.group(1)),
            'epsSD': float(match.group(2)),
            'OpD': match.group(3),
            'OpM': float(match.group(4)),
            'OpSD': float(match.group(5)),
            'NetworkType': match.group(6),
            'NAgents': int(match.group(7)),
            'RandomSeed': int(match.group(8)),
            'MedInfF': float(match.group(9)),
            'MedD': match.group(10),
            'MedN': int(match.group(11)),
            'MedM': float(match.group(12)),
            'MedSD': float(match.group(13)),
        }
        if match.group(14):  # the entire silence block exists
            result.update({
                'Silence_Alpha': float(match.group(15)),
                'Silence_Tau': float(match.group(16)),
                'Silence_Delta0': float(match.group(17)),
                'SilenceByBoundary': match.group(18).lower() == 'true',
            })
        else:
            # Old-format filename: keep the keys but mark the values as absent
            result.update({
                'Silence_Alpha': None,
                'Silence_Tau': None,
                'Silence_Delta0': None,
                'SilenceByBoundary': None,
            })
    except ValueError:
        # [\d\.]+ also matches tokens float() rejects (e.g. "0.1." or "1.2.3");
        # treat those as unparseable instead of crashing the caller.
        return None

    return result

# Smoke-test the parser on three sample names: one without a silence block,
# one with a silence block, and one with a silence block and a negative SDO.
test_filenames = [
    "EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1.csv",
    "EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1_SA0.1_ST1_SDO0.75_SBBfalse.csv",
    "EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1_SA0.1_ST1_SDO-0.75_SBBtrue.csv"
]

for filename in test_filenames:
    params = parse_filename(filename)
    # One call emits both lines; the trailing "\n" leaves a blank line between entries
    print(f"Filename: {filename}", f"Parsed: {params}\n", sep="\n")

Filename: EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1.csv
Parsed: {'epsM': 0.01, 'epsSD': 0.1, 'OpD': 'normal', 'OpM': 0.0, 'OpSD': 0.8, 'NetworkType': 'Scale-free', 'NAgents': 1000, 'RandomSeed': 1, 'MedInfF': 0.5, 'MedD': 'deterministic-normal', 'MedN': 10, 'MedM': 0.75, 'MedSD': 0.1, 'Silence_Alpha': None, 'Silence_Tau': None, 'Silence_Delta0': None, 'SilenceByBoundary': None}

Filename: EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1_SA0.1_ST1_SDO0.75_SBBfalse.csv
Parsed: {'epsM': 0.01, 'epsSD': 0.1, 'OpD': 'normal', 'OpM': 0.0, 'OpSD': 0.8, 'NetworkType': 'Scale-free', 'NAgents': 1000, 'RandomSeed': 1, 'MedInfF': 0.5, 'MedD': 'deterministic-normal', 'MedN': 10, 'MedM': 0.75, 'MedSD': 0.1, 'Silence_Alpha': 0.1, 'Silence_Tau': 1.0, 'Silence_Delta0': 0.75, 'SilenceByBoundary': Fa

In [40]:
import re
import os
import pandas as pd

# Regex for EndSim result filenames (identical to the pattern defined in the
# parsing cell above). The trailing silence block is optional.
pattern = (
    # Groups 1-2: epsM, epsSD
    r"EndSim_epsM(-?[\d\.]+)_epsSD(-?[\d\.]+)"
    # Groups 3-5: OpD, OpM, OpSD
    r"___OpD([a-zA-Z-]+)_OpM(-?[\d\.]+)_OpSD(-?[\d\.]+)"
    # Groups 6-8: Net, NAgents, RS
    r"___Net([a-zA-Z-]+)___NAgents(\d+)___RS(\d+)"
    # Groups 9-13: MedInfF, MedD, MedN, MedM, MedSD
    r"__MedInfF([\d\.]+)___MedD([a-zA-Z-]+)_MedN(\d+)_MedM(-?[\d\.]+)_MedSD(-?[\d\.]+)"
    # Group 14: the whole optional silence block; groups 15-18: SA, ST, SDO, SBB
    r"(_SA(-?[\d\.]+)_ST(-?[\d\.]+)_SDO(-?[\d\.]+)_SBB([a-zA-Z]+))?"
)


def parse_filename_with_detailed_debug(filename):
    """Parse an EndSim filename and print diagnostics about its silence block.

    Args:
        filename: File name (not a path); its extension is stripped before
            matching against ``pattern``.

    Returns:
        A dict with the parsed parameters plus ``has_silence_params`` (bool)
        and ``filename`` (the original argument). The four ``Silence*`` values
        are ``None`` when no silence block was matched.
        Returns ``None`` (after printing the reason) when the filename does
        not match ``pattern`` or a matched numeric token is not a valid
        number, so callers can record the file as failed.

    Side effects:
        Prints the raw silence tokens when the block is present; otherwise
        calls ``debug_silence_detection`` on the extension-less name.
    """
    base_name = os.path.splitext(filename)[0]
    match = re.search(pattern, base_name)

    if not match:
        # Name the file: in a batch run the message is useless without it
        print(f"   ❌ NO MATCH with main pattern: {filename}")
        return None

    # Group 14 is the whole optional silence block; None means it was absent
    has_silence = match.group(14) is not None

    try:
        result = {
            'epsM': float(match.group(1)),
            'epsSD': float(match.group(2)),
            'OpD': match.group(3),
            'OpM': float(match.group(4)),
            'OpSD': float(match.group(5)),
            'NetworkType': match.group(6),
            'NAgents': int(match.group(7)),
            'RandomSeed': int(match.group(8)),
            'MedInfF': float(match.group(9)),
            'MedD': match.group(10),
            'MedN': int(match.group(11)),
            'MedM': float(match.group(12)),
            'MedSD': float(match.group(13)),
            'has_silence_params': has_silence,
            'filename': filename,
        }
        if has_silence:
            result.update({
                'Silence_Alpha': float(match.group(15)),
                'Silence_Tau': float(match.group(16)),
                'Silence_Delta0': float(match.group(17)),
                'SilenceByBoundary': match.group(18).lower() == 'true',
            })
        else:
            result.update({
                'Silence_Alpha': None,
                'Silence_Tau': None,
                'Silence_Delta0': None,
                'SilenceByBoundary': None,
            })
    except ValueError as exc:
        # [\d\.]+ also matches tokens float() rejects (e.g. "0.1." or "1.2.3");
        # report the file as unparseable instead of aborting a directory scan.
        print(f"   ❌ MALFORMED NUMBER in {filename}: {exc}")
        return None

    if has_silence:
        print(f"      SA: {match.group(15)}, ST: {match.group(16)}, SDO: {match.group(17)}, SBB: {match.group(18)}")
    else:
        # Explain why the silence block was not detected
        debug_silence_detection(base_name)

    return result

def debug_silence_detection(base_name):
    """Print diagnostics explaining why no silence block was detected.

    Each silence component (SA, ST, SDO, SBB) and the full block are searched
    for individually, then whatever follows ``_MedSD`` in the name is shown.

    Args:
        base_name: Filename with its extension already removed.

    Returns:
        None. All results are printed.
    """
    print("   🔧 Debugging silence detection...")

    # Test individual components. Numeric parts accept an optional leading
    # minus sign, matching the main filename pattern (e.g. "_SDO-0.75").
    silence_patterns = {
        'Full silence block': r"_SA-?[\d\.]+_ST-?[\d\.]+_SDO-?[\d\.]+_SBB[a-zA-Z]+",
        'SA only': r"_SA-?[\d\.]+",
        'ST only': r"_ST-?[\d\.]+",
        'SDO only': r"_SDO-?[\d\.]+",
        'SBB only': r"_SBB[a-zA-Z]+"
    }

    for pattern_name, test_pattern in silence_patterns.items():
        match = re.search(test_pattern, base_name)
        if match:
            print(f"      ✅ {pattern_name}: FOUND -> {match.group()}")
        else:
            print(f"      ❌ {pattern_name}: NOT FOUND")

    # Check what comes after MedSD
    medsd_section = base_name.split('_MedSD')
    if len(medsd_section) > 1:
        after_medsd = medsd_section[1]
        print(f"   📍 After _MedSD: '{after_medsd}'")

        # Check if there's anything that looks like silence parameters but with different formatting
        if any(x in after_medsd for x in ['SA', 'ST', 'SDO', 'SBB']):
            print("   💡 Found potential silence parameter abbreviations in the filename!")

        # Show the exact string that should match the silence pattern:
        # everything up to the next "___" separator (or all of it if none).
        silence_match_candidate = after_medsd.split('___')[0]
        print(f"   🔍 String that should match silence pattern: '{silence_match_candidate}'")

def analyze_directory_comprehensive(directory):
    """Parse every ``.csv`` filename in ``directory`` and print a summary report.

    Files are processed in sorted name order so the report and the returned
    lists are reproducible between runs.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A 4-tuple ``(df, files_with_silence, files_without_silence, failed_files)``.
        ``df`` is a DataFrame with one row per successfully parsed file, or
        ``None`` when nothing parsed. The other three are lists of filenames.
    """
    # os.listdir() returns entries in arbitrary order; sort for reproducibility
    csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))
    print(f"📁 Found {len(csv_files)} CSV files in directory: {directory}")

    all_params = []
    files_with_silence = []
    files_without_silence = []
    failed_files = []

    for filename in csv_files:
        params = parse_filename_with_detailed_debug(filename)

        if not params:
            failed_files.append(filename)
            continue

        all_params.append(params)
        if params['has_silence_params']:
            files_with_silence.append(filename)
        else:
            files_without_silence.append(filename)

    # Generate comprehensive report
    print(f"\n{'='*80}")
    print("📊 COMPREHENSIVE ANALYSIS REPORT")
    print(f"{'='*80}")

    print("\n📈 SUMMARY STATISTICS:")
    print(f"   Total files: {len(csv_files)}")
    print(f"   Successfully parsed: {len(all_params)}")
    print(f"   Files with silence parameters: {len(files_with_silence)}")
    print(f"   Files without silence parameters: {len(files_without_silence)}")
    print(f"   Failed to parse: {len(failed_files)}")

    if files_without_silence:
        print("\n🔍 FILES WITHOUT SILENCE PARAMETERS (that should have them):")
        print(f"   Count: {len(files_without_silence)}")
        print("   Files:")
        for i, filename in enumerate(files_without_silence, 1):
            print(f"     {i:2d}. {filename}")

            # Show the problematic part of the filename: the text after
            # "_MedSD", up to the next "___" separator if there is one.
            base_name = os.path.splitext(filename)[0]
            medsd_section = base_name.split('_MedSD')
            if len(medsd_section) > 1:
                after_medsd = medsd_section[1].split('___')[0]
                print(f"         After MedSD: '{after_medsd}'")

    if failed_files:
        print("\n❌ FILES THAT FAILED TO PARSE:")
        for i, filename in enumerate(failed_files, 1):
            print(f"   {i:2d}. {filename}")

    # Convert to DataFrame for further analysis
    if all_params:
        df = pd.DataFrame(all_params)

        # Analyze silence parameters
        if 'Silence_Alpha' in df.columns:
            silence_stats = df[['Silence_Alpha', 'Silence_Tau', 'Silence_Delta0', 'SilenceByBoundary']].describe()
            print("\n📊 SILENCE PARAMETER STATISTICS:")
            print(silence_stats)

        return df, files_with_silence, files_without_silence, failed_files

    return None, files_with_silence, files_without_silence, failed_files

# Test specific problematic files
def test_specific_files(file_list, directory):
    """Re-run the debug parser on selected files that exist in ``directory``.

    Args:
        file_list: Filenames to inspect.
        directory: Folder the files are expected to be in.

    Names that are not found on disk are reported and skipped.
    """
    separator = '=' * 80
    print(f"\n{separator}")
    print("🧪 TESTING SPECIFIC PROBLEMATIC FILES")
    print(separator)

    for filename in file_list:
        if not os.path.exists(os.path.join(directory, filename)):
            print(f"File not found: {filename}")
            continue
        parse_filename_with_detailed_debug(filename)

# Usage
if __name__ == "__main__":
    directory_path = "data/cluster/endsim/Step6_2reps"

    # Scan the whole folder and print the summary report
    df, with_silence, without_silence, failed = analyze_directory_comprehensive(directory_path)

    # Look more closely at a small sample of the files lacking a silence block
    if without_silence:
        print(
            f"\n{'='*80}",
            "🔬 DETAILED ANALYSIS OF FILES WITHOUT SILENCE PARAMETERS",
            f"{'='*80}",
            sep="\n",
        )
        test_specific_files(without_silence[:5], directory_path)  # first 5 files only

📁 Found 102994 CSV files in directory: data/cluster/endsim/Step6_2reps

Processing file 1/102994

🔍 Analyzing: EndSim_epsM0.01_epsSD0.05___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM-0.75_MedSD0.1_SA0.1_ST1_SDO-0.75_SBBfalse.csv
   Base name: EndSim_epsM0.01_epsSD0.05___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM-0.75_MedSD0.1_SA0.1_ST1_SDO-0.75_SBBfalse
      SA: 0.1, ST: 1, SDO: -0.75, SBB: false

Processing file 2/102994

🔍 Analyzing: EndSim_epsM0.01_epsSD0.05___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM-0.75_MedSD0.1_SA0.1_ST1_SDO-0.75_SBBtrue.csv
   Base name: EndSim_epsM0.01_epsSD0.05___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM-0.75_MedSD0.1_SA0.1_ST1_SDO-0.75_SBBtrue
      SA: 0.1, ST: 1, SDO: -0.75, SBB: true

Proce

KeyboardInterrupt: 

NameError: name 'has_silence_params' is not defined

## Expected combinations

```    
expected_combinations = {
        'epsM': [0.01, 0.06, 0.11, 0.16, 0.21, 0.26],
        'epsSD': [0.05, 0.1, 0.15],
        'OpM': [0],
        'OpSD': [0.8],
        'MedInfF': [0, 0.5, 1],
        'MedM': [-0.75, 0, 0.75],
        'MedSD': [0.1, 0.4, 0.7, 1],
        'Silence_Alpha': [0.1, 0.4, 0.7, 1],
        'Silence_Tau': [1, 5, 9],
        'Silence_Delta0': [-0.75, 0, 0.75],
        'SilenceByBoundary': [True, False]
    }
```