# Identifying Missing Data

This script iterates through a data folder and identifies missing parameter combinations, assuming that every combination of the expected parameter values must be present.

## Parsing filenames

In [36]:

import re
import os

# no_silence_parsed_counts = 0 # Keeping track of which pattern was matched
# silence_parsed_counts = 0 # Keeping track of which pattern was matched

# Regex for simulation output file names. The trailing silence block
# (_SA.._ST.._SDO.._SBB..) is optional so that both the old format (without
# it) and the new format (with it) are accepted.
pattern = (
    r"EndSim_epsM(-?[\d\.]+)_epsSD(-?[\d\.]+)"
    r"___OpD([a-zA-Z-]+)_OpM(-?[\d\.]+)_OpSD(-?[\d\.]+)"
    r"___Net([a-zA-Z-]+)___NAgents(\d+)___RS(\d+)"
    r"__MedInfF([\d\.]+)___MedD([a-zA-Z-]+)_MedN(\d+)_MedM(-?[\d\.]+)_MedSD(-?[\d\.]+)"
    r"(_SA(-?[\d\.]+)_ST(-?[\d\.]+)_SDO(-?[\d\.]+)_SBB([a-zA-Z]+))?"
)


def _strip_extension(filename):
    """Return *filename* without a trailing file extension such as ``.csv``.

    ``os.path.splitext`` splits at the last dot, and in these file names the
    last dot may be the decimal point of a parameter value (``MedSD0.1``,
    ``SDO0.75_SBBfalse``). The suffix is therefore only dropped when it looks
    like a real extension: purely alphanumeric and not starting with a digit.
    """
    stem, ext = os.path.splitext(filename)
    suffix = ext[1:]
    if suffix.isalnum() and not suffix[0].isdigit():
        return stem
    return filename


def parse_filename(filename):
    """Extract simulation parameters from a file name.

    Args:
        filename: File name, with or without an extension such as ``.csv``.

    Returns:
        A dict with the parsed parameters, or ``None`` when the name does not
        match ``pattern``. The four silence keys (``Silence_Alpha``,
        ``Silence_Tau``, ``Silence_Delta0``, ``SilenceByBoundary``) are always
        present and are ``None`` when the name has no silence block.

    Raises:
        ValueError: If a numeric field matches the pattern but is not a valid
            number (for example ``0.1.2``).
    """
    match = re.search(pattern, _strip_extension(filename))
    if match is None:
        return None

    result = {
        'epsM': float(match.group(1)),
        'epsSD': float(match.group(2)),
        'OpD': match.group(3),
        'OpM': float(match.group(4)),
        'OpSD': float(match.group(5)),
        'NetworkType': match.group(6),
        'NAgents': int(match.group(7)),
        'RandomSeed': int(match.group(8)),
        'MedInfF': float(match.group(9)),
        'MedD': match.group(10),
        'MedN': int(match.group(11)),
        'MedM': float(match.group(12)),
        'MedSD': float(match.group(13)),
    }

    if match.group(14):  # Group 14 is the entire optional silence block
        result.update({
            'Silence_Alpha': float(match.group(15)),
            'Silence_Tau': float(match.group(16)),
            'Silence_Delta0': float(match.group(17)),
            'SilenceByBoundary': match.group(18).lower() == 'true',
        })
    else:
        # Keep the key set identical for both formats so that rows built
        # from these dicts always have the same columns.
        result.update({
            'Silence_Alpha': None,
            'Silence_Tau': None,
            'Silence_Delta0': None,
            'SilenceByBoundary': None,
        })

    return result

# Smoke-test the parser on three sample names: one without the silence block
# and two with it (a positive and a negative SDO value).
test_filenames = [
    "EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1.csv",
    "EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1_SA0.1_ST1_SDO0.75_SBBfalse.csv",
    "EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1_SA0.1_ST1_SDO-0.75_SBBtrue.csv"
]

# map() is lazy, so each name is parsed right before its two lines are printed.
for filename, params in zip(test_filenames, map(parse_filename, test_filenames)):
    print(f"Filename: {filename}")
    print(f"Parsed: {params}\n")

Filename: EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1.csv
Parsed: {'epsM': 0.01, 'epsSD': 0.1, 'OpD': 'normal', 'OpM': 0.0, 'OpSD': 0.8, 'NetworkType': 'Scale-free', 'NAgents': 1000, 'RandomSeed': 1, 'MedInfF': 0.5, 'MedD': 'deterministic-normal', 'MedN': 10, 'MedM': 0.75, 'MedSD': 0.1, 'Silence_Alpha': None, 'Silence_Tau': None, 'Silence_Delta0': None, 'SilenceByBoundary': None}

Filename: EndSim_epsM0.01_epsSD0.1___OpDnormal_OpM0_OpSD0.8___NetScale-free___NAgents1000___RS1__MedInfF0.5___MedDdeterministic-normal_MedN10_MedM0.75_MedSD0.1_SA0.1_ST1_SDO0.75_SBBfalse.csv
Parsed: {'epsM': 0.01, 'epsSD': 0.1, 'OpD': 'normal', 'OpM': 0.0, 'OpSD': 0.8, 'NetworkType': 'Scale-free', 'NAgents': 1000, 'RandomSeed': 1, 'MedInfF': 0.5, 'MedD': 'deterministic-normal', 'MedN': 10, 'MedM': 0.75, 'MedSD': 0.1, 'Silence_Alpha': 0.1, 'Silence_Tau': 1.0, 'Silence_Delta0': 0.75, 'SilenceByBoundary': False}

In [45]:
import os
import pandas as pd
from itertools import product
import numpy as np

# Placeholder for a file name to inspect; no active code in this cell reads it.
# NOTE(review): confirm whether it is still needed.
fileofinterest = ""

def analyze_simulation_coverage(directory, expected_combinations=None):
    """Report which expected parameter combinations have no simulation file.

    Every ``.csv`` file name in *directory* is parsed with ``parse_filename``
    and the parameter tuples found are compared against the Cartesian product
    of *expected_combinations*. A summary is printed; when combinations are
    missing they are also written to ``missing_simulations.csv`` inside
    *directory*.

    Args:
        directory: Folder containing the simulation output files.
        expected_combinations: Optional mapping of parameter name to the list
            of values the experimental design requires. Defaults to the
            built-in design below. Parameters that are not columns of the
            parsed data are ignored, and the silence parameters are ignored
            when no parsed file carries them. Values are matched by equality,
            so ``0`` matches a parsed ``0.0``.

    Returns:
        A tuple ``(df, missing_combinations)``: ``df`` is a DataFrame with one
        row per successfully parsed file, ``missing_combinations`` is a list
        of dicts (parameter name -> value), one per absent combination. When
        no file can be parsed, an empty DataFrame and an empty list are
        returned so that callers can always unpack two values.
    """
    report_name = 'missing_simulations.csv'
    report_path = os.path.join(directory, report_name)
    silence_params = ['Silence_Alpha', 'Silence_Tau', 'Silence_Delta0', 'SilenceByBoundary']

    if expected_combinations is None:
        # Default experimental design; pass expected_combinations to override.
        expected_combinations = {
            'epsM': [0.01, 0.06, 0.11, 0.16, 0.21, 0.26],
            'epsSD': [0, 0.05, 0.1, 0.15],
            'OpM': [0],
            'OpSD': [0.8],
            'MedInfF': [0, 0.5, 1],
            'MedM': [-0.75, 0, 0.75],
            'MedSD': [0.1, 0.4, 0.7, 1],
            'Silence_Alpha': [0.1, 0.4, 0.7, 1],
            'Silence_Tau': [1, 5, 9],
            'Silence_Delta0': [-0.75, 0, 0.75],
            'SilenceByBoundary': [True, False],
            'RandomSeed': [1, 2],
        }

    # The report written at the end lives in the same folder; skip it so a
    # re-run does not count it as an unparseable simulation file.
    csv_files = [
        f for f in os.listdir(directory)
        if f.endswith('.csv') and f != report_name
    ]

    # Parse all file names and collect their parameters
    all_params = []
    for filename in csv_files:
        params = parse_filename(filename)
        if params:
            all_params.append(params)
        else:
            print(f"Warning: Could not parse {filename}")

    print(f"Successfully parsed {len(all_params)}/{len(csv_files)} files")

    if not all_params:
        print("No valid files found!")
        return pd.DataFrame(), []

    # Convert to DataFrame for easier analysis
    df = pd.DataFrame(all_params)

    # Identify which files have silence parameters
    has_silence_params = df['Silence_Alpha'].notna()
    print(f"\nFiles with silence parameters: {has_silence_params.sum()}")
    print(f"Files without silence parameters: {(~has_silence_params).sum()}")
    print(f"File index without silence: {np.where(~has_silence_params)}")

    # List the observed values of each parameter. RandomSeed is left out of
    # this listing only; it still takes part in the combination check below
    # when it appears in expected_combinations.
    for col in df.columns:
        if col == 'RandomSeed':
            continue
        unique_vals = sorted(df[col].dropna().unique())
        print(f"\n{col}: {len(unique_vals)} unique values")
        print(f"  Values: {unique_vals}")

    # Keep only expected parameters that exist in the data; silence
    # parameters are required only if at least one file carries them.
    any_silence = has_silence_params.any()
    actual_expected = {
        param: values
        for param, values in expected_combinations.items()
        if param in df.columns and (param not in silence_params or any_silence)
    }

    # Generate all possible combinations
    param_names = list(actual_expected)
    all_possible_combinations = list(product(*actual_expected.values()))

    print(f"\nExpected total combinations: {len(all_possible_combinations)}")

    # Collect the combinations that exist; absent values are normalised to
    # None so they never compare equal to an expected value.
    existing_combinations = {
        tuple(row[param] if pd.notna(row[param]) else None for param in param_names)
        for _, row in df.iterrows()
    }

    # Expected combinations for which no file was found
    missing_combinations = [
        dict(zip(param_names, combo))
        for combo in all_possible_combinations
        if combo not in existing_combinations
    ]

    print(f"Existing combinations: {len(existing_combinations)}")
    # Count the actual misses: files outside the expected design would make
    # "expected minus existing" too small or even negative.
    print(f"Missing combinations: {len(missing_combinations)}")

    print("\n=== MISSING COMBINATIONS ANALYSIS ===")

    if not missing_combinations:
        print("All expected combinations are present! 🎉")
        return df, missing_combinations

    missing_df = pd.DataFrame(missing_combinations)

    print("\nMissing combinations by parameter:")
    for param in param_names:
        missing_values = missing_df[param].unique()

        print(f"\n{param}:")
        print(f"  Missing: {len(missing_values)}/{len(actual_expected[param])} values")
        print(f"  Missing values: {sorted(missing_values)}")

        # Show how the missing combinations are spread over this parameter's values
        print("  Distribution of missing values:")
        for value, count in missing_df[param].value_counts().items():
            percentage = (count / len(missing_df)) * 100
            print(f"    {value}: {count} combinations ({percentage:.1f}%)")

    # Save missing combinations to CSV for reference
    missing_df.to_csv(report_path, index=False)
    print(f"\nMissing combinations saved to: {report_path}")

    print("\n=== RECOMMENDATIONS FOR NEXT SIMULATIONS ===")

    # Core parameters that appear in at least one missing combination
    critical_params = [
        (param, missing_df[param].unique())
        for param in ['epsM', 'epsSD', 'OpM', 'OpSD']
        if param in param_names
    ]

    if critical_params:
        print("Priority: Focus on completing core parameter combinations:")
        for param, values in critical_params:
            print(f"  {param}: {values}")

    # Silence parameters that appear in at least one missing combination
    missing_silence = [p for p in silence_params if p in param_names]

    if missing_silence:
        print("\nSilence parameters needing completion:")
        for param in missing_silence:
            print(f"  {param}: {missing_df[param].unique()}")

    return df, missing_combinations

# Usage
# Runs the coverage analysis on the Step6_2reps results folder.
if __name__ == "__main__":
    directory_path = "data/cluster/endsim/Step6_2reps"
    # df: one row per parsed file; missing: list of dicts, one per absent combination
    df, missing = analyze_simulation_coverage(directory_path)

Successfully parsed 102992/102993 files

Files with silence parameters: 102992
Files without silence parameters: 0
File index without silence: (array([], dtype=int64),)

epsM: 6 unique values
  Values: [np.float64(0.01), np.float64(0.06), np.float64(0.11), np.float64(0.16), np.float64(0.21), np.float64(0.26)]

epsSD: 4 unique values
  Values: [np.float64(0.0), np.float64(0.05), np.float64(0.1), np.float64(0.15)]

OpD: 1 unique values
  Values: ['normal']

OpM: 1 unique values
  Values: [np.float64(0.0)]

OpSD: 1 unique values
  Values: [np.float64(0.8)]

NetworkType: 1 unique values
  Values: ['Scale-free']

NAgents: 1 unique values
  Values: [np.int64(1000)]

MedInfF: 3 unique values
  Values: [np.float64(0.0), np.float64(0.5), np.float64(1.0)]

MedD: 1 unique values
  Values: ['deterministic-normal']

MedN: 1 unique values
  Values: [np.int64(10)]

MedM: 3 unique values
  Values: [np.float64(-0.75), np.float64(0.0), np.float64(0.75)]

MedSD: 4 unique values
  Values: [np.float64(0.1)

NameError: name 'has_silence_params' is not defined

## Expected combinations

```python
expected_combinations = {
    'epsM': [0.01, 0.06, 0.11, 0.16, 0.21, 0.26],
    'epsSD': [0.05, 0.1, 0.15],
    'OpM': [0],
    'OpSD': [0.8],
    'MedInfF': [0, 0.5, 1],
    'MedM': [-0.75, 0, 0.75],
    'MedSD': [0.1, 0.4, 0.7, 1],
    'Silence_Alpha': [0.1, 0.4, 0.7, 1],
    'Silence_Tau': [1, 5, 9],
    'Silence_Delta0': [-0.75, 0, 0.75],
    'SilenceByBoundary': [True, False]
}
```