In [6]:
import pandas as pd
import numpy as np

# Read the CSV file
data = pd.read_csv('net2brain/architectures/taxonomy.csv')

# Identify where actual data begins (after header rows)
data_start = 1  # Based on your sample, data starts from row index 1

# Get the actual data rows
model_data = data.iloc[data_start:].copy()

# Function to check if a value represents an 'x' marking
def is_marked(val):
    return val == 'x' or val == 'X'

# Function to count models with a specific attribute
def count_by_attribute(column_name):
    return model_data[column_name].apply(is_marked).sum()

# Function to get example models for a category
def get_example_models(column_name, limit=3):
    mask = model_data[column_name].apply(is_marked)
    examples = model_data[mask]['Model'].head(limit)
    if len(examples) == 0:
        return "N/A"
    return ", ".join(examples.astype(str).values)

# Count architecture types
arch_counts = {
    "CNN": count_by_attribute("Convolutional Neural Network"),
    "Transformer-based": count_by_attribute("Transformer-based Models"),
    "Vision Transformer": count_by_attribute("Vision Transformer"),
    "Swin-Transformer": count_by_attribute("Swin-Transformer"),
    "MLP-Mixer": count_by_attribute("MLP-Mixer")
}

# Count multimodal models
multimodal_count = count_by_attribute("Multimodal")

# Count task types
task_counts = {
    "Image Classification": count_by_attribute("Image Classification"),
    "Object Detection": count_by_attribute("Object Detection"),
    "Panoptic Segmentation": count_by_attribute("Panoptic Segmentation"),
    "Semantic Segmentation": count_by_attribute("Semantic Segmentation"),
    "Instance Segmentation": count_by_attribute("Instance Segmentation"),
    "Keypoint Detection": count_by_attribute("Keypoint Detection"),
    "Pose Estimation": count_by_attribute("Pose Estimation"),
    "Video Classification": count_by_attribute("Video Classification"),
    "Natural Language Processing": count_by_attribute("Natural Language Processing"),
    "Audio Tagging": count_by_attribute("Audio Tagging"),
    "Various visual tasks": count_by_attribute("Various visual tasks")
}

# Calculate combined segmentation count
seg_columns = ["Panoptic Segmentation", "Semantic Segmentation", "Instance Segmentation"]
segmentation_models = set()
for col in seg_columns:
    # Get indices of models marked for this segmentation type
    marked_indices = model_data[model_data[col].apply(is_marked)].index
    segmentation_models.update(marked_indices)
segmentation_count = len(segmentation_models)

# Count training paradigm types
paradigm_counts = {
    "Supervised": count_by_attribute("Supervised"),
    "Jigsaw": count_by_attribute("Jigsaw"),
    "NPID": count_by_attribute("NPID"),
    "RotNet": count_by_attribute("RotNet"),
    "Clusterfit": count_by_attribute("Clusterfit"),
    "Deepcluster": count_by_attribute("Deepcluser"),  # Note: typo in column name
    "SimCLR": count_by_attribute("SimCLR"),
    "SwAV": count_by_attribute("SwAV"),
    "MoCo": count_by_attribute("MoCo"),
    "CLIP": count_by_attribute("Contrastive Language Image Pre-Training")
}

# Calculate combined self-supervised count
ss_methods = ["Jigsaw", "NPID", "RotNet", "Clusterfit", "Deepcluser", "SimCLR", "SwAV", "MoCo"]
ss_models = set()
for method in ss_methods:
    # Get indices of models marked for this self-supervised method
    marked_indices = model_data[model_data[method].apply(is_marked)].index
    ss_models.update(marked_indices)
self_supervised_count = len(ss_models)

# Print counts for debugging
print("=== ARCHITECTURE COUNTS ===")
for arch, count in arch_counts.items():
    print(f"{arch}: {count}")
print(f"Multimodal: {multimodal_count}")

print("\n=== TASK COUNTS ===")
for task, count in task_counts.items():
    print(f"{task}: {count}")
print(f"Combined Segmentation: {segmentation_count}")

print("\n=== TRAINING PARADIGM COUNTS ===")
for paradigm, count in paradigm_counts.items():
    print(f"{paradigm}: {count}")
print(f"Combined Self-supervised: {self_supervised_count}")

# Print total model count
print(f"\nTotal models: {len(model_data)}")

# Generate Table 1: Hierarchical Summary Table
table1_data = {
    "Architecture Type": [],
    "Count": [],
    "Examples": [],
    "Typical Applications in Neuroscience": []
}

neuroscience_applications = {
    "CNN": "Visual cortex modeling, object recognition",
    "Transformer-based": "Complex scene understanding, hierarchical processing",
    "Vision Transformer": "Visual attention mechanisms, global feature extraction",
    "Swin-Transformer": "Hierarchical visual processing, local-global integration",
    "MLP-Mixer": "Visual feature extraction without convolutions",
    "Multimodal": "Cross-modal integration studies"
}

for arch, count in arch_counts.items():
    if count > 0:
        table1_data["Architecture Type"].append(arch)
        table1_data["Count"].append(count)
        
        # Get examples based on architecture type
        if arch == "CNN":
            examples = get_example_models("Convolutional Neural Network")
        elif arch == "Transformer-based":
            examples = get_example_models("Transformer-based Models")
        elif arch == "Vision Transformer":
            examples = get_example_models("Vision Transformer")
        elif arch == "Swin-Transformer":
            examples = get_example_models("Swin-Transformer")
        elif arch == "MLP-Mixer":
            examples = get_example_models("MLP-Mixer")
        else:
            examples = "N/A"
            
        table1_data["Examples"].append(examples)
        table1_data["Typical Applications in Neuroscience"].append(neuroscience_applications.get(arch, "Various visual processes"))

# Add Multimodal as a separate entry
if multimodal_count > 0:
    table1_data["Architecture Type"].append("Multimodal")
    table1_data["Count"].append(multimodal_count)
    table1_data["Examples"].append(get_example_models("Multimodal"))
    table1_data["Typical Applications in Neuroscience"].append(neuroscience_applications["Multimodal"])

table1 = pd.DataFrame(table1_data)

# Generate Table 2: Training Paradigm Distribution
table2_data = {
    "Training Paradigm": [
        "Supervised", 
        "Self-supervised", 
        "Multimodal (CLIP)",
        "Task-specific"
    ],
    "Dataset Examples": [
        "ImageNet, COCO",
        "SimCLR, MoCo, SwAV",
        "CLIP (image-text)",
        "Object detection, segmentation"
    ],
    "Count": [
        paradigm_counts["Supervised"],
        self_supervised_count,
        paradigm_counts["CLIP"],
        task_counts["Object Detection"] + segmentation_count
    ],
    "Neuroscientific Relevance": [
        "Comparison to human category learning",
        "Unsupervised feature representation similar to developmental processes",
        "Cross-modal integration studies",
        "Specialized visual processing pathways"
    ]
}

# Add individual self-supervised methods with non-zero counts
for method in ss_methods:
    method_key = method if method != "Deepcluster" else "Deepcluster"  # Handle naming inconsistency
    if paradigm_counts.get(method_key, 0) > 0:
        table2_data["Training Paradigm"].append(f"  {method}")  # Indented to show hierarchy
        table2_data["Dataset Examples"].append("")
        table2_data["Count"].append(paradigm_counts[method_key])
        table2_data["Neuroscientific Relevance"].append("")

table2 = pd.DataFrame(table2_data)

# Generate Table 3: Task-Based Selection Criteria
table3_data = {
    "Task Category": [],
    "# Models": [],
    "Key Architectures": [],
    "Neuroscientific Application": []
}



task_neuroscience_mapping = {
    "Image Classification": "Ventral visual stream, object recognition",
    "Object Detection": "Attentional mechanisms, object localization",
    "Segmentation": "Scene parsing, figure-ground segregation",
    "Video Classification": "Motion processing, temporal integration",
    "Audio Tagging": "Auditory processing pathways",
    "Keypoint Detection": "Biological motion perception",
    "Pose Estimation": "Action recognition, motor system modeling",
    "Natural Language Processing":"?",
    "Various visual tasks":"other"
    
}

# Add all tasks with non-zero counts
for task, count in task_counts.items():
    if count > 0:
        table3_data["Task Category"].append(task)
        table3_data["# Models"].append(count)
        table3_data["Key Architectures"].append(get_example_models(task))
        
        # Map to neuroscience application
        if task in ["Panoptic Segmentation", "Semantic Segmentation", "Instance Segmentation"]:
            neuro_app = task_neuroscience_mapping["Segmentation"]
        else:
            neuro_app = task_neuroscience_mapping.get(task, "Various neural processes")
            
        table3_data["Neuroscientific Application"].append(neuro_app)

# Add combined segmentation entry if it has a count
if segmentation_count > 0:
    table3_data["Task Category"].append("All Segmentation Tasks")
    table3_data["# Models"].append(segmentation_count)
    
    # Get examples from different segmentation types
    seg_examples = []
    for seg_type in ["Panoptic Segmentation", "Semantic Segmentation", "Instance Segmentation"]:
        examples = get_example_models(seg_type)
        if examples != "N/A":
            seg_examples.append(examples)
    
    table3_data["Key Architectures"].append(", ".join(seg_examples) if seg_examples else "Mask R-CNN, DeepLab, UNet")
    table3_data["Neuroscientific Application"].append(task_neuroscience_mapping["Segmentation"])

table3 = pd.DataFrame(table3_data)

# Print tables
print("\n===== TABLE 1: Hierarchical Summary Table =====")
print(table1.to_string(index=False))

print("\n\n===== TABLE 2: Training Paradigm Distribution =====")
print(table2.to_string(index=False))

print("\n\n===== TABLE 3: Task-Based Selection Criteria =====")
print(table3.to_string(index=False))

# Generate LaTeX tables
def generate_latex_table(df, caption):
    if len(df) == 0:
        return f"% Empty table for {caption}"
    
    latex = "\\begin{table}[htbp]\n\\centering\n"
    latex += f"\\caption{{{caption}}}\n"
    latex += "\\begin{tabular}{" + "l" * len(df.columns) + "}\n"
    latex += "\\toprule\n"
    
    # Headers
    latex += " & ".join(df.columns) + " \\\\\n"
    latex += "\\midrule\n"
    
    # Rows
    for _, row in df.iterrows():
        latex += " & ".join([str(val) for val in row.values]) + " \\\\\n"
    
    latex += "\\bottomrule\n"
    latex += "\\end{tabular}\n"
    latex += "\\end{table}\n"
    
    return latex

latex1 = generate_latex_table(table1, "Distribution of models by architecture type")
latex2 = generate_latex_table(table2, "Distribution of models by training paradigm")
latex3 = generate_latex_table(table3, "Distribution of models by task category")

print("\n\n===== LATEX TABLE 1 =====")
print(latex1)

print("\n===== LATEX TABLE 2 =====")
print(latex2)

print("\n===== LATEX TABLE 3 =====")
print(latex3)

=== ARCHITECTURE COUNTS ===
CNN: 522
Transformer-based: 13
Vision Transformer: 80
Swin-Transformer: 10
MLP-Mixer: 25
Multimodal: 5

=== TASK COUNTS ===
Image Classification: 520
Object Detection: 32
Panoptic Segmentation: 4
Semantic Segmentation: 1
Instance Segmentation: 24
Keypoint Detection: 4
Pose Estimation: 0
Video Classification: 6
Natural Language Processing: 6
Audio Tagging: 29
Various visual tasks: 23
Combined Segmentation: 29

=== TRAINING PARADIGM COUNTS ===
Supervised: 600
Jigsaw: 7
NPID: 2
RotNet: 2
Clusterfit: 1
Deepcluster: 3
SimCLR: 5
SwAV: 7
MoCo: 1
CLIP: 5
Combined Self-supervised: 28

Total models: 649

===== TABLE 1: Hierarchical Summary Table =====
 Architecture Type  Count                                                                                        Examples                     Typical Applications in Neuroscience
               CNN    522                                                                     AlexNet, ResNet18, ResNet34               Visual 

In [3]:
import pandas as pd
import numpy as np

# Read the CSV file
data = pd.read_csv('net2brain/architectures/taxonomy.csv')
# Identify where actual data begins (after header rows)
data_start = 1  # Based on your sample, data starts from row index 1

# Get the actual data rows
model_data = data.iloc[data_start:].copy()

# Function to check if a value represents an 'x' marking
def is_marked(val):
    if pd.isna(val):
        return False
    return str(val).lower() == 'x'

# Count total models
total_models = len(model_data)
print(f"Total models in taxonomy: {total_models}")

# ARCHITECTURE ANALYSIS
print("\n=== ARCHITECTURE ANALYSIS ===")

# Create a dictionary to track which models belong to each architecture type
arch_models = {
    "CNN": [],
    "Transformer-based": [],
    "Vision Transformer": [],
    "Swin-Transformer": [],
    "MLP-Mixer": [],
    "Multimodal": []
}

# Collect models for each architecture type
for idx, row in model_data.iterrows():
    model_name = row['Model']
    
    if is_marked(row['Convolutional Neural Network']):
        arch_models["CNN"].append(model_name)
    
    if is_marked(row['Transformer-based Models']):
        arch_models["Transformer-based"].append(model_name)
    
    if is_marked(row['Vision Transformer']):
        arch_models["Vision Transformer"].append(model_name)
    
    if is_marked(row['Swin-Transformer']):
        arch_models["Swin-Transformer"].append(model_name)
    
    if is_marked(row['MLP-Mixer']):
        arch_models["MLP-Mixer"].append(model_name)
    
    if is_marked(row['Multimodal']):
        arch_models["Multimodal"].append(model_name)

# Print architecture model counts
for arch, models in arch_models.items():
    print(f"{arch}: {len(models)} models")

# Find models with multiple architecture labels
all_arch_models = set()
multi_arch_models = set()

for arch, models in arch_models.items():
    for model in models:
        if model in all_arch_models:
            multi_arch_models.add(model)
        all_arch_models.add(model)

# Calculate models with architecture labels and models without
models_with_arch = len(all_arch_models)
models_without_arch = total_models - models_with_arch

print(f"\nModels with multiple architecture labels: {len(multi_arch_models)}")
print(f"Models with at least one architecture label: {models_with_arch}")
print(f"Models without any architecture label: {models_without_arch}")

# Print examples of models with multiple architecture labels
if multi_arch_models:
    print("\nExamples of models with multiple architecture labels:")
    for model in list(multi_arch_models)[:10]:  # Show up to 10 examples
        archs = [arch for arch, models in arch_models.items() if model in models]
        print(f"  - {model}: {', '.join(archs)}")

# Print examples of models without architecture labels
if models_without_arch > 0:
    models_without_arch_list = [row['Model'] for idx, row in model_data.iterrows() 
                               if row['Model'] not in all_arch_models]
    print("\nExamples of models without architecture labels:")
    for model in models_without_arch_list:  # Show up to 10 examples
        print(f"  - {model}")

# TASK ANALYSIS
print("\n\n=== TASK ANALYSIS ===")

# Create a dictionary to track which models belong to each task
task_models = {
    "Image Classification": [],
    "Various visual tasks": [],
    "Object Detection": [],
    "Panoptic Segmentation": [],
    "Semantic Segmentation": [],
    "Instance Segmentation": [],
    "Keypoint Detection": [],
    "Pose Estimation": [],
    "Video Classification": [],
    "Natural Language Processing": [],
    "Audio Tagging": []
}

# Collect models for each task
for idx, row in model_data.iterrows():
    model_name = row['Model']
    
    if is_marked(row['Image Classification']):
        task_models["Image Classification"].append(model_name)
    
    if is_marked(row['Object Detection']):
        task_models["Object Detection"].append(model_name)
    
    if is_marked(row['Panoptic Segmentation']):
        task_models["Panoptic Segmentation"].append(model_name)
    
    if is_marked(row['Semantic Segmentation']):
        task_models["Semantic Segmentation"].append(model_name)
    
    if is_marked(row['Instance Segmentation']):
        task_models["Instance Segmentation"].append(model_name)
    
    if is_marked(row['Keypoint Detection']):
        task_models["Keypoint Detection"].append(model_name)
    
    if is_marked(row['Pose Estimation']):
        task_models["Pose Estimation"].append(model_name)
    
    if is_marked(row['Video Classification']):
        task_models["Video Classification"].append(model_name)
    
    if is_marked(row['Audio Tagging']):
        task_models["Audio Tagging"].append(model_name)
        
    if is_marked(row["Various visual tasks"]):
        task_models["Various visual tasks"].append(model_name)
        
    if is_marked(row["Natural Language Processing"]):
        task_models["Natural Language Processing"].append(model_name)

# Print task model counts
for task, models in task_models.items():
    print(f"{task}: {len(models)} models")

# Find models with multiple task labels
all_task_models = set()
multi_task_models = set()

for task, models in task_models.items():
    for model in models:
        if model in all_task_models:
            multi_task_models.add(model)
        all_task_models.add(model)

# Calculate models with task labels and models without
models_with_task = len(all_task_models)
models_without_task = total_models - models_with_task

print(f"\nModels with multiple task labels: {len(multi_task_models)}")
print(f"Models with at least one task label: {models_with_task}")
print(f"Models without any task label: {models_without_task}")

# Print examples of models with multiple task labels
if multi_task_models:
    print("\nExamples of models with multiple task labels:")
    for model in list(multi_task_models)[:10]:  # Show up to 10 examples
        tasks = [task for task, models in task_models.items() if model in models]
        print(f"  - {model}: {', '.join(tasks)}")

# Print examples of models without task labels
if models_without_task > 0:
    models_without_task_list = [row['Model'] for idx, row in model_data.iterrows() 
                               if row['Model'] not in all_task_models]
    print("\nExamples of models without task labels:")
    for model in models_without_task_list:  # Show up to 10 examples
        print(f"  - {model}")

# SEGMENTATION ANALYSIS
print("\n\n=== SEGMENTATION TASKS ANALYSIS ===")

# Create sets for each segmentation task
panoptic_models = set(task_models["Panoptic Segmentation"])
semantic_models = set(task_models["Semantic Segmentation"])
instance_models = set(task_models["Instance Segmentation"])

# Find the combined set (models that do any segmentation task)
all_seg_models = panoptic_models.union(semantic_models).union(instance_models)

# Check for models that appear in multiple segmentation categories
multi_seg_models = set()

for model in panoptic_models:
    if model in semantic_models or model in instance_models:
        multi_seg_models.add(model)

for model in semantic_models:
    if model in instance_models:
        multi_seg_models.add(model)

print(f"Panoptic Segmentation models: {len(panoptic_models)}")
print(f"Semantic Segmentation models: {len(semantic_models)}")
print(f"Instance Segmentation models: {len(instance_models)}")
print(f"Combined unique segmentation models: {len(all_seg_models)}")
print(f"Models in multiple segmentation categories: {len(multi_seg_models)}")

if multi_seg_models:
    print("\nModels that appear in multiple segmentation categories:")
    for model in multi_seg_models:
        categories = []
        if model in panoptic_models:
            categories.append("Panoptic")
        if model in semantic_models:
            categories.append("Semantic")
        if model in instance_models:
            categories.append("Instance")
        print(f"  - {model}: {', '.join(categories)}")

# CROSS-ANALYSIS BETWEEN ARCHITECTURE AND TASK
print("\n\n=== CROSS-ANALYSIS: ARCHITECTURE VS TASK ===")

# Find models that have architecture label but no task label
arch_but_no_task = all_arch_models - all_task_models
if arch_but_no_task:
    print(f"Models with architecture label but no task label: {len(arch_but_no_task)}")
    print("Examples:")
    for model in list(arch_but_no_task)[:10]:
        archs = [arch for arch, models in arch_models.items() if model in models]
        print(f"  - {model}: {', '.join(archs)}")

# Find models that have task label but no architecture label
task_but_no_arch = all_task_models - all_arch_models
if task_but_no_arch:
    print(f"\nModels with task label but no architecture label: {len(task_but_no_arch)}")
    print("Examples:")
    for model in list(task_but_no_arch)[:10]:
        tasks = [task for task, models in task_models.items() if model in models]
        print(f"  - {model}: {', '.join(tasks)}")

# Find models with both architecture and task labels
models_with_both = all_arch_models.intersection(all_task_models)
print(f"\nModels with both architecture and task labels: {len(models_with_both)}")

# Find models without any labels
models_without_any = set(model_data['Model']) - all_arch_models - all_task_models
if models_without_any:
    print(f"\nModels without any architecture or task labels: {len(models_without_any)}")
    print("Examples:")
    for model in list(models_without_any)[:10]:
        print(f"  - {model}")

Total models in taxonomy: 649

=== ARCHITECTURE ANALYSIS ===
CNN: 522 models
Transformer-based: 13 models
Vision Transformer: 80 models
Swin-Transformer: 10 models
MLP-Mixer: 25 models
Multimodal: 5 models

Models with multiple architecture labels: 15
Models with at least one architecture label: 640
Models without any architecture label: 9

Examples of models with multiple architecture labels:
  - efficientnet_b4: CNN
  - efficientnet_b0: CNN
  - efficientnet_b1: CNN
  - wide_resnet101_2: CNN
  - RN50: CNN, Multimodal
  - ViT-B_-_16: Vision Transformer, Multimodal
  - resnext50_32x4d: CNN
  - resnext101_32x8d: CNN
  - ViT-L_-_14: Vision Transformer, Multimodal
  - adv_inception_v3: CNN, Vision Transformer

Examples of models without architecture labels:


=== TASK ANALYSIS ===
Image Classification: 520 models
Various visual tasks: 23 models
Object Detection: 32 models
Panoptic Segmentation: 4 models
Semantic Segmentation: 1 models
Instance Segmentation: 24 models
Keypoint Detection: 4 