# Morphosyntax Constraint Audit: Content-Scrambled Control

**Goal**: Test whether the sequencing of content words matters when the function-word skeleton is preserved.

**Conditions**:
1. **SENTENCE**: Real English ("the scientist decided to study...")
2. **JABBERWOCKY**: Nonsense words, preserved structure ("the prell decided to cleb...")
3. **CONTENT_SCRAMBLED**: Same function skeleton, shuffled content ("the cleb decided to braz...")

**Key comparison**: JABBERWOCKY vs CONTENT_SCRAMBLED (matched: n = 30 "to" instances in each condition)

**Prediction**:
- If **sequencing matters**: JABBERWOCKY > CONTENT_SCRAMBLED
- If **only skeleton matters**: JABBERWOCKY ≈ CONTENT_SCRAMBLED

In [None]:
# Upload stimuli_content_scrambled.json before running!
import json
import torch
import spacy
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from scipy import stats

# Install spaCy model if needed
try:
    nlp = spacy.load("en_core_web_sm")
except:
    !python -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm")

print("✓ Libraries loaded")

In [None]:
# Fetch the pretrained 'gpt2' tokenizer and LM-head model, and put the model
# into evaluation mode for inference.
print("Loading GPT-2...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# nn.Module.eval() returns the module itself, so the call can be chained.
model = GPT2LMHeadModel.from_pretrained('gpt2').eval()
print("✓ GPT-2 loaded")

In [None]:
# Load stimuli.
# The rest of the notebook indexes each entry with the keys 'set_id',
# 'sentence', 'jabberwocky_matched' and 'content_scrambled_jabberwocky'.
# The encoding is given explicitly so the file is read the same way on every
# platform (JSON text is UTF-8).
with open('stimuli_content_scrambled.json', 'r', encoding='utf-8') as f:
    stimuli = json.load(f)

# Fail with a clear message instead of an IndexError on stimuli[0] below.
if not stimuli:
    raise ValueError("stimuli_content_scrambled.json contains no stimulus sets")

print(f"✓ Loaded {len(stimuli)} stimulus sets")
print("\nExample (Set 1):")
print(f"  Sentence:     {stimuli[0]['sentence']}")
print(f"  Jabberwocky:  {stimuli[0]['jabberwocky_matched']}")
print(f"  Content-scrambled: {stimuli[0]['content_scrambled_jabberwocky']}")

In [None]:
# VERB lexicon (same as refined audit).
# Lowercase bare verb forms, written as whitespace-separated word lists and
# split into a set. A few template verbs repeat entries from the common list
# (or each other); the set collapses those duplicates.
VERB_SET = set(
    # Common verbs
    """
    be have do say go get make know think take
    see come want use find give tell work call try
    ask need feel become leave put mean keep let begin
    seem help show hear play run move like live believe
    bring happen write sit stand lose pay meet include continue
    set learn change lead understand watch follow stop create speak
    read allow add spend grow open walk win teach offer
    remember love consider appear buy serve die send build stay
    fall cut reach kill raise pass sell decide return explain
    hope develop carry break receive agree support hit produce eat
    """.split()
    # Additional verbs from templates
    + """
    study research investigate examine analyze explore
    paint draw create design build construct
    play perform practice rehearse
    publish write edit revise
    prepare cook bake
    repair fix mend
    solve calculate compute
    improve enhance upgrade
    debug test validate
    organize arrange sort
    defend protect guard
    film record capture
    sail navigate steer
    discuss debate argue
    assemble combine join
    refine polish perfect
    plan schedule arrange
    finish complete conclude
    """.split()
)

print(f"✓ VERB set: {len(VERB_SET)} verbs")

In [None]:
def get_verb_mass_after_to(text, model, tokenizer, nlp, verb_set, top_k=100):
    """
    Compute VERB probability mass in the model's next-token distribution
    after each "to" in ``text``.

    The text is split on whitespace, and every word equal to "to"
    (case-insensitive) is treated as a target. For each target, the words up
    to and including it are re-joined with single spaces, fed to the model,
    and the probabilities of the ``top_k`` most likely next tokens that
    (a) decode with a leading space (i.e. start a new word) and
    (b) are in ``verb_set`` after stripping and lowercasing are summed.

    Limitations:
        * No syntactic filtering is applied, so prepositional "to" is counted
          as well as infinitival "to".
        * A "to" with attached punctuation (e.g. "to,") is not matched.
        * Only single-token candidates inside the top ``top_k`` contribute,
          so the result is a lower bound on the mass of ``verb_set``.

    Args:
        text: Stimulus string.
        model: Callable language model; ``model(**inputs).logits`` must be
            indexable as ``[batch, position, vocab]``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            providing ``decode(list_of_ids)``.
        nlp: Unused. Kept so existing callers that pass a spaCy pipeline keep
            working; the text is no longer parsed because the parse was
            never consulted.
        verb_set: Container of lowercase verb strings.
        top_k: Number of most-probable next tokens to inspect (default 100).
            Clamped to the vocabulary size.

    Returns:
        to_instances: list of dicts, one per "to" occurrence, with keys
        ``to_word_index``, ``context``, ``verb_mass`` and
        ``num_context_tokens``. Empty if ``text`` contains no "to".
    """
    words = text.split()

    to_instances = []

    # Find all "to" tokens
    for i, word in enumerate(words):
        if word.lower() != 'to':
            continue

        # Build context up to and including "to"
        context = ' '.join(words[:i + 1])

        # Tokenize context
        inputs = tokenizer(context, return_tensors='pt')

        # Get next-token distribution (no gradients needed for inference)
        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.softmax(outputs.logits[0, -1, :], dim=-1)

        # torch.topk raises if k exceeds the dimension size, so clamp it.
        k = min(top_k, probs.shape[-1])
        top_k_probs, top_k_ids = torch.topk(probs, k)

        # Compute VERB mass; .tolist() yields plain Python floats/ints so
        # each token is decoded exactly once from an int id.
        verb_mass = 0.0
        for prob, token_id in zip(top_k_probs.tolist(), top_k_ids.tolist()):
            raw_token = tokenizer.decode([token_id])

            # Keep only word-start tokens (decoded text starts with a space)
            if not raw_token.startswith(' '):
                continue

            # Check if VERB
            if raw_token.strip().lower() in verb_set:
                verb_mass += prob

        to_instances.append({
            'to_word_index': i,
            'context': context,
            'verb_mass': verb_mass,
            'num_context_tokens': len(inputs['input_ids'][0])
        })

    return to_instances

print("✓ Analysis function defined")

In [None]:
# Run audit on all 3 conditions
from collections import Counter

# One flat record per "to" occurrence, across all sets and conditions.
results = []

# (key in each stimulus dict, label stored in the result records)
conditions = [
    ('sentence', 'SENTENCE'),
    ('jabberwocky_matched', 'JABBERWOCKY'),
    ('content_scrambled_jabberwocky', 'CONTENT_SCRAMBLED')
]

print("Running morphosyntax audit...\n")

# Report progress against the real number of sets rather than a fixed 30.
n_sets = len(stimuli)

for set_idx, stim_set in enumerate(stimuli):
    set_id = stim_set['set_id']

    if (set_idx + 1) % 5 == 0:
        print(f"Processing set {set_idx + 1}/{n_sets}...")

    for cond_key, cond_name in conditions:
        text = stim_set[cond_key]

        # Get VERB mass for all "to" instances
        to_instances = get_verb_mass_after_to(text, model, tokenizer, nlp, VERB_SET)

        # Record each instance, tagged with its set and condition
        for instance in to_instances:
            results.append({
                'set_id': set_id,
                'condition': cond_name,
                'text': text,
                'to_word_index': instance['to_word_index'],
                'context': instance['context'],
                'verb_mass': instance['verb_mass'],
                'num_context_tokens': instance['num_context_tokens']
            })

print(f"\n✓ Processed {n_sets} stimulus sets")
print(f"✓ Total instances: {len(results)}")

# Count per condition in a single pass; a Counter returns 0 for a condition
# that produced no instances.
instances_per_condition = Counter(r['condition'] for r in results)
for _, cond_name in conditions:
    n = instances_per_condition[cond_name]
    print(f"  {cond_name}: n={n}")

In [None]:
# Write the per-instance records to disk as 2-space-indented JSON.
output_file = 'morphosyntax_audit_content_scrambled_results.json'
serialized_results = json.dumps(results, indent=2)
with open(output_file, 'w') as out_handle:
    out_handle.write(serialized_results)

print(f"✓ Saved to: {output_file}")

In [None]:
# Collapse the per-instance records to one mean VERB mass per
# (stimulus set, condition).
from collections import defaultdict

# aggregated[set_id][condition] -> list of verb_mass values
aggregated = defaultdict(lambda: defaultdict(list))
for record in results:
    aggregated[record['set_id']][record['condition']].append(record['verb_mass'])

# One row per set, in sorted set_id order; a condition with no "to" instance
# for that set is recorded as NaN.
summary_data = [
    {
        'set_id': sid,
        **{
            label: (np.mean(aggregated[sid][label])
                    if label in aggregated[sid] else np.nan)
            for _, label in conditions
        },
    }
    for sid in sorted(aggregated.keys())
]

print("✓ Aggregated per sentence set")

In [None]:
# Statistical analysis: descriptive statistics per condition, then a paired
# comparison of JABBERWOCKY vs CONTENT_SCRAMBLED on per-set means.
import pandas as pd

df = pd.DataFrame(summary_data)

print("=" * 80)
print("RESULTS SUMMARY")
print("=" * 80)
print()

# Descriptive statistics (pandas .std() is the sample SD, ddof=1)
for _, cond_name in conditions:
    values = df[cond_name].dropna()
    print(f"{cond_name}:")
    print(f"  n = {len(values)}")
    print(f"  Mean = {values.mean():.4f}")
    print(f"  SD = {values.std():.4f}")
    print(f"  Range = [{values.min():.4f}, {values.max():.4f}]")
    print()

# Primary comparison: JABBERWOCKY vs CONTENT_SCRAMBLED (paired)
# Per-condition values with NaNs dropped independently; the paired test below
# uses the matched arrays instead.
jab_values = df['JABBERWOCKY'].dropna()
scram_values = df['CONTENT_SCRAMBLED'].dropna()

# Ensure matched pairs: keep only sets that have a value in both conditions
matched_df = df[['JABBERWOCKY', 'CONTENT_SCRAMBLED']].dropna()
jab_matched = matched_df['JABBERWOCKY'].values
scram_matched = matched_df['CONTENT_SCRAMBLED'].values

# A paired test needs at least two pairs; fail clearly rather than printing
# conclusions derived from NaN statistics.
if len(matched_df) < 2:
    raise ValueError(
        f"Need at least 2 matched JABBERWOCKY/CONTENT_SCRAMBLED pairs, "
        f"got {len(matched_df)}"
    )

# Paired t-test
t_stat, p_value = stats.ttest_rel(jab_matched, scram_matched)

# Cohen's d (paired): mean difference over the SAMPLE SD of the differences.
# numpy's .std() defaults to the population SD (ddof=0), which would inflate
# d by sqrt(n/(n-1)) and disagree with t/sqrt(n); ddof=1 fixes that and also
# matches the pandas SDs printed above.
diff = jab_matched - scram_matched
d = diff.mean() / diff.std(ddof=1)

print("=" * 80)
print("PRIMARY COMPARISON: JABBERWOCKY vs CONTENT_SCRAMBLED")
print("=" * 80)
print()
print(f"n (matched pairs): {len(jab_matched)}")
print(f"JABBERWOCKY:      {jab_matched.mean():.4f} ± {jab_matched.std(ddof=1):.4f}")
print(f"CONTENT_SCRAMBLED: {scram_matched.mean():.4f} ± {scram_matched.std(ddof=1):.4f}")
print()
print(f"Δ (Jab - Scram):  {diff.mean():.4f} ({diff.mean()*100:.1f}% VERB mass)")
print()
print(f"Paired t-test:")
print(f"  t({len(diff)-1}) = {t_stat:.3f}")
print(f"  p = {p_value:.4f}")
print(f"  Cohen's d = {d:.3f}")
print()

# Significance stars
if p_value < 0.001:
    sig = "***"
elif p_value < 0.01:
    sig = "**"
elif p_value < 0.05:
    sig = "*"
else:
    sig = "n.s."

print(f"Significance: {sig}")
print()

# Effect-size label. d is NaN when every difference is identical and zero
# (0/0); without the explicit check a NaN would fail every "<" comparison
# and be reported as "large".
if np.isnan(d):
    effect = "undefined"
elif abs(d) < 0.2:
    effect = "negligible"
elif abs(d) < 0.5:
    effect = "small"
elif abs(d) < 0.8:
    effect = "medium"
else:
    effect = "large"

print(f"Effect size: {effect}")
print()

# Interpretation
print("=" * 80)
print("INTERPRETATION")
print("=" * 80)
print()

# A directional conclusion requires both p < 0.05 and |Δ| > 0.02.
# NOTE(review): the final branch covers non-significant results as well as
# significant differences smaller than 0.02; a non-significant test is not by
# itself evidence of equivalence, so read the "≈" message with that in mind.
if diff.mean() > 0.02 and p_value < 0.05:
    print("✓ JABBERWOCKY > CONTENT_SCRAMBLED")
    print("  → Content SEQUENCING matters (not just function skeleton)")
    print("  → Model is sensitive to linear order of content words")
elif diff.mean() < -0.02 and p_value < 0.05:
    print("✓ CONTENT_SCRAMBLED > JABBERWOCKY")
    print("  → Unexpected! Scrambling IMPROVES predictions?")
    print("  → May indicate model exploits accidental regularities")
else:
    print("✓ JABBERWOCKY ≈ CONTENT_SCRAMBLED")
    print("  → Content sequencing does NOT matter")
    print("  → Only function-word SKELETON drives predictions")
    print("  → Supports purely structural (not sequential) constraint")

print()
print("=" * 80)

In [None]:
# Paired dot plot: one faint line per stimulus set joining its Jabberwocky and
# Content-Scrambled values, with the condition means overlaid in red.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))

# Individual pairs (x = 1 for Jabberwocky, x = 2 for Content-Scrambled)
for jab_val, scram_val in zip(jab_matched, scram_matched):
    ax.plot([1, 2], [jab_val, scram_val],
            'o-', color='gray', alpha=0.3, linewidth=0.5)

# Condition means
condition_means = [jab_matched.mean(), scram_matched.mean()]
ax.plot([1, 2], condition_means,
        'o-', color='red', linewidth=3, markersize=12, label='Mean')

# Axes, labels and a title carrying the paired-test summary
title_text = (f'VERB Mass After "to": Sequencing Effect\n'
              f'Δ = {diff.mean():.3f}, t({len(diff)-1}) = {t_stat:.2f}, p = {p_value:.4f}')
ax.set_xlim(0.5, 2.5)
ax.set_xticks([1, 2])
ax.set_xticklabels(['Jabberwocky', 'Content-Scrambled'])
ax.set_ylabel('VERB Probability Mass', fontsize=12)
ax.set_title(title_text, fontsize=14, fontweight='bold')

ax.legend()
ax.grid(axis='y', alpha=0.3)

# Write the figure to disk before displaying it
plt.tight_layout()
plt.savefig('morphosyntax_content_scrambled_paired_plot.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved plot to: morphosyntax_content_scrambled_paired_plot.png")

In [None]:
# Write the per-set summary table (without the DataFrame index) and list the
# three output files produced by this notebook.
df.to_csv('morphosyntax_content_scrambled_summary.csv', index=False)
print("✓ Saved summary to: morphosyntax_content_scrambled_summary.csv")

rule = "=" * 80
print("\n" + rule)
print("AUDIT COMPLETE!")
print(rule)
print("\nDownload these files:")
for listing_line in (
    "  1. morphosyntax_audit_content_scrambled_results.json",
    "  2. morphosyntax_content_scrambled_summary.csv",
    "  3. morphosyntax_content_scrambled_paired_plot.png",
):
    print(listing_line)