In [3]:
# =============================================================================
# SCRIPT: PHASE 7.1 - Multilingual Syntactic Analysis (X-UUAS)
#
# DESCRIPTION:
# This script adapts the Phase 0 UUAS analysis to run on a multilingual
# model (mBERT) against two different languages (English and Hindi) to
# identify language-neutral vs. language-specific syntactic heads,
# as described in the planOfAction.pdf.
# =============================================================================

print("─" * 80)
print("PHASE 7.1: Multilingual Syntactic Analysis (X-UUAS)")
print("─" * 80)

# --- 1. Environment Setup & Imports ---
print("STEP 1: Installing all required packages...")
import subprocess
import sys
import os

def install(package):
    """Installs a package using pip."""
    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

try:
    install("transformers")
    install("datasets")
    install("accelerate")
    install("stanza")
    install("numpy")
    install("pandas")
    install("conllu")
    print("All packages installed successfully.")
except Exception as e:
    print(f"An error occurred during installation: {e}")
    raise e

# --- 2. Import Libraries (AFTER installation) ---
print("\nSTEP 2: Importing libraries...")
try:
    import torch
    import numpy as np
    import pandas as pd
    import stanza
    import os
    import warnings
    from tqdm.notebook import tqdm
    from google.colab import files
    from transformers import (
        BertTokenizerFast,
        BertModel
    )
    import requests
    from conllu import parse
    print("Libraries imported successfully.")
except ImportError as e:
    print(f"A library failed to import: {e}")
    print("Please ensure all packages were installed correctly.")
    raise e

# Filter warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# --- 3. Download Stanza Models ---
print("\nSTEP 3: Downloading Stanza models...")
try:
    print("Downloading Stanza English ('en') model...")
    stanza.download('en', verbose=False)
    print("Downloading Stanza Hindi ('hi') model...")
    stanza.download('hi', verbose=False)
    print("Stanza models downloaded successfully!")
except Exception as e:
    print(f"An error occurred during Stanza model download: {e}")

# --- 4. Define Constants and Helper Functions ---
print("\nSTEP 4: Defining helper functions...")

MODEL_NAME = 'bert-base-multilingual-cased'
SAMPLE_SIZE = 1000 # Use a sample for faster analysis. Increase for more robust results.

# --- Functions from your AttentionHeads.ipynb (for UUAS) ---
def create_offset_based_map(sentence_text, tokenizer, stanza_doc):
    """Creates a robust alignment map between Stanza words and BERT tokens."""
    stanza_word_offsets = []
    original_words = []
    for sentence in stanza_doc.sentences:
        for word in sentence.words:
            original_words.append(word.text)
            stanza_word_offsets.append((word.start_char, word.end_char))

    encoding = tokenizer(sentence_text, return_offsets_mapping=True, return_tensors="pt")
    bert_token_offsets = encoding['offset_mapping'][0]

    token_to_word_map = []
    for token_start, token_end in bert_token_offsets:
        if token_start == 0 and token_end == 0:
            token_to_word_map.append(-1)
            continue
        found = False
        for word_idx, (word_start, word_end) in enumerate(stanza_word_offsets):
            if token_start >= word_start and token_end <= word_end:
                token_to_word_map.append(word_idx)
                found = True
                break
        if not found:
            token_to_word_map.append(-1)

    return token_to_word_map, encoding

def extract_gold_dependency_pairs_from_stanza(doc):
    """Extracts a set of (head, dependent) index pairs from a Stanza doc."""
    gold_pairs = set()
    offset = 0
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.head > 0:
                head_index = word.head - 1 + offset
                dependent_index = word.id - 1 + offset
                gold_pairs.add(tuple(sorted((head_index, dependent_index))))
        offset += len(sentence.words)
    return gold_pairs

def calculate_uuas_for_head(attention_matrix, token_to_word_map, gold_pairs):
    """Calculates the UUAS for a single head on a single sentence."""
    if not gold_pairs:
        return 0.0, 0
    correct_predictions = 0
    total_words = 0
    num_tokens = len(token_to_word_map)
    word_indices = sorted(list(set(idx for idx in token_to_word_map if idx != -1)))

    for word_idx in word_indices:
        total_words += 1
        source_token_indices = [i for i, w_idx in enumerate(token_to_word_map) if w_idx == word_idx]
        aggregated_attention = attention_matrix[source_token_indices, :].mean(axis=0)
        valid_targets_mask = np.array([1 if token_to_word_map[i] not in [-1, word_idx] else 0 for i in range(num_tokens)])
        if np.sum(valid_targets_mask) == 0:
            continue
        max_attention_idx = np.argmax(aggregated_attention * valid_targets_mask)
        predicted_head_word_idx = token_to_word_map[max_attention_idx]
        predicted_pair = tuple(sorted((word_idx, predicted_head_word_idx)))
        if predicted_pair in gold_pairs:
            correct_predictions += 1
    uuas = correct_predictions / total_words if total_words > 0 else 0.0
    return uuas, total_words

def run_uuas_analysis(model, tokenizer, nlp_pipeline, dataset, text_column):
    """
    Runs the full UUAS analysis loop from the notebook on a given model,
    pipeline, and dataset.
    """
    device = model.device
    model.eval()
    head_scores = {
        (l, h): []
        for l in range(model.config.num_hidden_layers)
        for h in range(model.config.num_attention_heads)
    }

    desc = f"Running UUAS Analysis on {nlp_pipeline.lang} ({len(dataset)} samples)"
    for example in tqdm(dataset, desc=desc):
        sentence_text = example[text_column]
        if not sentence_text: continue

        doc = nlp_pipeline(sentence_text)
        gold_dependencies = extract_gold_dependency_pairs_from_stanza(doc)
        token_to_word_map, encoding = create_offset_based_map(sentence_text, tokenizer, doc)

        model_inputs = {k: v.to(device) for k, v in encoding.items() if k != 'offset_mapping'}

        with torch.no_grad():
            outputs = model(**model_inputs)

        attentions = outputs.attentions

        for layer_idx in range(model.config.num_hidden_layers):
            for head_idx in range(model.config.num_attention_heads):
                attention_matrix = attentions[layer_idx][0, head_idx].cpu().numpy()
                uuas, num_words = calculate_uuas_for_head(attention_matrix, token_to_word_map, gold_dependencies)
                if num_words > 0:
                    head_scores[(layer_idx, head_idx)].append(uuas)

    # Aggregate results
    results_data = []
    for (layer, head), scores in head_scores.items():
        avg_score = np.mean(scores) if scores else 0
        results_data.append({"Layer": layer, "Head": head, "UUAS": avg_score})

    return pd.DataFrame(results_data)

print("All helper functions defined.")


# --- 5. Load Model and Tokenizer ---
print("\nSTEP 5: Loading mBERT model and tokenizer...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME, output_attentions=True)
model.to(device)
model.eval()
print(f"Model '{MODEL_NAME}' loaded successfully.")


# --- 6. Run Analysis for English ---
print("\n" + "─" * 80)
print("STEP 6: Running X-UUAS Analysis for English (en_ewt)...")
print("─" * 80)

# Initialize English pipeline
nlp_en = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse', verbose=False)

# Download and load English dev data from GitHub
print("Downloading English UD dev data...")
en_dev_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu"
response = requests.get(en_dev_url)
if response.status_code == 200:
    en_conllu = response.text
    en_sentences = list(parse(en_conllu))
    data_en = [{"text": " ".join([token['form'] for token in sent])} for sent in en_sentences[:SAMPLE_SIZE]]
    print(f"Loaded {len(data_en)} English sentences.")
else:
    print(f"Failed to download English data: {response.status_code}")
    raise ValueError("Could not download English dataset")

# Run analysis
df_en_uuas = run_uuas_analysis(model, tokenizer, nlp_en, data_en, "text")
df_en_uuas = df_en_uuas.rename(columns={'UUAS': 'UUAS_English'})

print("\nEnglish analysis complete.")
print(df_en_uuas.sort_values(by="UUAS_English", ascending=False).head(5).to_string(index=False))


# --- 7. Run Analysis for Hindi ---
print("\n" + "─" * 80)
print("STEP 7: Running X-UUAS Analysis for Hindi (hi_hdtb)...")
print("─" * 80)

# Initialize Hindi pipeline
nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos,lemma,depparse', verbose=False)

# Download and load Hindi dev data from GitHub
print("Downloading Hindi UD dev data...")
hi_dev_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/master/hi_hdtb-ud-dev.conllu"
response = requests.get(hi_dev_url)
if response.status_code == 200:
    hi_conllu = response.text
    hi_sentences = list(parse(hi_conllu))
    data_hi = [{"text": " ".join([token['form'] for token in sent])} for sent in hi_sentences[:SAMPLE_SIZE]]
    print(f"Loaded {len(data_hi)} Hindi sentences.")
else:
    print(f"Failed to download Hindi data: {response.status_code}")
    raise ValueError("Could not download Hindi dataset")

# Run analysis
df_hi_uuas = run_uuas_analysis(model, tokenizer, nlp_hi, data_hi, "text")
df_hi_uuas = df_hi_uuas.rename(columns={'UUAS': 'UUAS_Hindi'})

print("\nHindi analysis complete.")
print(df_hi_uuas.sort_values(by="UUAS_Hindi", ascending=False).head(5).to_string(index=False))


# --- 8. Synthesize and Report Findings ---
print("\n" + "─" * 80)
print("STEP 8: Synthesizing results and identifying specialists...")
print("─" * 80)

# Merge the two dataframes
df_x_uuas = pd.merge(df_en_uuas, df_hi_uuas, on=['Layer', 'Head'])

# Calculate the difference (positive = more English-aligned, negative = more Hindi-aligned)
df_x_uuas['UUAS_Diff'] = df_x_uuas['UUAS_English'] - df_x_uuas['UUAS_Hindi']
# Calculate a combined score for language-neutral heads
df_x_uuas['UUAS_Combined'] = df_x_uuas['UUAS_English'] + df_x_uuas['UUAS_Hindi']


# Finding 1: Language-Neutral Syntax Heads
print("\n--- FINDING 1: Language-Neutral Heads (High UUAS in both) ---")
print(df_x_uuas.sort_values(by='UUAS_Combined', ascending=False).head(10).to_string(index=False, float_format="%.4f"))

# Finding 2: English-Specific Syntax Heads
print("\n--- FINDING 2: English-Specific Heads (High English, Low Hindi) ---")
print(df_x_uuas.sort_values(by='UUAS_Diff', ascending=False).head(10).to_string(index=False, float_format="%.4f"))

# Finding 3: Hindi-Specific Syntax Heads
print("\n--- FINDING 3: Hindi-Specific Heads (High Hindi, Low English) ---")
print(df_x_uuas.sort_values(by='UUAS_Diff', ascending=True).head(10).to_string(index=False, float_format="%.4f"))


# --- 9. Save Final Report ---
print("\n" + "─" * 80)
print("STEP 9: Saving final merged report...")
print("─" * 80)

output_filename = "df_x_uuas.csv"
df_x_uuas.to_csv(output_filename, index=False, float_format="%.4f")
files.download(output_filename)

print(f"Successfully saved and downloaded '{output_filename}'.")
print("PHASE 7.1 COMPLETE")
print("─" * 80)

────────────────────────────────────────────────────────────────────────────────
PHASE 7.1: Multilingual Syntactic Analysis (X-UUAS)
────────────────────────────────────────────────────────────────────────────────
STEP 1: Installing all required packages...
Installing transformers...
Installing datasets...
Installing accelerate...
Installing stanza...
Installing numpy...
Installing pandas...
Installing conllu...
All packages installed successfully.

STEP 2: Importing libraries...
Libraries imported successfully.

STEP 3: Downloading Stanza models...
Downloading Stanza English ('en') model...
Downloading Stanza Hindi ('hi') model...
Stanza models downloaded successfully!

STEP 4: Defining helper functions...
All helper functions defined.

STEP 5: Loading mBERT model and tokenizer...
Using device: cuda
Model 'bert-base-multilingual-cased' loaded successfully.

────────────────────────────────────────────────────────────────────────────────
STEP 6: Running X-UUAS Analysis for English (en_

Running UUAS Analysis on en (1000 samples):   0%|          | 0/1000 [00:00<?, ?it/s]


English analysis complete.
 Layer  Head  UUAS_English
     5     8      0.684093
     6     8      0.596692
     8     4      0.593466
     6     3      0.584200
     8     6      0.558427

────────────────────────────────────────────────────────────────────────────────
STEP 7: Running X-UUAS Analysis for Hindi (hi_hdtb)...
────────────────────────────────────────────────────────────────────────────────
Downloading Hindi UD dev data...
Loaded 1000 Hindi sentences.


Running UUAS Analysis on hi (1000 samples):   0%|          | 0/1000 [00:00<?, ?it/s]


Hindi analysis complete.
 Layer  Head  UUAS_Hindi
     5     8    0.572247
     6     8    0.512755
     6     2    0.506839
     2     3    0.505891
     3     1    0.491130

────────────────────────────────────────────────────────────────────────────────
STEP 8: Synthesizing results and identifying specialists...
────────────────────────────────────────────────────────────────────────────────

--- FINDING 1: Language-Neutral Heads (High UUAS in both) ---
 Layer  Head  UUAS_English  UUAS_Hindi  UUAS_Diff  UUAS_Combined
     5     8        0.6841      0.5722     0.1118         1.2563
     6     8        0.5967      0.5128     0.0839         1.1094
     8     4        0.5935      0.4617     0.1318         1.0552
     6     3        0.5842      0.4214     0.1628         1.0056
     2     3        0.4960      0.5059    -0.0099         1.0019
     8     6        0.5584      0.4405     0.1179         0.9989
     3     1        0.4723      0.4911    -0.0189         0.9634
     6     2      

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Successfully saved and downloaded 'df_x_uuas.csv'.
PHASE 7.1 COMPLETE
────────────────────────────────────────────────────────────────────────────────


In [6]:
# =============================================================================
# SCRIPT: PHASE 7.2 - Code-Switching Analysis (Hinglish)
#
# DESCRIPTION:
# This script analyzes attention heads in mBERT on Hinglish code-switched sentences
# to identify "Code-Switch Heads" that pay high attention specifically to language
# boundaries, as described in the planOfAction.pdf.
# =============================================================================

print("─" * 80)
print("PHASE 7.2: Code-Switching Analysis (Hinglish)")
print("─" * 80)

# --- 1. Environment Setup & Imports ---
print("STEP 1: Installing all required packages...")
import subprocess
import sys
import os

def install(package):
    """Installs a package using pip."""
    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

try:
    install("transformers")
    install("accelerate")
    install("stanza")
    install("numpy")
    install("pandas")
    install("langdetect")
    print("All packages installed successfully.")
except Exception as e:
    print(f"An error occurred during installation: {e}")
    raise e

# --- 2. Import Libraries (AFTER installation) ---
print("\nSTEP 2: Importing libraries...")
try:
    import torch
    import numpy as np
    import pandas as pd
    import stanza
    import os
    import warnings
    from tqdm.notebook import tqdm
    from google.colab import files
    from transformers import (
        BertTokenizerFast,
        BertModel
    )
    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException
    print("Libraries imported successfully.")
except ImportError as e:
    print(f"A library failed to import: {e}")
    print("Please ensure all packages were installed correctly.")
    raise e

# Filter warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# --- 3. Download Stanza Models (Optional, for potential pure sentence processing) ---
print("\nSTEP 3: Downloading Stanza models...")
try:
    print("Downloading Stanza English ('en') model...")
    stanza.download('en', verbose=False)
    print("Downloading Stanza Hindi ('hi') model...")
    stanza.download('hi', verbose=False)
    print("Stanza models downloaded successfully!")
except Exception as e:
    print(f"An error occurred during Stanza model download: {e}")

# --- 4. Define Constants and Helper Functions ---
print("\nSTEP 4: Defining helper functions...")

MODEL_NAME = 'bert-base-multilingual-cased'
SAMPLE_SIZE = 100  # Smaller sample for code-switched analysis; increase as needed

# Hardcoded sample Hinglish sentences (code-switched; Roman script for simplicity)
HINGLISH_SENTENCES = [
    "Hamaari country ke clouds is land par blessings shower karte hain.",
    "Jaldi karo guys, or we'll be late for the movie.",
    "Main office ja raha hoon, but traffic bahut hai today.",
    "Yeh book interesting lag rahi hai, but ending predictable thi.",
    "Kal party mein sab log dance kar rahe the like crazy.",
    "Mujhe coffee pasand hai, especially with extra sugar.",
    "Weather bahut hot hai outside, so AC on kar do.",
    "I am trying to learn Hindi, lekin grammar confusing hai.",
    "Weekend pe family ke saath picnic plan kar rahe hain we.",
    "This song super hit hai, everyone is singing it.",
    "Phone charge kar lo, battery low ho gayi hai.",
    "Meeting postpone ho gayi, so free time mil gaya.",
    "Food delicious tha, but portion size small tha.",
    "Friends ne surprise party arrange ki mere liye.",
    "Exam preparation kar raha hoon, but stressed feel kar raha.",
    "New job offer mila, excited hoon main.",
    "Movie theater mein popcorn khaya, overpriced tha.",
    "Gym jaata hoon daily, fitness maintain karne ke liye.",
    "Birthday gift surprise tha, loved it I.",
    "Travel plan bana rahe hain next month ke liye."
]

# Ensure we have enough samples
if len(HINGLISH_SENTENCES) < SAMPLE_SIZE:
    SAMPLE_SIZE = len(HINGLISH_SENTENCES)

# --- Helper Functions ---
def detect_language(word):
    """Detect language of a word using langdetect."""
    try:
        return detect(word)
    except LangDetectException:
        return 'und'  # undetermined

def find_code_switch_boundaries(sentence_text):
    """Find word indices where language switches occur."""
    words = sentence_text.split()
    if len(words) < 2:
        return []
    langs = [detect_language(word) for word in words]
    boundaries = []
    for i in range(len(words) - 1):
        if langs[i] != langs[i+1] and langs[i] != 'und' and langs[i+1] != 'und':
            boundaries.append((i, i+1))  # from word i to i+1
    return boundaries

def create_word_to_token_map(sentence_text, tokenizer):
    """Create a map from word indices to token indices."""
    words = sentence_text.split()
    encoding = tokenizer(sentence_text, return_offsets_mapping=True, return_tensors="pt")
    offset_mapping = encoding['offset_mapping'][0].tolist()

    word_to_tokens = [[] for _ in words]
    current_word = 0
    for token_idx, (start, end) in enumerate(offset_mapping):
        if start == 0 and end == 0:
            continue  # special tokens
        # Find which word this token belongs to
        while current_word < len(words) and (start >= len(sentence_text) or not sentence_text[start:end].startswith(words[current_word][0])):
            current_word += 1
        if current_word < len(words):
            word_to_tokens[current_word].append(token_idx)

    return word_to_tokens, encoding

def calculate_cs_attention_for_head(attention_matrix, word_to_tokens, boundaries):
    """Calculate Code-Switch Attention Score for a single head on a single sentence."""
    if not boundaries:
        return 0.0

    total_cs_att = 0.0
    num_boundaries = 0

    for from_word, to_word in boundaries:
        from_tokens = np.array(word_to_tokens[from_word])
        to_tokens = np.array(word_to_tokens[to_word])
        if len(from_tokens) == 0 or len(to_tokens) == 0:
            continue

        # Average attention from from_tokens to to_tokens
        att_from_to = attention_matrix[from_tokens[:, None], to_tokens].mean()
        total_cs_att += att_from_to
        num_boundaries += 1

    if num_boundaries == 0:
        return 0.0
    return total_cs_att / num_boundaries

def run_cs_analysis(model, tokenizer, dataset):
    """
    Runs the Code-Switch Attention analysis on mBERT for Hinglish sentences.
    """
    device = model.device
    model.eval()
    head_scores = {
        (l, h): []
        for l in range(model.config.num_hidden_layers)
        for h in range(model.config.num_attention_heads)
    }

    desc = f"Running CS Analysis on Hinglish ({len(dataset)} samples)"
    for sentence_text in tqdm(dataset, desc=desc):
        if not sentence_text: continue

        boundaries = find_code_switch_boundaries(sentence_text)
        if not boundaries: continue  # Skip if no switches detected

        word_to_tokens, encoding = create_word_to_token_map(sentence_text, tokenizer)

        model_inputs = {k: v.to(device) for k, v in encoding.items() if k != 'offset_mapping'}

        with torch.no_grad():
            outputs = model(**model_inputs)

        attentions = outputs.attentions

        for layer_idx in range(model.config.num_hidden_layers):
            for head_idx in range(model.config.num_attention_heads):
                attention_matrix = attentions[layer_idx][0, head_idx].cpu().numpy()
                cs_score = calculate_cs_attention_for_head(attention_matrix, word_to_tokens, boundaries)
                head_scores[(layer_idx, head_idx)].append(cs_score)

    # Aggregate results
    results_data = []
    for (layer, head), scores in head_scores.items():
        avg_score = np.mean(scores) if scores else 0
        results_data.append({"Layer": layer, "Head": head, "CS_Score": avg_score})

    return pd.DataFrame(results_data)

print("All helper functions defined.")

# --- 5. Load Model and Tokenizer ---
print("\nSTEP 5: Loading mBERT model and tokenizer...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME, output_attentions=True)
model.to(device)
model.eval()
print(f"Model '{MODEL_NAME}' loaded successfully.")

# --- 6. Prepare Dataset and Run Analysis ---
print("\n" + "─" * 80)
print("STEP 6: Running Code-Switch Attention Analysis on Hinglish Dataset")
print("─" * 80)

# Use hardcoded Hinglish sentences as dataset
dataset = HINGLISH_SENTENCES[:SAMPLE_SIZE]
data_hinglish = [{"text": sent} for sent in dataset]

print(f"Using {len(data_hinglish)} Hinglish sentences for analysis.")

# Run analysis
df_cs = run_cs_analysis(model, tokenizer, [d["text"] for d in data_hinglish])

print("\nHinglish CS analysis complete.")
print(df_cs.sort_values(by="CS_Score", ascending=False).head(10).to_string(index=False))

# --- 7. Optional: Baseline on Pure Sentences (to compare activity) ---
print("\n" + "─" * 80)
print("STEP 7: Baseline Inter-Token Attention on Pure English/Hindi (for comparison)")
print("─" * 80)

# Sample pure English and Hindi sentences (hardcoded)
PURE_ENGLISH = [
    "The quick brown fox jumps over the lazy dog.",
    "I am going to the market to buy some fruits.",
    "She reads books every evening after dinner.",
    "The weather is nice today for a walk.",
    "He plays football with his friends on weekends."
]

PURE_HINDI_ROMAN = [  # Romanized for mBERT
    "Main bazaar ja raha hoon phal kharidne ke liye.",
    "Vah har shaam khana khane ke baad kitabein padhti hai.",
    "Aaj mausam ghumne ke liye achha hai.",
    "Vah apne doston ke saath antakshari khelta hai.",
    "Yeh ghar bahut saaf hai."
]

pure_sentences = PURE_ENGLISH + PURE_HINDI_ROMAN
data_pure = [{"text": sent} for sent in pure_sentences[:SAMPLE_SIZE//2]]

def calculate_baseline_attention_for_head(attention_matrix, word_to_tokens, num_words):
    """Calculate average inter-word attention as baseline (no switches)."""
    if num_words < 2:
        return 0.0
    total_att = 0.0
    num_pairs = 0
    for from_word in range(num_words - 1):
        to_word = from_word + 1
        from_tokens = np.array(word_to_tokens[from_word])
        to_tokens = np.array(word_to_tokens[to_word])
        if len(from_tokens) > 0 and len(to_tokens) > 0:
            att = attention_matrix[from_tokens[:, None], to_tokens].mean()
            total_att += att
            num_pairs += 1
    if num_pairs == 0:
        return 0.0
    return total_att / num_pairs

def run_baseline_analysis(model, tokenizer, dataset):
    """Run baseline inter-token attention analysis on pure sentences."""
    device = model.device
    model.eval()
    head_scores = {
        (l, h): []
        for l in range(model.config.num_hidden_layers)
        for h in range(model.config.num_attention_heads)
    }

    desc = f"Running Baseline Analysis on Pure Sentences ({len(dataset)} samples)"
    for sentence_text in tqdm(dataset, desc=desc):
        if not sentence_text: continue

        word_to_tokens, encoding = create_word_to_token_map(sentence_text, tokenizer)
        num_words = len(word_to_tokens)

        model_inputs = {k: v.to(device) for k, v in encoding.items() if k != 'offset_mapping'}

        with torch.no_grad():
            outputs = model(**model_inputs)

        attentions = outputs.attentions

        for layer_idx in range(model.config.num_hidden_layers):
            for head_idx in range(model.config.num_attention_heads):
                attention_matrix = attentions[layer_idx][0, head_idx].cpu().numpy()
                baseline_score = calculate_baseline_attention_for_head(attention_matrix, word_to_tokens, num_words)
                head_scores[(layer_idx, head_idx)].append(baseline_score)

    # Aggregate results
    results_data = []
    for (layer, head), scores in head_scores.items():
        avg_score = np.mean(scores) if scores else 0
        results_data.append({"Layer": layer, "Head": head, "Baseline_Score": avg_score})

    return pd.DataFrame(results_data)

# Run baseline
df_baseline = run_baseline_analysis(model, tokenizer, [d["text"] for d in data_pure])

print("\nPure sentences baseline complete.")
print(df_baseline.sort_values(by="Baseline_Score", ascending=False).head(10).to_string(index=False))

# --- 8. Synthesize Results ---
print("\n" + "─" * 80)
print("STEP 8: Synthesizing results and identifying Code-Switch Heads...")
print("─" * 80)

# Merge CS and baseline
df_cs_merged = pd.merge(df_cs, df_baseline, on=['Layer', 'Head'])

# Calculate differential score: CS_Score - Baseline_Score (higher = more active at switches)
df_cs_merged['CS_Diff'] = df_cs_merged['CS_Score'] - df_cs_merged['Baseline_Score']

# Finding: Code-Switch Heads (High differential)
print("\n--- FINDING: Code-Switch Heads (High activity at language boundaries) ---")
print(df_cs_merged.sort_values(by='CS_Diff', ascending=False).head(10).to_string(index=False, float_format="%.4f"))

# --- 9. Save Final Report ---
print("\n" + "─" * 80)
print("STEP 9: Saving final report...")
print("─" * 80)

output_filename = "df_code_switch.csv"
df_cs_merged.to_csv(output_filename, index=False, float_format="%.4f")
files.download(output_filename)

print(f"Successfully saved and downloaded '{output_filename}'.")
print("PHASE 7.2 COMPLETE")
print("─" * 80)

────────────────────────────────────────────────────────────────────────────────
PHASE 7.2: Code-Switching Analysis (Hinglish)
────────────────────────────────────────────────────────────────────────────────
STEP 1: Installing all required packages...
Installing transformers...
Installing accelerate...
Installing stanza...
Installing numpy...
Installing pandas...
Installing langdetect...
All packages installed successfully.

STEP 2: Importing libraries...
Libraries imported successfully.

STEP 3: Downloading Stanza models...
Downloading Stanza English ('en') model...
Downloading Stanza Hindi ('hi') model...
Stanza models downloaded successfully!

STEP 4: Defining helper functions...
All helper functions defined.

STEP 5: Loading mBERT model and tokenizer...
Using device: cuda
Model 'bert-base-multilingual-cased' loaded successfully.

────────────────────────────────────────────────────────────────────────────────
STEP 6: Running Code-Switch Attention Analysis on Hinglish Dataset
──────

Running CS Analysis on Hinglish (20 samples):   0%|          | 0/20 [00:00<?, ?it/s]


Hinglish CS analysis complete.
 Layer  Head  CS_Score
     1     2  0.620804
     6     9  0.609222
     7    11  0.486943
     5     3  0.460247
     9     9  0.415582
    10     0  0.409599
     6     5  0.384597
     3    10  0.365830
     6    11  0.246378
     8     4  0.233766

────────────────────────────────────────────────────────────────────────────────
STEP 7: Baseline Inter-Token Attention on Pure English/Hindi (for comparison)
────────────────────────────────────────────────────────────────────────────────


Running Baseline Analysis on Pure Sentences (10 samples):   0%|          | 0/10 [00:00<?, ?it/s]


Pure sentences baseline complete.
 Layer  Head  Baseline_Score
     1     2        0.630348
     6     9        0.618346
     7    11        0.453477
     5     3        0.447300
     9     9        0.397990
    10     0        0.394337
     3    10        0.316799
     6     5        0.303643
     5     8        0.276986
     0     6        0.215859

────────────────────────────────────────────────────────────────────────────────
STEP 8: Synthesizing results and identifying Code-Switch Heads...
────────────────────────────────────────────────────────────────────────────────

--- FINDING: Code-Switch Heads (High activity at language boundaries) ---
 Layer  Head  CS_Score  Baseline_Score  CS_Diff
     6     5    0.3846          0.3036   0.0810
     6    11    0.2464          0.1777   0.0686
     3    10    0.3658          0.3168   0.0490
     8     3    0.0783          0.0371   0.0412
     8     4    0.2338          0.1978   0.0359
     4     2    0.1299          0.0945   0.0353
     7

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Successfully saved and downloaded 'df_code_switch.csv'.
PHASE 7.2 COMPLETE
────────────────────────────────────────────────────────────────────────────────
