In [1]:
# --- Imports ---
import torch
import os
import random
import json
import pickle
from tqdm import tqdm
from jinja2 import Template
import torch.nn as nn
from transformers import LlamaForCausalLM, AutoTokenizer
import numpy as np

# --- HARDCODED CONSTANTS (Llama 3.2 Updates) ---
MODEL_NAME = "meta-llama/Llama-3.2-3B"
TASK_FILE_NAME = "samanantar_hindi.json"
SHOTS = 5
MOD = "GV_trace_latest_up"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

BASE_DIR ="new_folder"

DATA_PATH_ROOT = BASE_DIR 

# --- Directory Setup ---
os.makedirs(os.path.join(BASE_DIR, 'data_token'), exist_ok=True)
os.makedirs(os.path.join(BASE_DIR, 'matrix'), exist_ok=True)

# Set Device
# Checks for CUDA and falls back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# --- Utility Functions ---

def data_construct(raw_instances, instruction, shot=SHOTS):
    """Formats raw instances and instruction into a list of messages for the chat template."""
    if isinstance(instruction, list):
        instruction = "\n".join(instruction)

    messages = [{"role": "system", "content": instruction.strip()}]
    
    if shot > 0 and raw_instances:
        num_examples = min(shot, len(raw_instances))
        few_shot_examples = raw_instances[:num_examples] 
        
        for example in few_shot_examples:
            user_content = example["input"].strip()
            messages.append({"role": "user", "content": user_content})
            
            output = example["output"]
            assistant_content = output[0].strip() if isinstance(output, list) else output.strip()
            messages.append({"role": "assistant", "content": assistant_content})

    return messages

def find_all_sublists(haystack, needle):
    """Finds all start indices of a sublist (needle) in a list (haystack)."""
    sublist_indices = []
    n = len(needle)
    for i in range(len(haystack) - n + 1):
        if haystack[i:i + n] == needle:
            sublist_indices.append(list(range(i, i + n)))
    return sublist_indices

In [4]:
# --- Llama 3.2 Marker Token IDs (No Change Needed Here) ---

sub_squence = {
    "<|start_header_id|>":[128006], 
    "<|end_header_id|>":[128007],          
    "<|eot_id|>":[128009]                  
}
sub_squence_list = [
    sub_squence["<|start_header_id|>"], 
    sub_squence["<|end_header_id|>"],
    sub_squence["<|eot_id|>"]
]

# --- Hardcoded Translation Task Instruction ---
# Since your new JSON file is a raw list of data, the task instruction must be defined here.
INSTRUCTION = "Translate the following English sentence to Hindi. Only output the Hindi translation."
# Update the TASK_FILE_NAME constant to match the uploaded file
TASK_FILE_NAME = "samanantar_hindi_10.json"
# --- Data Loading and Preparation (ADJUSTED FOR NEW FILE STRUCTURE) ---

task = TASK_FILE_NAME.split("_")[0]
data_path = os.path.join(DATA_PATH_ROOT, TASK_FILE_NAME)
print(f"Attempting to load data from: {data_path}")

try:
    with open(data_path, "r", encoding="utf-8") as f: # Added encoding for safety with Hindi
        # The JSON load result is the list of instances itself
        instance_data = json.load(f)
        
        # Reformat instances to match old 'input'/'output' structure for data_construct
        instance = []
        for item in instance_data:
            instance.append({
                # 'src' is the input for the model, 'tgt' is the label/output
                "input": item["input"]["src"], 
                "output": [item["input"]["tgt"]] 
            })

        data_number = len(instance)
        
        # Split data
        train, test = instance[:data_number//2], instance[data_number//2:]
        
        # Prepare list of messages for each train/test sample
        # Note: 'train_list' is a list where each element is the full chat history for one sample.
        train_list = [data_construct(train, INSTRUCTION, shot=SHOTS) for _ in train]
        test_list = [data_construct(test, INSTRUCTION, shot=SHOTS) for _ in test]
        train_list=[train_list[0]]
        
        
except FileNotFoundError:
    print(f"\n🚨 ERROR: Task file not found at {data_path}. \n   Please check your DATA_PATH_ROOT.")
    raise
except json.JSONDecodeError:
    print("\n🚨 ERROR: Could not decode JSON. Ensure the file is correctly formatted.")
    raise


# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=True)

Attempting to load data from: new_folder\samanantar_hindi_10.json


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
train_list

[[{'role': 'system',
   'content': 'Translate the following English sentence to Hindi. Only output the Hindi translation.'},
  {'role': 'user',
   'content': "However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles"},
  {'role': 'assistant',
   'content': 'आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।'},
  {'role': 'user',
   'content': 'Whosoever desires the reward of the world, with Allah is the reward of the world and of the Everlasting Life. Allah is the Hearer, the Seer.'},
  {'role': 'assistant',
   'content': 'और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है'},
  {'role': 'user',
   'content': 'The value of insects in the biosphere is enormous because they outnumber all other living groups in

In [6]:
# --- Tokenization and Tracing Indexing (Train Data) ---

# Re-defining the template manually in case the tokenizer was loaded without it
# This assumes the Llama 3.x template (since we are using Llama 3.2 model)
LLAMA3_CHAT_TEMPLATE = (
    "{% for message in messages %}"
        "{% if loop.index == 1 %}"
            "{{ '<|begin_of_text|>' }}"
        "{% endif %}"
        "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}"
        "{{ message['content'] | trim }}"
        "{{ '<|eot_id|>' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
        "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
    "{% endif %}"
)

# NOTE: If you haven't assigned this manually in the loading cell, do it here to prevent the error
# tokenizer.default_chat_template = LLAMA3_CHAT_TEMPLATE


train_data_token_dir = os.path.join(BASE_DIR, f'data_token/{task}')
if "trace" not in MOD:
    train_file = os.path.join(train_data_token_dir, f'train_{str(SHOTS)}.pkl')
else:
    train_file = os.path.join(train_data_token_dir, f'train_trace_{str(SHOTS)}.pkl')

if os.path.exists(train_file):
    print(f"Loading pre-tokenized train data from {train_file}")
    with open(train_file, 'rb') as f:
        data = pickle.load(f)
        train_token = data["inputs"]
        indexs = data.get("indexs", [])
else:
    print("Tokenizing train data and calculating trace indices...")
    train_token = []
    indexs = []
    
    # Ensure all required variables (SHOTS, train_list, etc.) are defined 
    # and the tokenizer template is set BEFORE this cell runs.
    
    # Check if the attribute exists; if not, use the manually defined one:
    try:
        template_str = tokenizer.default_chat_template
    except AttributeError:
        print("Using manually defined Llama 3 chat template.")
        template_str = LLAMA3_CHAT_TEMPLATE
        
    template = Template(template_str)
    
    progress_bar = tqdm(total=len(train_list), desc='Train Processing data')
    
    for message in train_list:
        progress_bar.update(1)
        
        # Preserve original code's empty BOS/EOS - NOTE: This might lead to suboptimal results
        # as Llama 3 expects the tokens, but we honor the original code's structure.
        bos_token = "" 
        eos_token = ""
        
        # Use template to render the message
        result = template.render(messages=message, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=False).replace("<spe>"," ")
        print(result)
        input_ids = tokenizer.encode(result)
        train_token.append(input_ids)
        

        if "trace" in MOD:
            track_index = []
            for seq in sub_squence_list:
                 track_index.extend(find_all_sublists(input_ids, seq))
            
            lat_list = [item for sublist in track_index for item in sublist]
            indexs.append(sorted(list(set(lat_list))))
            
    progress_bar.close()
    

Tokenizing train data and calculating trace indices...
Using manually defined Llama 3 chat template.


Train Processing data: 100%|██████████| 1/1 [00:00<00:00, 141.86it/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Translate the following English sentence to Hindi. Only output the Hindi translation.<|eot_id|><|start_header_id|>user<|end_header_id|>

However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles<|eot_id|><|start_header_id|>assistant<|end_header_id|>

आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।<|eot_id|><|start_header_id|>user<|end_header_id|>

Whosoever desires the reward of the world, with Allah is the reward of the world and of the Everlasting Life. Allah is the Hearer, the Seer.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है<|eot_id|><|start_header_id|>user<|end_header_id|>

The 




In [7]:
train_token

[[128000,
  128000,
  128006,
  9125,
  128007,
  271,
  28573,
  279,
  2768,
  6498,
  11914,
  311,
  45080,
  13,
  8442,
  2612,
  279,
  45080,
  14807,
  13,
  128009,
  128006,
  882,
  128007,
  271,
  11458,
  11,
  16056,
  288,
  11,
  889,
  574,
  70220,
  8494,
  596,
  7043,
  21296,
  3258,
  11,
  1436,
  1193,
  733,
  439,
  3117,
  439,
  279,
  8502,
  12085,
  82,
  1405,
  814,
  5675,
  311,
  31930,
  455,
  67631,
  323,
  14521,
  645,
  128009,
  128006,
  78191,
  128007,
  271,
  102393,
  79468,
  100431,
  86133,
  101385,
  100322,
  24810,
  48909,
  35470,
  84736,
  100391,
  85410,
  101276,
  92911,
  44747,
  48909,
  35470,
  69258,
  101029,
  100277,
  114868,
  44747,
  101403,
  100306,
  35470,
  100287,
  100391,
  35470,
  84736,
  101755,
  92317,
  100322,
  100497,
  44747,
  92317,
  100271,
  48909,
  100460,
  100273,
  105461,
  100924,
  106357,
  92911,
  102007,
  85410,
  44747,
  101372,
  103515,
  100411,
  101026,
  35470,


In [8]:
# Creates a new list [128001, 128001, 128001, ...]
new_list = [128001 for x in train_token[0]]

In [9]:
indexs

[[2,
  4,
  20,
  21,
  23,
  59,
  60,
  62,
  143,
  144,
  146,
  183,
  184,
  186,
  275,
  276,
  278,
  303,
  304,
  306,
  374,
  375,
  377,
  391,
  392,
  394,
  423,
  424,
  426,
  463,
  464,
  466,
  532]]

In [10]:
# --- Tokenization and Label Extraction (Test Data) ---

test_data_token_dir = os.path.join(BASE_DIR, f'data_token/{task}')
test_file = os.path.join(test_data_token_dir, f'test_{str(SHOTS)}.pkl')

# Get task instruction template (defined in the data loading cell)
# This uses the manually defined template if the tokenizer doesn't have it set.
try:
    template_str = tokenizer.default_chat_template
except AttributeError:
    # Assuming LLAMA3_CHAT_TEMPLATE was defined earlier
    template_str = LLAMA3_CHAT_TEMPLATE 

template = Template(template_str)

if os.path.exists(test_file):
    print(f"Loading pre-tokenized test data from {test_file}")
    with open(test_file, 'rb') as f:
        data = pickle.load(f)
        test_token = data["inputs"]
        labels = data["labels"]
else:
    print("Tokenizing test data and extracting labels...")
    test_token = []
    labels = []
    
    progress_bar = tqdm(total=len(test_list), desc='Test Processing data') 

    for message_full in test_list:
        progress_bar.update(1)
        
        prompt, output = message_full[:-1], message_full[-1]
        
        bos_token = ""
        eos_token = ""
        
        # Use template to render the prompt
        result = template.render(messages=prompt, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=True) # NOTE: added add_generation_prompt=True here, which is standard when preparing the prompt for generation.
        test_token.append(tokenizer.encode(result))
        labels.append(output["content"])
        
    progress_bar.close()
    
    # --- FIX: Ensure the directory exists before writing ---
    os.makedirs(test_data_token_dir, exist_ok=True)
    
    with open(test_file, 'wb') as f:
        pickle.dump({"inputs": test_token, "labels": labels}, f)
    print(f"Saved tokenized test data to {test_file}")

Loading pre-tokenized test data from new_folder\data_token/samanantar\test_5.pkl


In [11]:
# --- Model Loading and Gradient Tracing ---

print("Loading model for gradient tracing...")

# Load the model with bfloat16 and 'auto' device map
model = LlamaForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=True
)
model.train()
criterion = nn.CrossEntropyLoss(reduction="none")

# Llama 3.2 3B FFN intermediate size is 11008 (32 layers)
out_data = [[0.0] * 8192 for _ in range(28)] 
ss = 0

print("Starting gradient tracing process...")
progress_bar = tqdm(total=len(train_token), desc='Getting data')

for input_ids, index in zip(train_token, indexs):
    progress_bar.update(1)
    
    if len(input_ids) > 1300:
        ss += 1
        continue
    
    input_index = [i - 1 for i in index] 
    label_token = [input_ids[i] for i in index]

    input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)
    label_token = torch.tensor(label_token, dtype=torch.long).to(device)
    
    output = model(input_ids)
    
    # Slicing the loss calculation (28 tokens threshold from original code)
    slice1 = 28
    
    logits_slice_1 = output.logits[0, input_index[:slice1], :]
    labels_slice_1 = label_token[:slice1]
    
    logits_slice_2 = output.logits[0, input_index[slice1:], :]
    labels_slice_2 = label_token[slice1:]
    
    loss1 = criterion(logits_slice_1, labels_slice_1)
    loss2 = criterion(logits_slice_2, labels_slice_2)
    
    loss = loss1.mean() + loss2.mean()
    
    model.zero_grad()
    loss.backward()

    # Accumulate Gradients for the up_proj layer
    for name, param in model.named_parameters():
        if param.grad is not None and "up_proj" in name:
            layer = int(name.split(".")[2])
            
            grad = torch.sum(param.grad.abs(), dim=1).cpu().tolist()
            
            out_data[layer] = [a + b for a, b in zip(grad, out_data[layer])]


progress_bar.close()
print(f"Skipped {ss} samples due to length > 1300.")
print("Gradient tracing complete.")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model for gradient tracing...


Fetching 2 files: 100%|██████████| 2/2 [01:51<00:00, 55.92s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.77s/it]


Starting gradient tracing process...


Getting data: 100%|██████████| 1/1 [00:07<00:00,  7.90s/it]

Skipped 0 samples due to length > 1300.
Gradient tracing complete.





In [12]:
# --- Save Results ---

matrix_dir = os.path.join(BASE_DIR, f'matrix/{task}')
os.makedirs(matrix_dir, exist_ok=True)
output_path = os.path.join(matrix_dir, f'{MOD}.json')

with open(output_path, "w") as f:
    json.dump(out_data, f)
    
print(f"Successfully saved tracing matrix to: {output_path}")

Successfully saved tracing matrix to: new_folder\matrix/samanantar\GV_trace_latest_up.json
