In [None]:
import torch                       # PyTorch library for deep learning

from transformers import (
    AutoModelForCausalLM,          # AutoModel for language modeling tasks
    AutoTokenizer,                # AutoTokenizer for tokenization
    BitsAndBytesConfig,           # Configuration for BitsAndBytes
    HfArgumentParser,             # Argument parser for Hugging Face models
    TrainingArguments,            # Training arguments for model training
    pipeline,                     # Creating pipelines for model inference
    logging,                      # Logging information during training
)
from peft import LoraConfig, PeftModel  # Packages for parameter-efficient fine-tuning (PEFT)
from trl import SFTTrainer         # SFTTrainer for supervised fine-tuning 
import argparse
from datasets import load_from_disk, Dataset, load_dataset
from itertools import chain
from functools import partial
from transformers import AutoTokenizer

import os

# Hugging Face access token for the gated Llama-2 repository.
# It is read from the HF_TOKEN environment variable so that no credential is
# stored in the notebook; when the variable is unset (None), the libraries fall
# back to the credentials cached by `huggingface-cli login`.
access_token = os.environ.get("HF_TOKEN")
model_name = "meta-llama/Llama-2-7b-chat-hf"

# When True, the base model is loaded with the 4-bit quantization config below.
# NOTE(review): presumably this flag is meant to control quantized loading — confirm.
qlora_training = False

tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
# Reuse EOS as the padding token. No new token is added to the vocabulary, so
# every token id stays within the model's existing embedding matrix.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Resolve the dtype name to the actual torch dtype object (e.g. torch.float16)
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
device_map = "auto"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# With qlora_training=False the model is loaded unquantized, exactly as if no
# quantization_config had been supplied.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config if qlora_training else None,
    device_map=device_map,
    token=access_token,
)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

# Tunable settings for this cell
SUBSAMPLE_FRACTION = 0.1   # Fraction of the 'train' split to keep (0.1 == 10%)
SUBSAMPLE_SEED = 42        # Seed for the shuffle, so the subsample is reproducible
MAX_SEQ_LENGTH = 512       # Every document is truncated/padded to this many tokens

# Load the dataset
ds = load_dataset("wikipedia", "20220301.simple")

# Calculate the size of the subsample (10% of the 'train' split)
subsample_size = int(SUBSAMPLE_FRACTION * len(ds['train']))

# Create a random subsample of the dataset
subsample = ds['train'].shuffle(seed=SUBSAMPLE_SEED).select(range(subsample_size))

# Load tokenizer for your model. The repository is gated, so the access token
# is passed explicitly, and the padding settings mirror the ones chosen when
# the tokenizer was first configured.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# Tokenize the subsample
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Each text is truncated and padded to exactly MAX_SEQ_LENGTH tokens so that
    all sequences have the same length.

    Args:
        examples: A batch from the dataset, indexable by the 'text' key.

    Returns:
        The tokenizer output for the batch of texts.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=MAX_SEQ_LENGTH,
    )


tokenized_docs = subsample.map(tokenize_function, batched=True)

In [None]:
# Sanity check on the first tokenized document: show which fields it carries
# (should include 'input_ids' and potentially 'attention_mask') followed by its
# first 10 token IDs.
example = tokenized_docs[0]
print(example.keys(), example['input_ids'][:10], sep="\n")

In [None]:
from torch.utils.data import DataLoader

def select_model_inputs(batch):
    """Return a dict holding only the 'input_ids' and 'attention_mask' entries of `batch`."""
    wanted_fields = ("input_ids", "attention_mask")
    return {field: batch[field] for field in wanted_fields}

# Keep only the fields the model consumes. `Dataset.map` merges the returned
# columns into the existing ones, so the original columns have to be dropped
# explicitly with `remove_columns`; columns returned by the function are kept
# even when they are also listed there.
model_inputs = tokenized_docs.map(
    select_model_inputs,
    batched=True,
    remove_columns=tokenized_docs.column_names,
)

# Manually collate a batch
def manual_collate_fn(batch):
    """Turn a list of per-example dicts into a dict of int64 tensors.

    Args:
        batch: Sequence of items, each indexable by 'input_ids' and
            'attention_mask'.

    Returns:
        Dict with keys 'input_ids' and 'attention_mask'; each value is a
        torch.long tensor built from the corresponding field of every item.
    """
    collated = {}
    for field in ('input_ids', 'attention_mask'):
        rows = [sample[field] for sample in batch]
        collated[field] = torch.tensor(rows, dtype=torch.long)
    return collated

# Batch the tokenized documents 16 at a time, using the manual collate
# function to turn each list of examples into tensors.
dataloader = DataLoader(
    dataset=model_inputs,
    batch_size=16,
    collate_fn=manual_collate_fn,
)

For each batch, the model performs a forward pass, receiving a batch of tokenized input sequences (input_ids) and the same sequences as labels (labels). In the context of causal language modeling (assuming the use of a model like GPT or similar from the Transformers library), the model attempts to predict each token in the sequence given the tokens that precede it.

**Loss:** The model's output includes logits, which are the raw, unnormalized scores for each token in the vocabulary, for each position in the input sequence. When labels are provided (as they are here, with `labels=batch["input_ids"]`), the model also calculates the loss. This loss is typically the cross-entropy between the logits (predictions) and the provided labels (targets), averaged over all predicted token positions in the batch. In simpler terms, the loss measures how well the model's predictions match the actual next tokens in the input sequences.

- The cross-entropy loss is calculated for each token position in each sequence, comparing the model's prediction (the probability distribution over all possible tokens) with the actual token (represented as a one-hot encoded vector). The loss is high if the model assigns low probability to the actual token and low if the model's predictions are accurate.
- The labels can be the same as the input IDs because the model shifts them by one position internally: the prediction made at position $t$ is scored against the token at position $t+1$. The loss therefore measures how well the model can continue the sequence at each step.

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import pickle

model.eval()  # Set the model to evaluation mode

CHECKPOINT_EVERY = 2                         # Write partial results every N batches
PER_TOKEN_LOSS_PATH = 'losses_pertoken.pkl'  # Pickle file receiving the flat loss array


def save_per_token_losses(chunks, path=PER_TOKEN_LOSS_PATH):
    """Merge per-batch loss arrays, pickle the result to `path`, and return it.

    Args:
        chunks: List of 1-D numpy arrays, one per processed batch.
        path: Destination file for the pickled array.

    Returns:
        A flat float64 numpy array with all losses (empty if `chunks` is empty).
    """
    merged = np.concatenate(chunks).astype(np.float64) if chunks else np.array([])
    with open(path, 'wb') as file:
        pickle.dump(merged, file)
    return merged


loss_chunks = []        # One 1-D array of per-token losses per batch (merged once, not per step)
losses = np.array([])   # Flat array of every per-token loss collected so far
num_batches = len(dataloader)

# Assuming 'dataloader' is your DataLoader instance
for batch_idx, batch in enumerate(dataloader, start=1):
    try:
        with torch.no_grad():  # Disable gradient calculation
            # Inputs must live on the same device as the model's first layer.
            input_ids = batch["input_ids"].to(model.device)
            attention_mask = batch["attention_mask"].to(model.device)

            # No `labels` are passed: the built-in mean loss is not needed here,
            # only the logits of shape [batch_size, seq_length, vocab_size].
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # A causal LM predicts token t+1 from position t, so the logits are
            # aligned with the *next* token: drop the last position's logits and
            # the first token of the labels.
            shift_logits = logits[:, :-1, :].float()
            shift_labels = input_ids[:, 1:]

            # reduction='none' keeps one loss value per predicted token;
            # the result is reshaped to [batch_size, seq_length - 1].
            per_token_loss = F.cross_entropy(
                shift_logits.reshape(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
                reduction='none',
            ).view_as(shift_labels)

            # Keep only losses whose target is a real token, not padding.
            is_real_target = attention_mask[:, 1:].bool()
            loss_chunks.append(per_token_loss[is_real_target].cpu().numpy())
    except Exception as e:
        # Best effort: report the failure and keep whatever was collected so far.
        print(f"An error occurred: {e}")
        break

    if batch_idx % CHECKPOINT_EVERY == 0:
        losses = save_per_token_losses(loss_chunks)
        print('{:.1f}% of iterations done'.format(100 * batch_idx / num_batches))

# Final save so the batches after the last checkpoint are not lost.
losses = save_per_token_losses(loss_chunks)

# Further processing on `losses` as needed...


In [None]:
# Display the length reported by the DataLoader (its number of batches).
len(dataloader)

In [None]:
# Record the mean language-modeling loss of every batch.
# NOTE: this rebinds `losses` to a plain list of floats (one entry per batch).
model.eval()
losses = []
k = 0  # Number of batches processed so far
for batch in dataloader:
    try:
        with torch.no_grad():  # Disable gradient calculation
            # Inputs must live on the same device as the model's first layer.
            input_ids = batch["input_ids"].to(model.device)
            attention_mask = batch["attention_mask"].to(model.device)

            # Padding positions get the label -100, which the loss ignores, so
            # the batch loss is averaged over real tokens only. The model shifts
            # the labels internally to score next-token predictions.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_loss = outputs.loss.item()

        losses.append(batch_loss)
        if k % 10 == 0:
            print(f"Batch loss: {batch_loss}")
            with open('losses.pkl', 'wb') as file:
                # Use pickle.dump() to write the list to the file
                pickle.dump(losses, file)
        k += 1
    except Exception as e:
        # Best effort: report the failure and keep whatever was collected so far.
        print(f"An error occurred: {e}")
        break

# Final save so the batches after the last periodic checkpoint are not lost.
with open('losses.pkl', 'wb') as file:
    pickle.dump(losses, file)
