In [1]:
import sys
sys.path.append('../..')
import os
# Setting environment variables using os.environ
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "12348"
os.environ["RAYON_NUM_THREADS"] = "48"
os.environ["HF_HOME"] = "/srv/scratch/CRUISE/Mehdi/HF"


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import os
import random
import torch
import numpy as np
import torch.optim as optim
import torch.distributed as dist
from dataclasses import fields
from tqdm import tqdm
from transformers import get_cosine_schedule_with_warmup
from peft import LoraConfig
from lit.configs.train_config import train_config
from lit.configs.peft_config import lora_config
from lit.utils.ToM_reading_utils import get_NegotiationToM_text
from lit.utils.infra_utils import (
    get_logger, get_ema, update_ema, update_config,
    get_tokenizer, get_model
)
from lit.utils.activation_utils import latent_qa

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

target_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


Unsloth currently does not support multi GPU setups - but we are working on it!
Multiple CUDA devices detected but we require a single device.
We will override CUDA_VISIBLE_DEVICES to first device: GPU-416d5a5f-1d6c-05e5-bc7b-c89820704bc6.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: NVIDIA L40S. Max memory: 44.422 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
kwargs = {}
# Distributed Training Setup
dist.init_process_group("nccl")
assert torch.cuda.is_available()
args = train_config()
update_config(args, **kwargs)

fsdp_args = None
if args.use_fsdp:
    from lit.configs.fsdp_config import fsdp_config
    fsdp_args = fsdp_config()
    update_config(fsdp_args, **kwargs)
print(fsdp_args)

rank = dist.get_rank()
device = rank % torch.cuda.device_count()
torch.cuda.set_device(device)
torch.cuda.empty_cache()
seed = args.seed * dist.get_world_size() + rank
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
logger = get_logger(args, rank)

# Tokenizer and Dataset Preparation
args.target_model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
args.eval_every_n_steps = 3*334
args.num_epochs = 3
# tokenizer = get_tokenizer(args.target_model_name)

def tokenize_data(example):
    prompt = f"Question: {example['input']}\nAnswer: {example['output']}"
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=750, return_tensors="pt")

args.train_qa = "../../data/NegotiationToM/train.json"
args.eval_qa = "../../data/NegotiationToM/valid.json"
args.is_steer = False

# Training Dataset
read_prompts, QAs = get_NegotiationToM_text(args, tokenizer, True)
data = [{
    'input': str(rp+[qa[0]]),
    'output': '=> '+ qa[1]['content']
} for rp, qa in zip(read_prompts, QAs)]
dataset = Dataset.from_list(data).map(tokenize_data, batched=False)
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Evaluation Dataset
read_prompts, QAs = get_NegotiationToM_text(args, tokenizer, False)
data = [{
    'input': str(rp+[qa[0]]),
    'output': '=> '+ qa[1]['content']
} for rp, qa in zip(read_prompts, QAs)]
dataset = Dataset.from_list(data).map(tokenize_data, batched=False)
eval_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Model Initialization
lora_params = {k.name: getattr(lora_config(), k.name) for k in fields(lora_config())}
peft_config = LoraConfig(**lora_params)
# target_model = get_model(
#     args.target_model_name,
#     peft_config=peft_config,
#     fsdp_args=fsdp_args,
#     device=device,
#     rank=rank,
#     distributed_training=True,
# )
# target_model = model
torch.cuda.empty_cache()

# Exponential Moving Average
ema = get_ema(target_model, decay=args.ema_decay, device=device)

# Optimizer and Scheduler
optimizer = optim.AdamW(target_model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
training_steps = len(train_dataloader) * args.num_epochs
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=training_steps)

# Training Loop
train_steps = 0
for epoch in range(args.num_epochs):
    target_model.train()
    total_length = len(train_dataloader) // args.gradient_accumulation_steps
    pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch+1}", total=total_length, dynamic_ncols=True)
    
    for step, batch in enumerate(train_dataloader):
        inputs = {key: torch.tensor(val).squeeze(1).to(device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
        outputs = target_model(**inputs, labels=inputs["input_ids"])
        
        loss = outputs.loss / args.gradient_accumulation_steps
        loss.backward()
        train_steps += 1
        
        if train_steps % args.gradient_accumulation_steps == 0:
            if args.gradient_clipping and args.gradient_clipping_threshold > 0.0:
                torch.nn.utils.clip_grad_norm_(target_model.parameters(), args.gradient_clipping_threshold)
            optimizer.step()
            optimizer.zero_grad()
            update_ema(ema, target_model, decay=args.ema_decay)
            pbar.update(1)
        
        pbar.set_description(f"Training Epoch: {epoch+1}/{args.num_epochs}, batch {step+1}/{len(train_dataloader)} completed (loss: {loss.detach().float()})")
        
        # Periodic Evaluation
        if args.eval_ppl and train_steps % args.eval_every_n_steps == 0:
            total_loss = 0.0
            pbar = tqdm(colour="green", desc=f"Evaluating Epoch: {epoch+1}", total=len(eval_dataloader), dynamic_ncols=True)
            
            for step, batch in enumerate(eval_dataloader):
                inputs = {key: torch.tensor(val).squeeze(1).to(device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
                outputs = target_model(**inputs, labels=inputs["input_ids"])
                total_loss += outputs.loss.detach().float()
                pbar.update(1)
            print(f"The loss value of the evaluation epoch is {total_loss}")
    
    # End of epoch
    scheduler.step()
    pbar.close()

# Test Dataset
args.eval_qa = "../../data/NegotiationToM/test.json"
read_prompts, QAs = get_NegotiationToM_text(args, tokenizer, False)
data = [{
    'input': str(rp+[qa[0]]),
    'output': '=> '
} for rp, qa in zip(read_prompts, QAs)]
dataset = Dataset.from_list(data).map(tokenize_data, batched=False)
test_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
pbar = tqdm(colour="red", desc=f"Inference : ", total=len(test_dataloader), dynamic_ncols=True)

for step, batch in enumerate(test_dataloader):
    inputs = {key: torch.tensor(val).squeeze(1).to(device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
    output_ids = target_model.module.generate(inputs["input_ids"], max_new_tokens=150)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print("Generated Text:", generated_text)
    print('\nThis is the end of the current sample\n')
    pbar.update(1)



[W131 11:45:21.327004684 socket.cpp:752] [c10d] The client socket cannot be initialized to connect to [localhost]:12348 (errno: 97 - Address family not supported by protocol).
[[34m2025-01-31 11:45:21[0m] Experiment directory created at /srv/scratch/CRUISE/Mehdi/out/runs-2/085


None
Starting rank=0, seed=42, world_size=1.


Map:   0%|          | 0/1335 [00:00<?, ? examples/s]

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

Training Epoch: 1/3, batch 1002/1335 completed (loss: 5.3210954666137695):  75%|[34m███████▌  [0m| 1002/1335 [01:36<00:32, 10.39it/s]


The loss value of the evaluation epoch is 3863.80078125


Training Epoch: 1/3, batch 1335/1335 completed (loss: 3.4494495391845703): : 667it [00:46, 14.32it/s]
Training Epoch: 2/3, batch 669/1335 completed (loss: 9.436586380004883):  50%|[34m█████     [0m| 669/1335 [01:04<01:04, 10.38it/s] 


The loss value of the evaluation epoch is 3863.7998046875


Training Epoch: 2/3, batch 1335/1335 completed (loss: 10.768994331359863): : 1000it [01:18, 12.72it/s]
Training Epoch: 3/3, batch 336/1335 completed (loss: 13.304550170898438):  25%|[34m██▌       [0m| 336/1335 [00:32<01:36, 10.39it/s]


The loss value of the evaluation epoch is 3863.800048828125


Training Epoch: 3/3, batch 1335/1335 completed (loss: 12.943473815917969): : 1333it [01:51, 12.00it/s]


Map:   0%|          | 0/711 [00:00<?, ? examples/s]

Inference :   0%|[31m          [0m| 0/711 [00:00<?, ?it/s]

AttributeError: 'LlamaForCausalLM' object has no attribute 'module'

In [7]:
def tokenize_data(example):
    prompt = f"Question: {example['input']}\nAnswer: "
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=1000, return_tensors="pt")

args.eval_qa = "../../data/NegotiationToM/test.json"
read_prompts, QAs = get_NegotiationToM_text(args, tokenizer, False)
data = [{
    'input': str(rp+[qa[0]]),
    'output': str(qa[1])
} for rp, qa in zip(read_prompts, QAs)]

data[0]
print(data[2])

{'input': "[{'role': 'Agent 2', 'content': 'Hello. How are you?'}, {'role': 'Agent 1', 'content': 'I am good. I am pretty excited for the trip this weekend. what about you?'}, {'role': 'Agent 2', 'content': 'Very excited. It will be fun.'}, {'role': 'Agent 1', 'content': 'Yes, Hopefully the weather holds up. So I was thinking, i am bringning my two kids along with me, and they will need food, so I was thinking i could take the three foods, and either one of the firewood or water, up to you.'}, {'role': 'Agent 2', 'content': 'I would also like a little extra food for my kids. Maybe we can split it somehow?'}, {'role': 'Agent 1', 'content': 'Ok, I am willing to give you one food, in exchange for two firewoods, that would mean you get 3 waters, 1 food and 1 firewood. you get 5 items, while i get 4.'}, {'role': 'user', 'content': 'what is the intent of each agent for the last utterances? What are the beliefs and desires of each agent?'}]", 'output': "{'role': 'assistant', 'content': 'The i