You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hi @subramen, I'm trying to perturb some dimensions of the parameter values in some layers (e.g. all parameters of layers 0-5), freeze them, and then do full-parameter fine-tuning.
I ran into 2 puzzling problems. The first is that after loading the pre-trained model, I print all of model.named_parameters and each param's size, and every size is shown as torch.Size([0]), while after training starts, I print them again and find, as shown below, that only input_layernorm.weight and post_attention_layernorm.weight are updated in each layer during fine-tuning, but parameters such as self_attn q, k, v, o are not. Is this behavior normal?
(P.S. I tried both Llama and Llama 2 and got the same results.)
Also, the code I used is shown at the bottom. Thanks in advance.
# Set up experiment tracking: metrics are reported to Weights & Biases
# through Accelerate's tracker interface.
accelerator = Accelerator(log_with="wandb")
hps = {"learning_rate": args.learning_rate}
# Attach the hyper-parameters to the tracked run; `hps` was previously built
# but never passed anywhere.
accelerator.init_trackers(args.wandb_name, config=hps)
set_random_seed(args.seed)

# NOTE(review): `fast_tokenizer=True` is kept exactly as in the original call.
# Confirm that LlamaTokenizer honours it -- the switch usually documented for
# `from_pretrained` is `use_fast`, on AutoTokenizer.
tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path,
                                           fast_tokenizer=True)
# Use token id 0 as the padding token and pad on the left.
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Load model
model = LlamaForCausalLM.from_pretrained(args.model_name_or_path)
# Freeze a single named parameter so that it keeps its loaded value while the
# rest of the model is fine-tuned.
layer_name = "model.layers.11.input_layernorm.weight"

# Look the parameter up once by name instead of scanning every parameter.
target_param = dict(model.named_parameters()).get(layer_name)
if target_param is None:
    # Distinguish "name does not exist" from "exists but is still trainable".
    raise ValueError(f"Parameter {layer_name} not found in model")
target_param.requires_grad = False

# Re-read the flag through the model to confirm the freeze took effect.
# An explicit raise is used because `assert` is stripped under `python -O`.
frozen_names = {
    name for name, param in model.named_parameters() if not param.requires_grad
}
if layer_name not in frozen_names:
    raise RuntimeError(f"Parameter {layer_name} is not frozen")
# Build the datasets with the main process going first.
# NOTE(review): presumably so that other ranks can reuse whatever
# `create_dataset` writes under `args.data_output_path` -- confirm.
with accelerator.main_process_first():
    train_dataset, eval_dataset = create_dataset(
        args.local_rank,  # invalid (original author's remark; meaning not shown here)
        args.data_output_path,
        args.seed,
        args.model_name_or_path,
        args.max_seq_len,
    )
accelerator.wait_for_everyone()

# Both loaders use the same collation settings, so build the collator once.
data_collator = DataCollatorForSeq2Seq(tokenizer,
                                       pad_to_multiple_of=8,
                                       return_tensors="pt",
                                       padding=True)
train_dataloader = DataLoader(train_dataset,
                              collate_fn=data_collator,
                              batch_size=args.per_device_train_batch_size)
eval_dataloader = DataLoader(eval_dataset,
                             collate_fn=data_collator,
                             batch_size=args.per_device_eval_batch_size)
# Number of batches in the train loader, then in the eval loader.
print(f'length of en_train: {len(train_dataloader)}\n{len(eval_dataloader)}')
# Optimizer
# Split the trainable weights into two groups: one with weight decay and one
# without (biases and normalisation weights).
#
# "norm.weight" is needed because the Llama norm parameters are named
# `...input_layernorm.weight` / `...post_attention_layernorm.weight`; the
# case-sensitive "LayerNorm.weight" pattern alone never matches them, which
# silently applied weight decay to the norm weights. "LayerNorm.weight" is
# kept so models using that naming behave as before.
no_decay = ["bias", "LayerNorm.weight", "norm.weight"]


def _skips_decay(param_name):
    """Return True when `param_name` matches one of the `no_decay` patterns."""
    return any(pattern in param_name for pattern in no_decay)


# Only hand trainable parameters to the optimizer; frozen ones
# (requires_grad=False) must not be updated.
trainable_named_params = [
    (n, p) for n, p in model.named_parameters() if p.requires_grad
]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in trainable_named_params if not _skips_decay(n)],
        "weight_decay": args.weight_decay,
    },
    {
        "params": [p for n, p in trainable_named_params if _skips_decay(n)],
        "weight_decay": 0.0,
    },
]

# Use a real AdamW unless the DeepSpeed config defines its own optimizer, in
# which case a DummyOptim placeholder is passed instead.
deepspeed_plugin = accelerator.state.deepspeed_plugin
if deepspeed_plugin is None or "optimizer" not in deepspeed_plugin.deepspeed_config:
    optimizer_cls = torch.optim.AdamW
else:
    optimizer_cls = DummyOptim
optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

# NOTE(review): this length is taken before `accelerator.prepare`; confirm it
# still reflects the per-process number of batches in multi-process runs.
num_update_steps_per_epoch = math.ceil(
    len(train_dataloader) / args.gradient_accumulation_steps)
lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.num_train_epochs * num_update_steps_per_epoch,
)
# Let Accelerate wrap the model, loaders, optimizer and scheduler.
model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
    model, train_dataloader, eval_dataloader, optimizer, lr_scheduler)

# Train!
# NOTE(review): the original paste lost all indentation; the nesting below
# (per-step logging inside the batch loop, evaluation and checkpointing once
# per epoch) is reconstructed from the statements themselves -- confirm.
print_rank_0("***** Running training *****", accelerator.process_index)
for epoch in range(args.num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch, use_cache=False)
        train_loss = outputs.loss
        accelerator.backward(train_loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Log plain Python numbers in a single call so loss and learning rate
        # land on the same tracker step. `get_last_lr()` is the supported way
        # to read the most recently computed learning rate.
        accelerator.log({
            "train_loss": train_loss.item(),
            "lr": lr_scheduler.get_last_lr()[0],
        })

        if step % 400 == 0:
            print_rank_0(f"Epoch is {epoch}, Step is {step}, train_loss is {train_loss.item()}", accelerator.process_index)
            # Debug dump of every parameter name and its size.
            # NOTE(review): if DeepSpeed ZeRO stage 3 is in use, parameters
            # are partitioned across ranks and may report torch.Size([0])
            # here -- confirm against the DeepSpeed config.
            for name, param in model.named_parameters():
                print(name, param.size())
            print_trainable_parameters(model)

    # End of epoch: evaluate and report.
    ppl, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
    accelerator.log({"eval_loss": eval_loss})
    if accelerator.is_main_process:
        print_rank_0(f"eval_loss: {eval_loss}, ppl: {ppl}", accelerator.process_index)

    # Save one checkpoint directory per epoch, tagged with the eval loss.
    if args.output_dir is not None:
        epoch_output_dir = os.path.join(args.output_dir, f"epoch_{epoch}_eval_loss_{eval_loss:.4f}")
        os.makedirs(epoch_output_dir, exist_ok=True)
        # Synchronise all processes before gathering the state dict.
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            epoch_output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save,
            state_dict=accelerator.get_state_dict(model),
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(epoch_output_dir)

accelerator.end_training()
# Script entry point. `main` is not defined in this excerpt; it is expected to
# wrap the training code above.
if __name__ == "__main__":
    main()
The text was updated successfully, but these errors were encountered:
Hi @subramen, I'm trying to perturb some dimensions of the parameter values in some layers (e.g. all parameters of layers 0-5), freeze them, and then do full-parameter fine-tuning.
I ran into 2 puzzling problems. The first is that after loading the pre-trained model, I print all of `model.named_parameters` and each `param.size`, and every size is shown as torch.Size([0]), while after training starts, I print them again and find, as shown below, that only `input_layernorm.weight` and `post_attention_layernorm.weight` are updated in each layer during fine-tuning, but parameters such as `self_attn q, k, v, o` are not. Is this behavior normal?
(P.S. I tried both Llama and Llama 2 and got the same results.)
Also, the code I used is shown at the bottom. Thanks in advance.
`model.named_parameters` and `param.size`:
My code for fine-tuning:
The text was updated successfully, but these errors were encountered: