Parameters perturb and full finetuning #1057

Open
waterluck opened this issue Mar 5, 2024 · 0 comments

waterluck commented Mar 5, 2024

Hi @subramen, I'm trying to perturb some dimensions of the parameter values in a few layers (e.g., all parameters in layers 0-5) and freeze them, then do a full-parameter fine-tuning.
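
Concretely, the perturb-and-freeze step I have in mind looks roughly like this (a rough sketch only; the 0.01 noise scale is illustrative, and it assumes the weights are still fully materialized, i.e. not yet partitioned by ZeRO-3):

    import torch

    # Add small Gaussian noise to every parameter in layers 0-5, then freeze them.
    # The trailing dot keeps "model.layers.1." from also matching layers 10-15.
    frozen_prefixes = tuple(f"model.layers.{i}." for i in range(6))

    with torch.no_grad():
        for name, param in model.named_parameters():
            if name.startswith(frozen_prefixes):
                param.add_(torch.randn_like(param) * 0.01)  # perturb in place
                param.requires_grad_(False)                 # freeze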

I ran into two puzzling problems. The first is that after loading the pre-trained model, I printed all of model.named_parameters() along with each parameter's size, and every entry showed torch.Size([0]). Then, after training starts, I printed them again and found, as shown below, that only input_layernorm.weight and post_attention_layernorm.weight in each layer update during fine-tuning, while parameters such as self_attn q_proj, k_proj, v_proj, and o_proj do not. Is this behavior normal?
(P.S. I tried this on both LLaMA and Llama 2 and got the same results.)

Also, the code I used is shown at the bottom. Thanks in advance.

model.named_parameters() and parameter sizes:

20315 module.model.layers.9.self_attn.q_proj.weight torch.Size([0])
20316 module.model.layers.9.self_attn.k_proj.weight torch.Size([0])
20317 module.model.layers.9.self_attn.v_proj.weight torch.Size([0])
20318 module.model.layers.9.self_attn.o_proj.weight torch.Size([0])
20319 module.model.layers.9.mlp.gate_proj.weight torch.Size([0])
20320 module.model.layers.9.mlp.up_proj.weight torch.Size([0])
20321 module.model.layers.9.mlp.down_proj.weight torch.Size([0])
20322 module.model.layers.9.input_layernorm.weight torch.Size([5120])
20323 module.model.layers.9.post_attention_layernorm.weight torch.Size([5120])
20324 module.model.layers.10.self_attn.q_proj.weight torch.Size([0])
20325 module.model.layers.10.self_attn.k_proj.weight torch.Size([0])
20326 module.model.layers.10.self_attn.v_proj.weight torch.Size([0])
20327 module.model.layers.10.self_attn.o_proj.weight torch.Size([0])
20328 module.model.layers.10.mlp.gate_proj.weight torch.Size([0])
20329 module.model.layers.10.mlp.up_proj.weight torch.Size([0])
20330 module.model.layers.10.mlp.down_proj.weight torch.Size([0])
20331 module.model.layers.10.input_layernorm.weight torch.Size([5120])
20332 module.model.layers.10.post_attention_layernorm.weight torch.Size([5120])
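
I suspect the torch.Size([0]) entries come from DeepSpeed ZeRO Stage 3 partitioning (each rank only keeps a shard of every weight). If that is what is happening, the sketch below is how the full shapes could still be inspected; ds_shape and deepspeed.zero.GatheredParameters are ZeRO-3 facilities, and the parameter name is only an example:

    import deepspeed

    # Under ZeRO-3 each rank keeps only a shard, so param.shape prints as
    # torch.Size([0]); ds_shape records the full, unpartitioned shape.
    for name, param in model.named_parameters():
        full_shape = getattr(param, "ds_shape", param.shape)
        print(name, tuple(full_shape), "requires_grad:", param.requires_grad)

    # Temporarily gather one parameter on all ranks to look at its real values
    # (the name is just an example; it may carry a "module." prefix after wrapping).
    target = dict(model.named_parameters()).get("model.layers.11.input_layernorm.weight")
    if target is not None:
        with deepspeed.zero.GatheredParameters([target], modifier_rank=None):
            print(target.shape, target.flatten()[:5])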

My code for fine-tuning:

    accelerator = Accelerator(log_with="wandb")

    hps = {"learning_rate": args.learning_rate}
    accelerator.init_trackers(args.wandb_name)

    set_random_seed(args.seed)

    tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path,
                                               fast_tokenizer=True)
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"

    # Load model
    model = LlamaForCausalLM.from_pretrained(args.model_name_or_path)

    # Change some parameters and freeze them.
    layer_name = "model.layers.11.input_layernorm.weight"

    # Loop through all named parameters to freeze the target parameter
    for name, param in model.named_parameters():
        if name == "model.layers.11.input_layernorm.weight":
            param.requires_grad = False
            break  # Stop the loop once the target parameter is found and modified

    # to ensure the parameter is frozen
    parameter_frozen = False
    for name, param in model.named_parameters():
        if name == "model.layers.11.input_layernorm.weight":
            parameter_frozen = not param.requires_grad  # This should be True if the parameter is frozen

    # Assert to check that the parameter is indeed frozen
    assert parameter_frozen, f"Parameter {layer_name} is not frozen"

    with accelerator.main_process_first():
        train_dataset, eval_dataset = create_dataset(
            args.local_rank, # invalid
            args.data_output_path,
            args.seed,
            args.model_name_or_path,
            args.max_seq_len,
        )   

    accelerator.wait_for_everyone()

    train_dataloader = DataLoader(
        train_dataset,
        collate_fn=DataCollatorForSeq2Seq(tokenizer,
                                          pad_to_multiple_of=8,
                                          return_tensors="pt",
                                          padding=True),
        batch_size=args.per_device_train_batch_size,
    )

    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=DataCollatorForSeq2Seq(tokenizer,
                                          pad_to_multiple_of=8,
                                          return_tensors="pt",
                                          padding=True),
        batch_size=args.per_device_eval_batch_size,
    )

    print(f'length of en_train: {len(train_dataloader)}\n{len(eval_dataloader)}')

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # Adam Optimizer
    optimizer_cls = (
        torch.optim.AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )

    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.num_train_epochs * num_update_steps_per_epoch,
    )

    model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, optimizer, lr_scheduler)


    # Train!
    print_rank_0("***** Running training *****", accelerator.process_index)
    
    for epoch in range(args.num_train_epochs):
    #for epoch in range(start_epoch, args.num_train_epochs):
        current_step = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch, use_cache=False)
            train_loss = outputs.loss
            accelerator.backward(train_loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            accelerator.log({"train_loss": train_loss})
            accelerator.log({"lr": lr_scheduler.get_lr()[0]})
            if step % 400 == 0:
                print_rank_0(f"Epoch is {epoch}, Step is {step}, train_loss is {train_loss.item()}", accelerator.process_index)
            for name, param in model.named_parameters():
                print(name, param.size())
            print_trainable_parameters(model)
        
        ppl, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
        accelerator.log({"eval_loss": eval_loss})

        if accelerator.is_main_process:
            print_rank_0(f"eval_loss: {eval_loss}, ppl: {ppl}", accelerator.process_index)
     

        if args.output_dir is not None:

            epoch_output_dir = os.path.join(args.output_dir, f"epoch_{epoch}_eval_loss_{eval_loss:.4f}")
            os.makedirs(epoch_output_dir, exist_ok=True)

            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)

            unwrapped_model.save_pretrained(
                epoch_output_dir,
                #args.output_dir_huggingface,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
                state_dict=accelerator.get_state_dict(model),
            )
            if accelerator.is_main_process:
                tokenizer.save_pretrained(epoch_output_dir)
                #tokenizer.save_pretrained(args.output_dir_huggingface)
            

    accelerator.end_training()

if __name__ == "__main__":
    main()
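
One more thing I noticed while debugging: the optimizer groups above include every parameter, frozen or not, and the case-sensitive "LayerNorm.weight" pattern never matches Llama's lowercase input_layernorm / post_attention_layernorm names, so the layernorm weights likely end up with weight decay applied. Roughly what I think the filtered version should look like, plus a re-check of the freeze after accelerator.prepare() (the "layernorm" pattern is just my guess):

    # Build the optimizer groups only from trainable parameters
    # ("layernorm" is a guess at a pattern that actually matches Llama's names).
    no_decay = ["bias", "layernorm"]
    trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in trainable if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in trainable if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]

    # After accelerator.prepare() the names gain a "module." prefix under DDP/DeepSpeed,
    # so match on a substring when re-checking that the freeze survived.
    for name, param in model.named_parameters():
        if "layers.11.input_layernorm" in name:
            print(name, "requires_grad:", param.requires_grad)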
subramen self-assigned this Mar 20, 2024