NOTE: Dataset composition and processing, as well as the LoRA and quantization experiments, were originally developed in a single notebook. To run QAT and LoRA fine-tuning concurrently (for speed), we duplicated that notebook and ran QAT in one session and LoRA in the other. As a result, the printed outputs are split across the two notebooks. This file contains only the LoRA code and its outputs; the preceding imports/installs, model loading, dataset construction, etc. are omitted here because they can already be found in 'cleaned_quant_experiments.ipynb'.

### LoRA

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA settings for a causal-LM task: rank 32, alpha 32, dropout 0.1.
# No target modules are named here, so which layers receive adapters is left
# to the library's defaults.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # NOTE(review): presumably leaves the adapters trainable -- confirm
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Rebind `model` to the PEFT-wrapped model, then print how many parameters
# are trainable compared with the total.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 9,175,040 || all params: 3,221,924,864 || trainable%: 0.2848


In [None]:
import time
from torch.profiler import profile, record_function, ProfilerActivity
import numpy

# Reset the CUDA peak-memory counter and release cached allocator blocks so
# that a later peak-memory reading is not polluted by earlier work.
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()

# We originally tried profiling with the PyTorch profiler, but it appeared to
# introduce overhead that knocked the session out with an OOM error. We took it
# out and time things manually instead (as done in our homeworks and lecture).
# The attempted invocation is kept for reference:
# with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
#   with record_function("model_training"):

# Hyperparameters
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
# Not used by the loop below: the model returns its own loss when given labels.
criterion = torch.nn.CrossEntropyLoss()
num_epochs = 15
# Every batch tensor is sliced to at most this many positions along dim 1.
artificial_len = 150

# Per-epoch measurements (key = epoch index, value = seconds, or summed loss).
total_runtime = {}
t_train_time = {}
t_data_time = {}
t_epoch_losses = {}

# Put the model in training mode.
model.train()

### TRAINING LOOP ###
for epoch in range(num_epochs):

  training_time = 0
  dataloading_time = 0
  epoch_loss = 0

  # Start the whole-epoch timer. Synchronize first so pending GPU work from
  # before this point is not attributed to the epoch.
  torch.cuda.synchronize()
  start_totalruntime_timer = time.perf_counter()

  # Start the data-loading timer: it measures the wait for each batch to be
  # produced by the dataloader iterator.
  start_dataloading_timer = time.perf_counter()

  for idx, sample in enumerate(agg_dl_train):

    # The batch has arrived: stop the data-loading timer and accumulate.
    dataloading_time += time.perf_counter()-start_dataloading_timer

    # Truncate and move the batch to the device. This transfer sits between
    # the two timers, so it is counted only in the total runtime.
    inputs = sample['input_ids'][:, :artificial_len].to(device)
    mask = sample['attention_mask'][:, :artificial_len].to(device)
    # Convert straight to int64. torch.Tensor(...) would first build a float32
    # copy of the labels before casting back to long.
    labels = torch.as_tensor(sample['label'][:, :artificial_len]).to(device=device, dtype=torch.long)

    # Start the training timer (synchronize so the host-to-device copies above
    # are finished and not counted as training).
    torch.cuda.synchronize()
    start_training_timer = time.perf_counter()

    optimizer.zero_grad()

    outputs = model(inputs, attention_mask=mask, labels=labels)
    loss = outputs.loss

    loss.backward()
    optimizer.step()

    # Stop the training timer. CUDA kernels run asynchronously, so wait for
    # them to finish before reading the clock.
    torch.cuda.synchronize()
    training_time += time.perf_counter()-start_training_timer

    # Read the scalar loss once per step; each .item() call is a
    # device-to-host read, so reuse the value for logging and accumulation.
    step_loss = loss.item()

    # Progress print every 100 iterations.
    if idx%100 == 0:
      print(f"Epoch {epoch}, Iteration {idx}, Loss: ", step_loss)

    epoch_loss += step_loss

    # Restart the data-loading timer for the next batch fetch.
    start_dataloading_timer = time.perf_counter()

  # Per-epoch print. Note that epoch_loss is the SUM of the per-batch losses,
  # not their mean.
  print(f"Epoch {epoch}, Loss -- {epoch_loss}")

  # Stop the whole-epoch timer.
  torch.cuda.synchronize()
  total_runtime[epoch] = time.perf_counter()-start_totalruntime_timer

  # Record this epoch's measurements.
  t_train_time[epoch] = training_time
  t_data_time[epoch] = dataloading_time
  t_epoch_losses[epoch] = epoch_loss

# Print the collected stats.
print(f"""

total run time: {total_runtime.items()}
train time: {t_train_time.items()}
dataloading time: {t_data_time.items()}

loss/epoch: {t_epoch_losses.items()}

""")

Epoch 0, Iteration 0, Loss:  11.398344039916992
Epoch 0, Iteration 100, Loss:  11.220364570617676
Epoch 0, Iteration 200, Loss:  11.503702163696289
Epoch 0, Loss -- 2494.401261329651
Epoch 1, Iteration 0, Loss:  11.811957359313965
Epoch 1, Iteration 100, Loss:  11.175226211547852
Epoch 1, Iteration 200, Loss:  10.693321228027344
Epoch 1, Loss -- 2434.6326961517334
Epoch 2, Iteration 0, Loss:  11.06300163269043
Epoch 2, Iteration 100, Loss:  10.465377807617188
Epoch 2, Iteration 200, Loss:  10.387786865234375
Epoch 2, Loss -- 2378.03738117218
Epoch 3, Iteration 0, Loss:  10.760562896728516
Epoch 3, Iteration 100, Loss:  10.066658973693848
Epoch 3, Iteration 200, Loss:  10.007583618164062
Epoch 3, Loss -- 2291.6309175491333
Epoch 4, Iteration 0, Loss:  10.298853874206543
Epoch 4, Iteration 100, Loss:  10.112360000610352
Epoch 4, Iteration 200, Loss:  9.586567878723145
Epoch 4, Loss -- 2117.913501739502
Epoch 5, Iteration 0, Loss:  8.512096405029297
Epoch 5, Iteration 100, Loss:  8.029033

In [None]:
# Read the peak number of bytes allocated on the GPU, as tracked by
# torch.cuda.max_memory_allocated(), and report it in megabytes (bytes / 1e6).
peak = torch.cuda.max_memory_allocated()
peak_report = f"PEAK GPU MEM USAGE: {peak / 1e6:.2f} MB"
print(peak_report)

PEAK GPU MEM USAGE: 33933.41 MB


In [None]:
# Save the fine-tuned model under ./peft_ft_model.
# NOTE(review): for a PEFT-wrapped model this presumably writes only the
# adapter weights and config, not the full base model -- confirm before
# relying on this directory as a standalone checkpoint.
peft_output_dir = "./peft_ft_model"
model.save_pretrained(peft_output_dir)