In [None]:
#imports and dependencies
import torch
import torch.quantization
!pip install datasets
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
import multiprocessing
import os
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

# Setup and utilities

In [None]:
# Interactive Hugging Face Hub login: prompts for an access token.
# NOTE(review): presumably required to download the meta-llama checkpoint loaded in the next cell -- confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `hpml` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `hpml`


In [None]:
# Load the tokenizer and the full-precision base model straight from the Hub
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
# the EOS token doubles as the padding token for batching
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
#original memory usage for llama 3.2 3b
def report_state_dict_size(module, tmp_path="temp.p"):
  '''
  Serializes module.state_dict() to tmp_path, prints and returns its on-disk size in MB.

  The temporary file is always removed, even if saving or measuring fails, so a
  failed run does not leave a multi-GB file behind.
  '''
  try:
    torch.save(module.state_dict(), tmp_path)
    size_mb = os.path.getsize(tmp_path)/1e6
  finally:
    #clean up whatever was written, including a partial file from a failed save
    if os.path.exists(tmp_path):
      os.remove(tmp_path)

  print('Size (MB):', size_mb)
  return size_mb

report_state_dict_size(model); #trailing ';' keeps the returned value out of the cell output

Size (MB): 12851.09892


In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU
# (to force CPU, set device = 'cpu' here instead)
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

model.to(device)
print(device)

cuda


In [None]:
class benchmark_dataset(torch.utils.data.Dataset):
  '''
  Question/answer dataset for the dataloader.

  On construction the raw samples are filtered down to those whose answer contains
  a parseable boxed{...} ground truth and whose tokenized question and answer are
  both at most filter_n tokens long.
  '''

  #pattern used to check that the ground-truth answer can be parsed out of a solution
  _BOXED_RE = re.compile(r'\\boxed{([^}]*)}')

  def __init__(self, input, labels, tokenizer, filter_n=150):
    '''
    Stores the unprocessed data and immediately filters it.

    input: sequence of questions
    labels: sequence of answers/solutions, parallel to input
    tokenizer: callable returning a mapping with 'input_ids' (one row per text)
    filter_n: maximum allowed token length for both the question and the answer
    '''

    self.input = input
    self.labels = labels
    self.tokenizer = tokenizer

    self.filter_len(filter_n) #filters for samples whose tokenized lengths are <= the filter_n length and can be accurately parsed using our regex function

  def _token_len(self, text):
    '''number of tokens self.tokenizer produces for a single text'''
    return len(self.tokenizer(str(text), return_tensors='pt')['input_ids'][0])

  def filter_len(self, n):
    '''
    Replaces self.input / self.labels with the subset that fits the filter criteria.

    Uses the tokenizer passed to the constructor (not a notebook-level global), so the
    dataset works with whichever tokenizer it was built with.
    '''
    new_input = []
    new_label = []

    for q, a in zip(self.input, self.labels):

      #checks that we can accurately parse out the ground-truth answer from the sample
      if self._BOXED_RE.search(str(a)) is None:
        continue

      #filters for samples that fit within the specified token length limit
      if self._token_len(q) <= n and self._token_len(a) <= n:
        new_input.append(q)
        new_label.append(a)

    #the filtered subset may be empty, in which case there is no sample to show
    sample = (new_input[0], new_label[0]) if new_input else None

    #courtesy print so we can check how large the new subset is and what it looks like
    print(f"""
    Len of Original Input: {len(self.input)}
    Len of Original Labels: {len(self.labels)}
    Len of New_Input: {len(new_input)}
    Len of New_Label: {len(new_label)}

    Sample Input, Label: {sample}

    """)

    self.input = new_input
    self.labels = new_label

  def __len__(self):
    '''number of samples that survived filtering'''
    return len(self.input)

  def __getitem__(self, idx):
    '''returns the raw (untokenized) question/answer pair at idx'''
    return {"question": self.input[idx], "answer": self.labels[idx]}


In [None]:
# NOTE: this is an older function that was used when we were experimenting with composite datasets that included multiple-choice questions from benchmarks other than MATH. It is not used in the final training and inference code.
def format_for_mm(question, choices):
  '''
  Builds one multiple-choice prompt string per (question, choices) pair.

  question: iterable of question strings (surrounding whitespace is stripped)
  choices: iterable of choice collections, parallel to question
  Returns a list of prompt strings; pairing stops at the shorter of the two inputs.
  '''
  prompts = []
  for q_text, choice_set in zip(question, choices):
    stem = q_text.strip()
    prompts.append(f"""Choose the choice that best answer the following question:
  Question:
  {stem}
  Choices:
  {choice_set}
  """)
  return prompts

In [None]:
#NOTE: the below is a function that was used during an older iteration of experiments with composite datasets to better understand the distribution of sample token lengths
#this was especially important because we wished to have a fairly balanced distribution of problem content but also needed to be able to fit samples onto GPU memory with the model for 
#fine-tuning runs with QAT and LoRA. 

# #to determine the max length of each sample token sequence (below), i'm going to take a quick look
# #at the distributions
# import matplotlib.pyplot as plt

# def plot_data_dist(ds, input=True):
#   '''
#   plot the token length distribution of inputs in a dataset to understand where to truncate
#   bc for qat, i dont have enough memory to feed the entire input
#   esp. in a left skew case, which most of these datasets entail, makes sense to cut off the long right tail
#   '''
#   tk_len = []
#   if input:
#     for sample in ds:
#       tk_len.append(len(tokenizer(str(sample['question']), return_tensors='pt')['input_ids'][0]))

#     plt.hist(tk_len)
#     plt.show()

#   else:
#     for sample in ds:
#       tk_len.append(len(tokenizer(str(sample['answer']), return_tensors='pt')['input_ids'][0]))

#     plt.hist(tk_len)
#     plt.show()

#   return tk_len #return tk_len as courtesy for further examination if needed

# math_tklen = plot_data_dist(math_train)
# gpqa_tklen = plot_data_dist(gpqa)
# mmlu_tklen = plot_data_dist(mmlu_train)

# math_tklen = plot_data_dist(math_train, input=False)
# gpqa_tklen = plot_data_dist(gpqa, input=False)
# mmlu_tklen = plot_data_dist(mmlu_train, input=False)

In [None]:
def collate_fn_qat(batch, tok=None, max_length=150):
    '''
    Collator used by the dataloader to turn a list of samples into padded tensors.

    batch: list of dicts with 'question' and 'answer' entries
    tok: tokenizer to use; defaults to the notebook-level `tokenizer`
    max_length: length every question/answer is padded or truncated to

    Returns a dict with 'input_ids' and 'attention_mask' for the questions and
    'label' holding the answer token ids, where padded positions are set to -100
    so they are ignored by the loss instead of being trained on.
    '''
    if tok is None:
        tok = tokenizer #fall back to the tokenizer defined at notebook level

    #str() on both sides so non-string questions/answers are handled the same way
    questions = [str(i['question']) for i in batch]
    answers = [str(i['answer']) for i in batch]

    encode_kwargs = dict(return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    inputs = tok(questions, **encode_kwargs)
    labels = tok(answers, **encode_kwargs)

    #mask via the attention mask rather than the pad id: pad_token is set to eos_token
    #in this notebook, so masking by id would also hide genuine EOS tokens
    label_ids = labels['input_ids'].masked_fill(labels['attention_mask'] == 0, -100)

    #NOTE(review): input_ids come from the question while the labels come from the answer;
    #causal-LM losses normally expect labels aligned token-for-token with input_ids -- confirm this is intended
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'label': label_ids}


In [None]:
#NOTE: the below contains code that was used during an older iteration of experiments with composite datasets. The final iteration of this project (all uncommented code) only uses the MATH benchmark.

#gpqa (for reasoning)
# gpqa_raw = load_dataset("Idavidrein/gpqa", "gpqa_diamond")
# gpqa_choices = [[a, b, c, d] for a, b, c, d in
#                 zip(gpqa_raw['train']['Correct Answer'], gpqa_raw['train']['Incorrect Answer 1'],
#                     gpqa_raw['train']['Incorrect Answer 2'], gpqa_raw['train']['Incorrect Answer 3'])]
# for choices in gpqa_choices:
#   random.shuffle(choices)

# gpqa_questions_proc = format_for_mm(gpqa_raw['train']['Question'], gpqa_choices)
# gpqa = benchmark_dataset(gpqa_questions_proc, gpqa_raw['train']['Correct Answer'])

#note: there is no test set for gpqa, so i take a subset of the train set instead (80%), leaving the other subset for testing

# gpqa_dl_train = DataLoader(gpqa[:158], batch_size=64, shuffle=True, collate_fn=collate_fn)
# gpqa_dl_test = DataLoader(gpqa[158:], batch_size=64, shuffle=True, collate_fn=collate_fn)

#math (for math)
# Loads the "all" configuration of the MATH benchmark; the 'train' and 'test' splits
# (columns 'problem' and 'solution') are used further down in this cell.
math_raw = load_dataset("lighteval/MATH", "all")
# math_train = benchmark_dataset(math_raw['train']['problem'], math_raw['train']['solution'])
# math_dl_train = DataLoader(math_train, batch_size=64, shuffle=True, collate_fn=collate_fn)

# math_test = benchmark_dataset(math_raw['test']['problem'], math_raw['test']['solution'])
# math_dl_test = DataLoader(math_test, batch_size=64, shuffle=True, collate_fn=collate_fn)

# #mmlu (for gen knowledge + reasoning)
# mmlu_raw = load_dataset("cais/mmlu", "all")
# mmlu_questions_proc_train = format_for_mm(mmlu_raw['auxiliary_train']['question'], mmlu_raw['auxiliary_train']['choices'])
# mmlu_train = benchmark_dataset(mmlu_questions_proc_train, mmlu_raw['auxiliary_train']['answer'])
# mmlu_dl_train = DataLoader(mmlu_train, batch_size=64, shuffle=True, collate_fn=collate_fn)

# mmlu_questions_proc_test = format_for_mm(mmlu_raw['test']['question'], mmlu_raw['test']['choices'])
# mmlu_test = benchmark_dataset(mmlu_questions_proc_test, mmlu_raw['test']['answer'])
# mmlu_dl_test = DataLoader(mmlu_test, batch_size=64, shuffle=True, collate_fn=collate_fn)


#master list - train
# sublist_input_train = gpqa_questions_proc[:158] + math_raw['train']['problem'] + mmlu_questions_proc_train
# sublist_answer_train = gpqa_raw['train']['Correct Answer'][:158] + math_raw['train']['solution'] + mmlu_raw['auxiliary_train']['answer']
# filtered MATH train split, served in shuffled batches of 2
agg_train_set = benchmark_dataset(
    input=math_raw['train']['problem'],
    labels=math_raw['train']['solution'],
    tokenizer=tokenizer,
)
agg_dl_train = DataLoader(
    dataset=agg_train_set,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn_qat,
)

#master list - test
# sublist_input_test = gpqa_questions_proc[158:] + math_raw['test']['problem'] + mmlu_questions_proc_test
# sublist_answer_test = gpqa_raw['train']['Correct Answer'][158:] + math_raw['test']['solution'] + mmlu_raw['test']['answer']
# filtered MATH test split, built the same way as the train loader
agg_test_set = benchmark_dataset(
    input=math_raw['test']['problem'],
    labels=math_raw['test']['solution'],
    tokenizer=tokenizer,
)
agg_dl_test = DataLoader(
    dataset=agg_test_set,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn_qat,
)



    Len of Original Input: 7500
    Len of Original Labels: 7500
    Len of New_Input: 3432
    Len of New_Label: 3432

    Sample Input, Label: ('Let \\[f(x) = \\left\\{\n\\begin{array}{cl} ax+3, &\\text{ if }x>2, \\\\\nx-5 &\\text{ if } -2 \\le x \\le 2, \\\\\n2x-b &\\text{ if } x <-2.\n\\end{array}\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper).', 'For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$. This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \\Rightarrow a=-3$. Similarly, $x-5$ and $2x-b$ must be equal when $x=-2$. Substituting, we get $-2-5=2(-2)-b$, which implies $b=3$. So $a+b=-3+3=\\boxed{0}$.')

    

    Len of Original Input: 5000
    Len of Original Labels: 5000
    Len of New_Input: 2419
    Len of New_Label: 2419

    Sample Input, Label: ('How many vertical asymptotes does

### PTQ using torchao

In [None]:
# Installs/upgrades the packages used by the PTQ section: first the released torch + torchao,
# then the pre-release (nightly cu121 index) torch/torchvision/torchao, then pre-release torchtune.
# NOTE(review): torch is already imported by the first cell, so the upgraded build presumably
# only takes effect after a kernel restart -- confirm; versions are not pinned.
!pip install --upgrade torch torchao
!pip install --pre --upgrade torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu121 
!pip install --pre --upgrade torchtune --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Collecting torchao
  Downloading torchao-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading torchao-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchao
Successfully installed torchao-0.6.1
Looking in indexes: https://download.pytorch.org/whl/nightly/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/nightly/cu121/torch-2.6.0.dev20241112%2Bcu121-cp310-cp310-linux_x86_64.whl (767.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m767.9/767.9 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting torchao
  Downloading https://download.pytorch.org/whl/nightly/cu121/torchao-0.7.0.dev20241112%2Bcu121-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
import torchao
import copy
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer

In [None]:
#subapproach #1 - using torchao directly
# only quantize_ and int8_weight_only are used in this cell; the other two imports are unused here
from torchao.quantization.quant_api import (
    quantize_,
    int8_dynamic_activation_int8_weight,
    int4_weight_only,
    int8_weight_only
)
model.to(device)
# the return value of quantize_ is not used, so `model` itself is expected to be quantized after this call
quantize_(model, int8_weight_only()) #quantize to int8 weights

#double check that we see some expected memory savings
# (serialize the state dict to a temp file, report its size in MB, then delete the file)
torch.save(model.state_dict(), "temp.p")
print('Size (MB):', os.path.getsize("temp.p")/1e6)
os.remove('temp.p')

Size (MB): 4800.333753


In [None]:
#double check that generation works
# Pass the whole encoding (input_ids + attention_mask) and an explicit pad token id so that
# generate() does not have to guess them; also follow `device` (where the model was moved)
# instead of hard-coding 'cuda', and avoid shadowing the builtin `input`.
hello_inputs = tokenizer("hello", return_tensors='pt').to(device)
tokenizer.decode(model.generate(**hello_inputs, pad_token_id=tokenizer.eos_token_id)[0]) #quick test check

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


"<|begin_of_text|>hello! I'm new to this forum and I'm excited to learn and share my knowledge with"

In [None]:
#save the model
# writes the int8-quantized model to ./ptq_int8 with safetensors serialization turned off;
# the next cell reloads it from this directory
model.save_pretrained("./ptq_int8", safe_serialization=False)

In [None]:
#double check that the model can be loaded in
model = AutoModelForCausalLM.from_pretrained("./ptq_int8", device_map="cuda")
model = torch.compile(model, mode="max-autotune")

#and fits the expected memory requirements
torch.save(model.state_dict(), "temp.p")
print('Size (MB):', os.path.getsize("temp.p")/1e6)
os.remove('temp.p')

#double check generation works
# Pass the whole encoding (input_ids + attention_mask) and an explicit pad token id so that
# generate() does not have to guess them; also avoids shadowing the builtin `input`.
hello_inputs = tokenizer("hello", return_tensors='pt').to('cuda')
tokenizer.decode(model.generate(**hello_inputs, pad_token_id=tokenizer.eos_token_id)[0]) #quick test check

Size (MB): 4404.79734


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<|begin_of_text|>hello everyone, welcome back to my channel. today i want to talk about the importance of self'

In [None]:
#subapproach #2 - for int4 weights (still torchao, just a slightly diff way of doing it)

# quantization is requested at load time through the TorchAoConfig passed to from_pretrained
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
quantized_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct", device_map="auto", quantization_config=quantization_config)

# apply torchao's recommended inductor settings before compiling
torchao.quantization.utils.recommended_inductor_config_setter()
quantized_model = torch.compile(quantized_model, mode="max-autotune")

# save the quantized model to ./quant_int4 (safetensors serialization turned off)
quantized_model.save_pretrained("./quant_int4", safe_serialization=False)

# reload from disk to confirm the saved checkpoint is usable, then compile the reloaded copy
loaded_quantized_model = AutoModelForCausalLM.from_pretrained("./quant_int4", device_map="cuda")

loaded_quantized_model = torch.compile(loaded_quantized_model, mode="max-autotune")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
#double check that we see some expected memory savings
# (serialize the reloaded int4 model's state dict to a temp file, report its size in MB, then delete the file)
torch.save(loaded_quantized_model.state_dict(), "temp.p")
print('Size (MB):', os.path.getsize("temp.p")/1e6)
os.remove('temp.p')

Size (MB): 2285.908892


In [None]:
#double check that generation works
# Pass the whole encoding (input_ids + attention_mask) and an explicit pad token id so that
# generate() does not have to guess them; also avoids shadowing the builtin `input`.
hello_inputs = tokenizer("hello", return_tensors='pt').to('cuda')
tokenizer.decode(loaded_quantized_model.generate(**hello_inputs, pad_token_id=tokenizer.eos_token_id)[0]) #quick test check

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<|begin_of_text|>hello = "world"\n\ndef hello_world():\n    global hello\n    print(hello)\n\nhello'

### QAT with torchao

In [None]:
# torchao and torchtune provide the QAT quantizer imported below (torchtune.training.quantization); versions are not pinned
!pip install torchao torchtune #if ptq section was run earlier, no need to pip install again

Collecting torchao
  Downloading torchao-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting torchtune
  Downloading torchtune-0.4.0-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken (from torchtune)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting blobfile>=2 (from torchtune)
  Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)
Collecting omegaconf (from torchtune)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pycryptodomex>=3.8 (from blobfile>=2->torchtune)
  Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting antlr4-python3-runtime==4.9.* (from omegaconf->torchtune)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m


In [None]:
#IF PTQ was run before, need a fresh unquant instruct base model again -- otherwise, skip

# tokenizer first, with EOS reused as the padding token
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.pad_token = tokenizer.eos_token

# then a fresh copy of the base model
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torchao
from torchtune.training.quantization import Int8DynActInt4WeightQATQuantizer
import time
#we originally tried profiling with Pytorch profiler, but it appears to introduce overhead that knocked the session out with an OOM error. As such, we took it out and profiled the way that was done in our homeworks and lecture.
# from torch.profiler import profile, record_function, ProfilerActivity 

#clean the slate in prep for memory profiling later
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()

# model.to('cuda') #only needed if model was not moved to cuda earlier

#prep for quantization: the quantizer returns the model that is trained below and later passed to convert()
qat_quantizer = Int8DynActInt4WeightQATQuantizer()
model = qat_quantizer.prepare(model)

#hyperparams
optimizer = torch.optim.SGD(model.parameters(), lr=0.00001) #im keeping the lr small bc this is for fine-tuning
criterion = torch.nn.CrossEntropyLoss() #NOTE: not used below -- the loss is taken from outputs.loss
num_epochs = 20
artificial_len = 150 #sequences are cut to this many tokens before being moved to the device

#times to track (key=epoch, value=time in sec)
total_runtime = {}
t_train_time = {}
t_data_time = {}
t_epoch_losses = {} #key=epoch, value=SUM of the per-batch losses (not the mean)

#training mode
model.train()

###TRAINING LOOP ###
for epoch in range(num_epochs):

  training_time = 0
  dataloading_time = 0
  epoch_loss = 0

  ### START TIMING FOR TOTAL_RUNTIME
  torch.cuda.synchronize()
  start_totalruntime_timer = time.perf_counter()

  ### START DATALOADING TIME
  start_dataloading_timer = time.perf_counter()

  for idx, sample in enumerate(agg_dl_train):

    ### END DATALOADING timer and accumulate total
    dataloading_time += time.perf_counter()-start_dataloading_timer

    inputs = sample['input_ids'][:, :artificial_len].to(device)
    mask = sample['attention_mask'][:, :artificial_len].to(device)
    #keep the token ids as integers the whole way: the legacy torch.Tensor(...) constructor
    #would cast them to float32 first, which is wasteful and can lose precision for large ids
    labels = sample['label'][:, :artificial_len].to(device).long()

    ### START TIMING FOR TRAINING TIME
    #synchronize so that pending GPU work (e.g. the host-to-device copies above) is not counted as training time
    torch.cuda.synchronize()
    start_training_timer = time.perf_counter()
    ###

    optimizer.zero_grad()

    outputs = model(inputs, attention_mask=mask, labels=labels)
    loss = outputs.loss

    loss.backward()
    optimizer.step()

    ### END TIMING FOR TRAINING TIME
    torch.cuda.synchronize()
    training_time += time.perf_counter()-start_training_timer

    #read the scalar loss once and reuse it for both logging and accumulation
    loss_value = loss.item()

    #print for myself
    if idx%100 == 0:
      print(f"Epoch {epoch}, Iteration {idx}, Loss: ", loss_value)

    epoch_loss += loss_value

    #start dataloading timer again for the next batch load
    start_dataloading_timer = time.perf_counter()

  #print for myself (after each epoch)
  print(f"Epoch {epoch}, Loss -- {epoch_loss}")

  #END TIMING FOR TOTAL RUNTIME
  torch.cuda.synchronize()
  total_runtime[epoch] = time.perf_counter()-start_totalruntime_timer

  #log other times
  t_train_time[epoch] = training_time
  t_data_time[epoch] = dataloading_time
  t_epoch_losses[epoch] = epoch_loss

peak = torch.cuda.max_memory_allocated() #get the max amount of memory used during the period

#print stats
print(f"""

total run time: {total_runtime.items()}
train time: {t_train_time.items()}
dataloading time: {t_data_time.items()}

loss/epoch: {t_epoch_losses.items()}

""")

print(f"PEAK GPU MEM USAGE: {peak / 1e6:.2f} MB")


Epoch 0, Iteration 0, Loss:  11.262016296386719
Epoch 0, Iteration 100, Loss:  5.651309967041016
Epoch 0, Iteration 200, Loss:  6.902235507965088
Epoch 0, Iteration 300, Loss:  5.925531387329102
Epoch 0, Iteration 400, Loss:  2.8556277751922607
Epoch 0, Iteration 500, Loss:  3.9058032035827637
Epoch 0, Iteration 600, Loss:  3.9291083812713623
Epoch 0, Iteration 700, Loss:  3.3602418899536133
Epoch 0, Iteration 800, Loss:  2.896789789199829
Epoch 0, Iteration 900, Loss:  3.2548961639404297
Epoch 0, Iteration 1000, Loss:  3.305786371231079
Epoch 0, Iteration 1100, Loss:  4.892472267150879
Epoch 0, Iteration 1200, Loss:  2.6425411701202393
Epoch 0, Iteration 1300, Loss:  4.221468925476074
Epoch 0, Iteration 1400, Loss:  3.5106353759765625
Epoch 0, Iteration 1500, Loss:  3.9415643215179443
Epoch 0, Iteration 1600, Loss:  3.254051685333252
Epoch 0, Iteration 1700, Loss:  4.356900215148926
Epoch 0, Loss -- 7277.931604385376
Epoch 1, Iteration 0, Loss:  3.4747531414031982
Epoch 1, Iteration 1

In [None]:
#convert to quantized
# hands the QAT-trained model back to the same quantizer that prepared it; `model` is rebound to the result
model = qat_quantizer.convert(model)

In [None]:
#save the model
# NOTE(review): the directory is named "qat_int8" although the quantizer used above is
# Int8DynActInt4WeightQATQuantizer (int8 activations / int4 weights, going by its name) -- confirm the name is intended
model.save_pretrained("./qat_int8", safe_serialization=False)

### LoRA

NOTE: The dataset composition and processing, as well as the LoRA and quantization models, were developed and experimented with in the same notebook. To run QAT and LoRA fine-tuning concurrently (for speed), we duplicated the notebook and ran QAT in one session and LoRA in the other.

As a result, the printed outputs are split across the two notebooks; the LoRA code and its outputs are in the 'lora_snippet.ipynb' file (same directory as this file).