In [14]:
# this notebook is used to test time performance of inference with different settings (prompt engineering, batch size)
# also the accuracy performance of fine-tuning 0-499 will be tested here.

In [1]:
# load the model
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [1]:
# load the model
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ll3_fitting/epoch_0",
    max_seq_length = 4096,
    dtype = None,
    load_in_4bit = True
)

FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
#load the accuracy test dataset
import pickle
with open('data/llama3acctest.pkl', 'rb') as f:
    acctest_dataset = pickle.load(f)

In [3]:
acctest_dataset[:2]

[[{'role': 'system',
   'content': "You are a personal judge of video game, your role is to judge which game is preferred by the user based on user's feedbacks of two games. Simply reply with prefered game's name, no need for explanation"},
  {'role': 'user',
   'content': "I played two games factorio, and war for the overworld. After playing, I gave reviews for both games as follow:\nfactorio: EDIT - Several years later. Its still amazing, still worth a purchase even at full price.Purchased from their own site way back when. A game not to be missed.\nwar for the overworld: Shouldn't be out of BETA yet, still a buggy mess.Looks like the original though so got that going for it.  Worth a purchase when they sort everything out but shouldn't of been released yet.  If only I could take back my Kickstarter."},
  {'role': 'assistant', 'content': 'factorio'}],
 [{'role': 'system',
   'content': "You are a personal judge of video game, your role is to judge which game is preferred by the user 

In [4]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = 'llama-3'
)

NameError: name 'tokenizer' is not defined

In [None]:
import time

In [10]:
#inference test batch size = 1
start = time.time()
for i in range(128):
    inputs = tokenizer.apply_chat_template(
        acctest_dataset[i],
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt"
    ).to('cuda')
    
    outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True)

print(time.time() - start)

63.68760132789612


In [5]:
tokenizer.pad_token = '<|reserved_special_token_250|>'
tokenizer.pad_token_id = 128255

In [13]:
#batch size = 2
start = time.time()
for i in range(64):
    inputs = tokenizer.apply_chat_template(
        acctest_dataset[2*i:2*i+2],
        tokenize = True,
        padding = True,
        truncation = False,
        add_generation_prompt = True,
        return_tensors = 'pt').to('cuda')
    attention_mask = (inputs != tokenizer.pad_token_id).int()
    outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 256, use_cache = True)

print(time.time() - start)

71.90847945213318


In [15]:
#batch size = 4
start = time.time()
for i in range(32):
    inputs = tokenizer.apply_chat_template(
        acctest_dataset[4*i:4*i+4],
        tokenize = True,
        padding = True,
        truncation = False,
        add_generation_prompt = True,
        return_tensors = 'pt').to('cuda')
    attention_mask = (inputs != tokenizer.pad_token_id).int()
    outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 512, use_cache = True)

print(time.time() - start)

50.566630125045776


In [16]:
#batch size = 8
start = time.time()
for i in range(16):
    inputs = tokenizer.apply_chat_template(
        acctest_dataset[8*i:8*i+8],
        tokenize = True,
        padding = True,
        truncation = False,
        add_generation_prompt = True,
        return_tensors = 'pt').to('cuda')
    attention_mask = (inputs != tokenizer.pad_token_id).int()
    outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 1024, use_cache = True)

print(time.time() - start)

45.72507953643799


In [17]:
#batch size = 16
start = time.time()
for i in range(8):
    inputs = tokenizer.apply_chat_template(
        acctest_dataset[16*i:16*i+16],
        tokenize = True,
        padding = True,
        truncation = False,
        add_generation_prompt = True,
        return_tensors = 'pt').to('cuda')
    attention_mask = (inputs != tokenizer.pad_token_id).int()
    outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 2048, use_cache = True)

print(time.time() - start)

29.87889790534973


In [18]:
#batch size = 32
start = time.time()
for i in range(4):
    inputs = tokenizer.apply_chat_template(
        acctest_dataset[32*i:32*i+32],
        tokenize = True,
        padding = True,
        truncation = False,
        add_generation_prompt = True,
        return_tensors = 'pt').to('cuda')
    attention_mask = (inputs != tokenizer.pad_token_id).int()
    outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 2048, use_cache = True)

print(time.time() - start)

22.34066128730774


In [19]:
#batch size = 64
start = time.time()
for i in range(2):
    inputs = tokenizer.apply_chat_template(
        acctest_dataset[64*i:64*i+64],
        tokenize = True,
        padding = True,
        truncation = False,
        add_generation_prompt = True,
        return_tensors = 'pt').to('cuda')
    attention_mask = (inputs != tokenizer.pad_token_id).int()
    outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 4096, use_cache = True)

print(time.time() - start)

14.629867553710938


In [21]:
# next test prompts without proper engineering
dataset = acctest_dataset[:128].copy()
for i in range(128):
    dataset[i][0]['content'] = "You are a personal judge of video game, your role is to judge which game is preferred by the user based on user's feedbacks of two games"

In [22]:
#batch size = 64
start = time.time()
for i in range(2):
    inputs = tokenizer.apply_chat_template(
        dataset[64*i:64*i+64],
        tokenize = True,
        padding = True,
        truncation = False,
        add_generation_prompt = True,
        return_tensors = 'pt').to('cuda')
    attention_mask = (inputs != tokenizer.pad_token_id).int()
    outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 10240, use_cache = True)

print(time.time() - start)

70.604172706604


In [31]:
for i in range(128):
    dataset[i][0]['content'] = "You are a personal judge of video game, your role is to judge which game is preferred by the user based on user's feedbacks of two games. Simply reply with prefered game's name, no need for explanation"

In [16]:
# next part: calculation the prediction accuracy of epoch 0-499
import torch
torch.cuda.empty_cache()

In [5]:
import re
def normalize_title(title: str):
    title = title.replace('.', '')
    title = title.replace(',', '')
    title = title.replace('/', '')
    title = title.replace(' ', '')
    title = title.replace("'", '')
    title = title.replace("-", '')
    # Remove non-ascii characters
    title = title.encode("ascii", errors="ignore").decode()
    # Remove extra spaces
    title = re.sub(' +', ' ', title)
    # Convert to lowercase
    title = title.lower()
    # Strip leading and trailing spaces
    title = title.strip()
    return title

In [6]:
dataset_noans = []
for i in range(20000):
    temp = acctest_dataset[i]
    temp = temp[:2]
    dataset_noans.append(temp)

In [7]:
dataset_noans[0]

[{'role': 'system',
  'content': "You are a personal judge of video game, your role is to judge which game is preferred by the user based on user's feedbacks of two games. Simply reply with prefered game's name, no need for explanation"},
 {'role': 'user',
  'content': "I played two games factorio, and war for the overworld. After playing, I gave reviews for both games as follow:\nfactorio: EDIT - Several years later. Its still amazing, still worth a purchase even at full price.Purchased from their own site way back when. A game not to be missed.\nwar for the overworld: Shouldn't be out of BETA yet, still a buggy mess.Looks like the original though so got that going for it.  Worth a purchase when they sort everything out but shouldn't of been released yet.  If only I could take back my Kickstarter."}]

In [8]:
#sample output
inputs = tokenizer.apply_chat_template(
    dataset_noans[128:132],
    tokenize = True,
    padding = True,
    truncation = False,
    add_generation_prompt = True,
    return_tensors = 'pt').to('cuda')
attention_mask = (inputs != tokenizer.pad_token_id).int()
outputs = model.generate(input_ids = inputs, attention_mask = attention_mask, max_new_tokens = 64, use_cache = True)
gen_idx = len(inputs[0])
result = tokenizer.batch_decode(outputs[:, gen_idx:], skip_special_tokens = True)
result

NameError: name 'tokenizer' is not defined

In [8]:
correct_counts = []
for i in range(9, 500, 10):
    print(f"starting for model epoch_{i}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = f"ll3_fitting/epoch_{i}",
        max_seq_length = 4096,
        dtype = None,
        load_in_4bit = True
    )

    tokenizer.pad_token = '<|reserved_special_token_250|>'
    tokenizer.pad_token_id = 128255

    FastLanguageModel.for_inference(model)

    correct_counter = 0
    for j in range(0, 20000, 6):
        msgs = dataset_noans[j:j+6]
        inputs = tokenizer.apply_chat_template(
            msgs,
            tokenize = True,
            padding = True,
            truncation = False,
            add_generation_prompt = True, # Must add for generation
            return_tensors = "pt",
        ).to("cuda")
        attention_mask = (inputs != tokenizer.pad_token_id).int()
        
        outputs = model.generate(input_ids = inputs, max_new_tokens = 4096, attention_mask = attention_mask, use_cache = True)
        gen_idx = len(inputs[0])
        result = tokenizer.batch_decode(outputs[:, gen_idx:], skip_special_tokens = True)

        for k in range(0, len(result)):
            if normalize_title(result[k]) == normalize_title(acctest_dataset[j+k][2]['content']):
                correct_counter += 1
    print(f'accuracy is {correct_counter/200.0}')
    correct_counts.append(correct_counter)

starting for model epoch_9
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


accuracy is 88.535
starting for model epoch_19
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
accuracy is 88.445
starting for model epoch_29
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [8]:
correct_counts = []
import torch
for i in range(9, 500, 10):
    print(f"starting for model epoch_{i}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = f"ll3_fitting/epoch_{i}",
        max_seq_length = 4096,
        dtype = None,
        load_in_4bit = True
    )

    tokenizer.pad_token = '<|reserved_special_token_250|>'
    tokenizer.pad_token_id = 128255

    FastLanguageModel.for_inference(model)

    correct_counter = 0
    for j in range(0, 20000, 8):
        msgs = dataset_noans[j:j+8]
        inputs = tokenizer.apply_chat_template(
            msgs,
            tokenize = True,
            padding = True,
            truncation = False,
            add_generation_prompt = True, # Must add for generation
            return_tensors = "pt",
        ).to("cuda")
        attention_mask = (inputs != tokenizer.pad_token_id).int()
        
        outputs = model.generate(input_ids = inputs, max_new_tokens = 4096, attention_mask = attention_mask, use_cache = True)
        gen_idx = len(inputs[0])
        result = tokenizer.batch_decode(outputs[:, gen_idx:], skip_special_tokens = True)

        for k in range(0, len(result)):
            if normalize_title(result[k]) == normalize_title(acctest_dataset[j+k][2]['content']):
                correct_counter += 1
    print(f'accuracy is {correct_counter/200.0}')
    correct_counts.append(correct_counter)

    del model
    del tokenizer
    del inputs
    del attention_mask
    del outputs
    torch.cuda.empty_cache()

starting for model epoch_9
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


accuracy is 88.395
starting for model epoch_19
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
accuracy is 88.42
starting for model epoch_29
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
accuracy is 88.285
starting for model epoch_39
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ 

In [9]:
correct_counts

[17679,
 17684,
 17657,
 17684,
 17681,
 17700,
 17676,
 17681,
 17711,
 17733,
 17766,
 17754,
 17757,
 17775,
 17812,
 17803,
 17833,
 17899,
 17928,
 17935,
 17979,
 17999,
 18040,
 18017,
 18037,
 18045,
 18058,
 18049,
 18038,
 18075,
 18068,
 18076,
 18085,
 18061,
 18044,
 18031,
 18051,
 18035,
 18018,
 17996,
 17990,
 18006,
 18014,
 18028,
 18004,
 18019,
 18005,
 17989,
 18046,
 18049]

In [1]:
correct_counts = [17679,
 17684,
 17657,
 17684,
 17681,
 17700,
 17676,
 17681,
 17711,
 17733,
 17766,
 17754,
 17757,
 17775,
 17812,
 17803,
 17833,
 17899,
 17928,
 17935,
 17979,
 17999,
 18040,
 18017,
 18037,
 18045,
 18058,
 18049,
 18038,
 18075,
 18068,
 18076,
 18085,
 18061,
 18044,
 18031,
 18051,
 18035,
 18018,
 17996,
 17990,
 18006,
 18014,
 18028,
 18004,
 18019,
 18005,
 17989,
 18046,
 18049]

In [3]:
# consider epoch 329 is the best
j = 0
for i in range(9, 500, 10):
    print(i, correct_counts[j]/200.0)
    j += 1

9 88.395
19 88.42
29 88.285
39 88.42
49 88.405
59 88.5
69 88.38
79 88.405
89 88.555
99 88.665
109 88.83
119 88.77
129 88.785
139 88.875
149 89.06
159 89.015
169 89.165
179 89.495
189 89.64
199 89.675
209 89.895
219 89.995
229 90.2
239 90.085
249 90.185
259 90.225
269 90.29
279 90.245
289 90.19
299 90.375
309 90.34
319 90.38
329 90.425
339 90.305
349 90.22
359 90.155
369 90.255
379 90.175
389 90.09
399 89.98
409 89.95
419 90.03
429 90.07
439 90.14
449 90.02
459 90.095
469 90.025
479 89.945
489 90.23
499 90.245
