In [50]:
import unsloth
import torch
import numpy as np


In [51]:
def compute_wer(r, h):
    """
    Levenshtein (edit) distance between a reference and a hypothesis sequence.

    Despite the name, the result is the raw number of edits (substitutions,
    insertions and deletions) that turn ``r`` into ``h``; it is not divided
    by the reference length. The caller is expected to normalise.

    The distance is accumulated in plain Python integers, so there is no
    upper limit on sequence length or on the resulting distance.
    O(n*m) time, O(m) extra space (only two rows of the table are kept).

    Parameters
    ----------
    r : sequence
        Reference items (e.g. list of words or list of characters).
    h : sequence
        Hypothesis items, compared to ``r`` with ``==``.

    Returns
    -------
    int
        Minimum number of single-item edits.

    Examples
    --------
    >>> compute_wer("who is there".split(), "is there".split())
    1
    >>> compute_wer("who is there".split(), "".split())
    3
    >>> compute_wer("".split(), "who is there".split())
    3
    """
    n_ref, n_hyp = len(r), len(h)

    # Row 0 of the table: turning an empty reference into h[:j] costs j insertions.
    previous = list(range(n_hyp + 1))

    for i in range(1, n_ref + 1):
        # Column 0: turning r[:i] into an empty hypothesis costs i deletions.
        current = [i] + [0] * n_hyp
        for j in range(1, n_hyp + 1):
            if r[i - 1] == h[j - 1]:
                # Matching items carry the diagonal cost over unchanged.
                current[j] = previous[j - 1]
            else:
                substitution = previous[j - 1] + 1
                insertion = current[j - 1] + 1
                deletion = previous[j] + 1
                current[j] = min(substitution, insertion, deletion)
        previous = current

    return previous[n_hyp]

def _cer_and_wer(decodedSentences, trueSentences, outputType='speech',
                 returnCI=False):
    """
    Corpus-level character error rate (CER) and word error rate (WER).

    Edit distances are computed per sentence with ``compute_wer`` and summed;
    each rate is total edits divided by total reference length, so longer
    sentences weigh more than shorter ones.

    Parameters
    ----------
    decodedSentences : sequence of str
        Model outputs. Its length determines how many pairs are scored.
    trueSentences : sequence of str
        Reference sentences, paired with ``decodedSentences`` by index.
        Entries beyond ``len(decodedSentences)`` are not read.
    outputType : {'speech', 'speech_sil', 'handwriting'}
        Controls word tokenisation. 'speech' and 'speech_sil' split on single
        spaces; 'handwriting' first pads every '>' with spaces so that it
        becomes a token of its own.
    returnCI : bool
        When True, also return a bootstrap 95% interval (10000 resamples of
        the sentences with replacement, 2.5th and 97.5th percentiles).
        The resampling uses the global NumPy random state, so the interval
        is only reproducible if the caller seeds ``np.random`` beforehand.

    Returns
    -------
    (cer, wer) when ``returnCI`` is False, otherwise
    ((cer, cer_low, cer_high), (wer, wer_low, wer_high)).

    Raises
    ------
    ValueError
        If ``outputType`` is not one of the supported values.
    """
    # Reject unknown modes up front; otherwise the word lists below would
    # never be assigned and the failure would surface as an unbound name.
    if outputType not in ('handwriting', 'speech', 'speech_sil'):
        raise ValueError(
            "outputType must be 'handwriting', 'speech' or 'speech_sil', "
            "got {!r}".format(outputType)
        )

    allCharErr = []
    allChar = []
    allWordErr = []
    allWord = []
    for x in range(len(decodedSentences)):
        decSent = decodedSentences[x]
        trueSent = trueSentences[x]

        # Character-level distance: each sentence is treated as a list of characters.
        nCharErr = compute_wer(list(trueSent), list(decSent))
        if outputType == 'handwriting':
            trueWords = trueSent.replace(">", " > ").split(" ")
            decWords = decSent.replace(">", " > ").split(" ")
        else:  # 'speech' or 'speech_sil'
            trueWords = trueSent.split(" ")
            decWords = decSent.split(" ")
        nWordErr = compute_wer(trueWords, decWords)

        allCharErr.append(nCharErr)
        allWordErr.append(nWordErr)
        allChar.append(len(trueSent))
        allWord.append(len(trueWords))

    cer = np.sum(allCharErr) / np.sum(allChar)
    wer = np.sum(allWordErr) / np.sum(allWord)

    if not returnCI:
        return cer, wer

    allChar = np.array(allChar)
    allCharErr = np.array(allCharErr)
    allWord = np.array(allWord)
    allWordErr = np.array(allWordErr)

    # Bootstrap over sentences: resample indices with replacement and
    # recompute both rates on each resample.
    nResamples = 10000
    resampledCER = np.zeros([nResamples,])
    resampledWER = np.zeros([nResamples,])
    for n in range(nResamples):
        resampleIdx = np.random.randint(0, allChar.shape[0], [allChar.shape[0]])
        resampledCER[n] = np.sum(allCharErr[resampleIdx]) / np.sum(allChar[resampleIdx])
        resampledWER[n] = np.sum(allWordErr[resampleIdx]) / np.sum(allWord[resampleIdx])
    cerCI = np.percentile(resampledCER, [2.5, 97.5])
    werCI = np.percentile(resampledWER, [2.5, 97.5])

    return (cer, cerCI[0], cerCI[1]), (wer, werCI[0], werCI[1])


In [52]:
from unsloth import FastLanguageModel
import torch
# Loader settings forwarded unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = None # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: none of the visible cells read `fourbit_models`; the
# checkpoint actually loaded is named explicitly in the call below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the 4-bit Llama-3.1-8B-Instruct checkpoint and its tokenizer.
# Downloads are cached under the absolute path given as cache_dir.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit, 
    cache_dir='/opt/dlami/nvme/cache'
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.8.1: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    NVIDIA L40S. Num GPUs = 1. Max memory: 44.521 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [53]:
# Wrap the loaded base model with LoRA adapters. `model` is rebound to the
# wrapped model, so re-running this cell without reloading the base model
# would apply the wrapper to an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Adapters are attached to the attention projections (q/k/v/o) and the
    # MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [61]:
# End-of-sequence string taken from the tokenizer; formatting_func appends it
# to the texts of the splits used for training.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

from datasets import load_dataset

# Selects which set of JSONL files (and which formatting_func) is used below.
use_phonemes = False


# Directory containing the JSONL splits. Absolute, machine-specific path:
# adjust when running elsewhere.
base_path = '/home/ubuntu/transformers_with_dietcorp/src/neural_decoder/jsonl_files/'

if use_phonemes:

    # JSONL files whose records already carry a ready-made "text" field.
    # NOTE(review): this mapping has no 'train_val' entry, while the
    # non-phoneme mapping does; confirm whether downstream cells need it.
    data_files = {
        'val_no_gt': f"{base_path}val_no_gt_with_phonemes.jsonl",
        'val': f"{base_path}val_with_phonemes.jsonl",
        'train': f'{base_path}train_with_phonemes.jsonl',
        'test': f'{base_path}test_with_phonemes.jsonl'
    }

    def formatting_func(examples, split_name):
        """Return the batch's texts, with EOS appended for 'train' and 'val'."""
        if split_name in ('train', 'val'):
            return {'text': [t + EOS_TOKEN for t in examples['text']]}
        return {'text': list(examples['text'])}

else:

    # JSONL files whose records hold "prompt" and "completion" message lists.
    data_files = {
        'val_no_gt': f"{base_path}val_no_gt.jsonl",
        'val': f"{base_path}val.jsonl",
        'train': f'{base_path}train.jsonl',
        'test': f'{base_path}eval.jsonl', 
        'train_val': f'{base_path}train_val.jsonl'
    }

    def formatting_func(examples, split_name):
        """Join each prompt/completion pair into one header-delimited string.

        Only the first message of each prompt and completion is used. EOS is
        appended for the 'train', 'val' and 'train_val' splits only.
        """
        user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
        assistant_header = "\n<|start_header_id|>assistant<|end_header_id|>\n\n"
        append_eos = split_name in ("train", "val", "train_val")

        texts = []
        for prompt_msgs, completion_msgs in zip(examples["prompt"], examples["completion"]):
            text = (
                user_header
                + prompt_msgs[0]["content"]
                + assistant_header
                + completion_msgs[0]["content"]
            )
            texts.append(text + EOS_TOKEN if append_eos else text)

        return {"text": texts}

# Both branches load through the same call; only the file mapping differs.
dataset = load_dataset("json", data_files=data_files)

# Format every split in batches; the split name is handed to formatting_func
# as a keyword argument so it can decide whether to append EOS.
for split in list(dataset.keys()):
    dataset[split] = dataset[split].map(
        formatting_func,
        batched=True,
        fn_kwargs={"split_name": split},
    )

# Shuffle only the 'train' split, with a fixed seed for repeatability.
seed = 3407
dataset["train"] = dataset["train"].shuffle(seed=seed)

Generating val_no_gt split: 880 examples [00:00, 227193.62 examples/s]
Generating val split: 880 examples [00:00, 220990.75 examples/s]
Generating train split: 8800 examples [00:00, 679539.64 examples/s]
Generating test split: 1200 examples [00:00, 306900.29 examples/s]
Generating train_val split: 10560 examples [00:00, 514417.37 examples/s]
Map: 100%|██████████| 880/880 [00:00<00:00, 42498.42 examples/s]
Map: 100%|██████████| 880/880 [00:00<00:00, 50729.65 examples/s]
Map: 100%|██████████| 8800/8800 [00:00<00:00, 51606.76 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 51404.46 examples/s]
Map: 100%|██████████| 10560/10560 [00:00<00:00, 52432.77 examples/s]


In [48]:
from trl import SFTConfig, SFTTrainer
# Supervised fine-tuning trainer over the pre-formatted "text" column of the
# 'train_val' split. No eval dataset is passed.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train_val'],
    dataset_text_field = "text",
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        # NOTE(review): loss masking is requested here and again through
        # train_on_responses_only below; confirm the two do not conflict.
        completion_only_loss=True,
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        #max_steps=50,
        #num_train_epochs=1,
        learning_rate = 2e-4,
        logging_steps = 1, # log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = 'linear',
        seed = 3407,
        output_dir = "outputs",
        report_to = "none" # Use this for WandB etc, 
    ),
)

from unsloth import train_on_responses_only

# Re-wrap the trainer with the user/assistant header strings as delimiters.
# These must match, character for character, the headers that
# formatting_func writes into each text.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 8800/8800 [00:02<00:00, 3055.21 examples/s]
Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 880/880 [00:01<00:00, 835.77 examples/s] 
Map (num_proc=64): 100%|██████████| 8800/8800 [00:00<00:00, 20057.65 examples/s]
Map (num_proc=64): 100%|██████████| 880/880 [00:00<00:00, 2044.85 examples/s]


In [35]:
# Switch: True fine-tunes with the trainer built earlier, False loads a
# previously saved adapter checkpoint instead.
training = True

if training:
    # Run fine-tuning; the returned statistics are kept for later inspection.
    trainer_stats = trainer.train()
else:
    # Replace the in-memory model and tokenizer with the saved ones, using
    # the same loader settings as the base model.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/opt/dlami/nvme/saved_lora/lora_model_V3", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,800 | Num Epochs = 1 | Total steps = 550
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 1 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.3688
2,0.3445
3,0.2358
4,0.2445
5,0.1293
6,0.1143
7,0.2103
8,0.083
9,0.1147
10,0.0915


In [38]:
# Generate a completion for every example of the chosen split and keep the
# last non-empty line of each decoded sequence as the prediction.
batch_size = 1  # not read below: examples are generated one at a time
split = "val_no_gt"

last_lines_val = []
n = len(dataset[split])

# NOTE(review): unlike the test-set cell, this cell does not call
# FastLanguageModel.for_inference(model) before generating; confirm whether
# that is intended when the model has just been trained.
stop_early = False
for batch_idx in range(n):  # iterate over the whole split, not a fixed count
    if batch_idx % 100 == 0:
        print(batch_idx)

    # Fetch one example; its "text" field is the formatted prompt string
    batch = dataset[split][batch_idx]
    val_texts = batch["text"]

    # Tokenize the single prompt and move the tensors to the GPU
    inputs = tokenizer(
        val_texts,
        return_tensors='pt',
        padding=True,
        truncation=True
    ).to('cuda')

    # Generate at most 400 new tokens
    outputs = model.generate(
        **inputs,
        use_cache=True,
        max_new_tokens=400
    )

    # Decode the generated ids back to text, dropping special tokens
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Keep just the last non-empty line from each sequence
    for seq in decoded:
        lines = [l.strip() for l in seq.splitlines() if l.strip()]
        last_line = lines[-1] if lines else ""
        last_lines_val.append(last_line)

        # A last line containing "assistant" aborts the whole run. The
        # offending line has already been appended at this point.
        # Presumably this means the model produced no text after the
        # assistant header; verify.
        if "assistant" in last_line:
            print(f"Stopping early — 'assistant' found in batch {batch_idx}")
            stop_early = True
            break

    if stop_early:
        break


0
100
200
300
400
500
600
700
800


In [37]:
# Display the collected validation predictions (one string per generated example).
last_lines_val

['this socks recounted',
 'wicks purchase several sand lithographs',
 'the rules were made in unabashed collusion',
 'crow costume needed black gloves to be completely silent',
 'the tooth fairy forgot to come one wider tooth fell out',
 'that stinging part was sourdough by caid preparation',
 'before they game was woo ever formally',
 'wildflower do sunah has perked']

In [39]:
# Read the reference sentences, one per line, with surrounding whitespace removed.
with open("/home/ubuntu/data/model_transcriptions/txt_files/ground_truth_sentences.txt", "r", encoding="utf-8") as f:
    val_gt_lines = list(map(str.strip, f))

# Predictions go first, references second; the result is a (CER, WER) pair.
metrics = _cer_and_wer(last_lines_val, val_gt_lines)
print(metrics)


(np.float64(0.0814962774650445), np.float64(0.11783960720130933))


In [11]:
# Save the model and tokenizer side by side in one local directory.
# NOTE(review): the directory is named "70B_model", but the checkpoint loaded
# in this notebook is Meta-Llama-3.1-8B-Instruct; confirm the name is intended.
model.save_pretrained("/opt/dlami/nvme/saved_lora/70B_model")  # Local saving
tokenizer.save_pretrained("/opt/dlami/nvme/saved_lora/70B_model")

('/opt/dlami/nvme/saved_lora/70B_model/tokenizer_config.json',
 '/opt/dlami/nvme/saved_lora/70B_model/special_tokens_map.json',
 '/opt/dlami/nvme/saved_lora/70B_model/chat_template.jinja',
 '/opt/dlami/nvme/saved_lora/70B_model/tokenizer.json')

In [12]:
# Generate a completion for every example of the 'test' split and keep the
# last non-empty line of each decoded sequence as the prediction.
FastLanguageModel.for_inference(model)
last_lines = []
n_test = len(dataset['test'])  # cover the whole split rather than a fixed count
for i in range(n_test):

    if i % 100 == 0:
        print(i)

    val_texts = dataset['test'][i]['text']
    # Tokenize the single prompt and move the tensors to the GPU
    inputs = tokenizer(
        val_texts,
        return_tensors='pt',
        padding=True,
        truncation=True
    ).to('cuda')

    # Generate at most 400 new tokens for this prompt
    outputs = model.generate(
        **inputs,
        use_cache=True,
        max_new_tokens=400
    )

    # Decode the generated ids back to text, dropping special tokens
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    for seq in decoded:
        # Split into lines, strip whitespace, and take the last non-empty one
        lines = [l.strip() for l in seq.splitlines() if l.strip()]
        last_lines.append(lines[-1] if lines else "")
        

0
100
200
300
400
500
600
700
800
900
1000
1100


In [13]:
# Write the test predictions to a text file in the working directory, one per line.
with open("llama_70B.txt", "w", encoding="utf-8") as f:
    for line in last_lines:
        print(line, file=f)

In [39]:
# Display the most recently processed prompt string, left over from the last generation loop.
val_texts

'Your task is to perform automatic speech recognition. Below are multiple candidate transcriptions together with their corresponding phoneme representations. The phonemes are taken from the CMU Pronouncing Dictionary. The special symbol SIL represents the start of the sentence, or the end of the sentence, or the space between two adjacent words. Based on the transcription candidates and their phoneme representations, come up with a transcription and its corresponding phoneme representation that are most accurate, ensuring the transcription is contextually and grammatically correct. Focus on key differences in the candidates that change the meaning or correctness. Avoid selections with repetitive or nonsensical phrases. In cases of ambiguity, select the option that is most coherent and contextually sound, taking clues from the phoneme representations. The candidate phoneme representations may not always be the correct representation of the corresponding candidate transcriptions. Some ph