In [21]:
"""
Quick‑start: Phi‑2 for N‑best rescoring or generation
----------------------------------------------------------------
• Model repo :  microsoft/phi-2
• Context    :  2 k tokens (2048)
• Dtype      :  fp16 recommended
----------------------------------------------------------------
"""

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, os, numpy as np

# ── config ────────────────────────────────────────────────────
# Everything a reader might want to tune for the model load lives here.
device = "cuda:2"                 # target GPU for the whole model
model_name = "microsoft/phi-2"    # Hugging Face repo id
dtype = torch.float16             # half precision for scoring
# ──────────────────────────────────────────────────────────────

# Turn off tokenizers' own parallelism so the fork warning is not emitted.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# trust_remote_code=True permits code shipped in the model repo to run
# during loading; only keep it for repos you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the causal LM in `dtype` and place it on `device`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=dtype,
    device_map=device,
)

# Reuse EOS as the padding token and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ── quick test ────────────────────────────────────────────────
# Smoke test: generate from one prompt to confirm the model loaded and runs.
prompt  = "Correct this sentence: He were the best."
inputs  = tokenizer(prompt, return_tensors="pt").to(model.device)

# Greedy decoding for grammar correction. Ask for it with do_sample=False
# instead of temperature=0.0: temperature is only consulted when sampling,
# and 0 is not a valid sampling temperature.
output_ids = model.generate(
    **inputs,
    max_new_tokens = 50,
    do_sample      = False,
    # Explicit pad id avoids the "Setting `pad_token_id` to `eos_token_id`" notice.
    pad_token_id   = tokenizer.eos_token_id,
)
# The full sequence is decoded, so the prompt is echoed before the continuation.
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Correct this sentence: He were the best. He was the best.
Answer: He was the best.



In [36]:
from llm_utils import cer_with_gpt2_decoder
import time
import pickle

In [40]:
# Seeds to process; each one selects a
# `neurips_transformer_time_masked_seed_{seed}` set of saved outputs.
seed_list = [0]
# Rescoring knobs handed to cer_with_gpt2_decoder: acoustic_scale is passed
# positionally, llm_weight is passed as `alpha`.
acoustic_scale, llm_weight = 0.8, 0.7

In [42]:
# Rescore each seed's saved N-best list with the LLM and write the decoded
# transcripts to `<saved_dir><model_name_str>_llm_outs.txt`.

# Directory holding the pickled inputs; also receives the rescored text files.
saved_dir = '/data/willett_data/model_transcriptions/'

# Results keyed by seed. `llm_out` is rebound on every iteration, so without
# this mapping only the last seed's metrics would survive the loop.
llm_outs_by_seed = {}

for seed in seed_list:
    model_name_str = f'neurips_transformer_time_masked_seed_{seed}'

    print("RUNNING FOR: ", model_name_str)

    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file — only load pickles produced by a trusted pipeline.
    nbest_path = f"{saved_dir}{model_name_str}_nbest.pkl"
    with open(nbest_path, mode = 'rb') as f:
        nbest = pickle.load(f)

    model_outputs_path = f"{saved_dir}{model_name_str}_model_outputs.pkl"
    with open(model_outputs_path, mode = 'rb') as f:
        model_outputs = pickle.load(f)

    # Replace each transcription string with an array of its character codes
    # followed by a trailing 0 (presumably the terminated format that
    # cer_with_gpt2_decoder expects — confirm against llm_utils).
    transcriptions = model_outputs['transcriptions']
    for i, text in enumerate(transcriptions):
        transcriptions[i] = np.array([ord(c) for c in text] + [0])

    # Rescore nbest outputs with LLM
    start_t = time.time()
    llm_out = cer_with_gpt2_decoder(
        model,
        tokenizer,
        nbest[:],                 # shallow copy of the loaded list
        acoustic_scale,
        model_outputs,
        outputType="speech_sil",
        returnCI=True,
        lengthPenalty=0,
        alpha=llm_weight,
    )
    # Report the wall-clock cost; start_t was previously recorded but never used.
    print(f"LLM rescoring took {time.time() - start_t:.1f} s")

    llm_outs_by_seed[seed] = llm_out

    # One line per LLM-decoded transcript, with a trailing newline.
    with open(saved_dir + f"{model_name_str}_llm_outs.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(llm_out['decoded_transcripts'])+ "\n")

RUNNING FOR:  neurips_transformer_time_masked_seed_0


  0%|          | 0/880 [00:00<?, ?it/s]

In [44]:
# Show the 'cer' and 'wer' entries of the most recent rescoring result, in that order.
for metric_key in ('cer', 'wer'):
    print(llm_out[metric_key])

(0.10942436898492827, 0.10053771003512674, 0.11865446523197064)
(0.1685761047463175, 0.15502609038445705, 0.18244167776031484)


  0%|          | 0/1200 [00:00<?, ?it/s]

(3.2171875, 3.1532265625, 3.280942708333333) (3.08875, 3.029166666666667, 3.149177083333333)
