In [1]:
!pip install -U unsloth rouge-score bert-score bleu evaluate datasets torch --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.1/381.1 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
import torch
import json
from datasets import Dataset
import evaluate

# ---------------- CONFIG ----------------
TEST_FILE = "test500.jsonl"
MAX_NEW_TOKENS = 256
BASE_MODEL = "unsloth/mistral-7b-bnb-4bit"
DEVICE = "cuda"

def load_cleaned_jsonl_dataset(file_path):
    rows = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return Dataset.from_list(rows)

# ---------------- DATA ------------------
ds = load_cleaned_jsonl_dataset(TEST_FILE)

inputs = [r["input"] for r in ds]
refs = [
    r["output"]
    .replace("<EXEC_SUMMARY>", "")
    .replace("</EXEC_SUMMARY>", "")
    .strip()
    for r in ds
]

print(f"Loaded {len(ds)} samples from {TEST_FILE}")


# ---------------- METRICS ---------------
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")


# ---------------- MODEL -----------------
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=4096,
    dtype=torch.float16,
    load_in_4bit=True,
    device_map={"": 0},   # force everything onto GPU
)

FastLanguageModel.for_inference(model)


Loaded 500 samples from test500.jsonl


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

==((====))==  Unsloth 2026.1.2: Fast Mistral patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layer

In [4]:
import tqdm

results = []

# ---------------- GENERATE --------------
for i, prompt in enumerate(tqdm.tqdm(inputs, desc="Generating predictions")):
    batch = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=4096,
    ).to(DEVICE)

    with torch.no_grad():
        out = model.generate(
            **batch,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

    text = tokenizer.decode(out[0], skip_special_tokens=True)
    if "<EXEC_SUMMARY>" in text:
        text = text.split("<EXEC_SUMMARY>")[-1].strip()

    results.append({
        "id": i,
        "input": prompt,
        "reference": refs[i],
        "prediction": text,
    })

PREDICTIONS_FILE = "predictions_base.jsonl"

with open(PREDICTIONS_FILE, "w", encoding="utf-8") as f:
    for r in results:
        json.dump(r, f, ensure_ascii=False)
        f.write("\n")

print(f"Predictions saved to {PREDICTIONS_FILE}")

# ---------------- METRICS ----------------
preds = [r["prediction"] for r in results]
refs_ = [r["reference"] for r in results]

r = rouge.compute(predictions=preds, references=refs_)
b = bleu.compute(predictions=preds, references=[[x] for x in refs_])
bs = bertscore.compute(predictions=preds, references=refs_, lang="en")

print("\n=== BASE MODEL METRICS ===")
print("ROUGE-1:", r["rouge1"])
print("ROUGE-2:", r["rouge2"])
print("ROUGE-L:", r["rougeL"])
print("BLEU:", b["bleu"])
print("BERTScore F1:", sum(bs["f1"]) / len(bs["f1"]))

Generating predictions: 100%|██████████| 500/500 [1:48:24<00:00, 13.01s/it]


Predictions saved to predictions_base.jsonl


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== BASE MODEL METRICS ===
ROUGE-1: 0.0311257725307226
ROUGE-2: 0.014663011583447357
ROUGE-L: 0.0216481594546679
BLEU: 0.004897339677398411
BERTScore F1: 0.8034348131418229


# fixing a small error in val.jsonl


In [None]:
import json
import os

VAL_FILE = "val.jsonl"

try:
    with open(VAL_FILE, 'r') as f:
        lines = f.readlines()
except FileNotFoundError:
    print(f"Error: {VAL_FILE} not found. Please ensure the file exists.")
    # Optionally create a dummy file or exit
    # with open(VAL_FILE, 'w') as f: f.write('[]')
    exit()

# The error was reported for row 17, which is index 16 in a 0-indexed list
error_line_index = 16

if error_line_index < len(lines):
    problematic_line_raw = lines[error_line_index]
    problematic_line = problematic_line_raw.strip()

    print(f"Original problematic line (row {error_line_index + 1}):")
    print(problematic_line)

    fixed_line = problematic_line
    fixed_attempted = False

    try:
        json.loads(problematic_line)
        print("\nLine already appears to be valid JSON. No fix needed.")
    except json.JSONDecodeError as e:
        error_message = str(e)
        print(f"\nJSON parsing failed: {error_message}")

        # Heuristic fix for "Missing a closing quotation mark in string."
        # This assumes the issue is an unclosed string that should end the line,
        # and checks for an odd number of quotes, indicating an unclosed string.
        if ("Missing a closing quotation mark" in error_message or
            "Unterminated string starting at" in error_message) and \
           problematic_line.count('"') % 2 != 0 and \
           not problematic_line.endswith('"'):
            fixed_line = problematic_line + '"'
            fixed_attempted = True
            print("\nAttempting to fix by appending a closing double quote.")
            print(f"Proposed fixed line: {fixed_line}")
        elif "Trailing data" in error_message:
            print("\nError 'Trailing data' detected. A simple quote fix might not be sufficient.")
            print("Consider manually inspecting the line for multiple JSON objects or structural issues.")
        else:
            print("\nCould not apply simple heuristic fix based on the error message.")

    if fixed_attempted:
        # Ensure newline character is preserved for .jsonl format
        lines[error_line_index] = fixed_line + '\n'
        temp_file = VAL_FILE + ".tmp"
        with open(temp_file, 'w') as f:
            f.writelines(lines)

        # Replace the original file with the corrected one
        os.replace(temp_file, VAL_FILE)
        print(f"\nAttempted to fix '{VAL_FILE}'. Please re-run the cell with `load_dataset` to verify if the fix worked.")
    elif not fixed_attempted and problematic_line.count('"') % 2 == 0 and problematic_line.endswith('"'):
        print("\nEven number of quotes and line ends with quote, so the problem might be elsewhere on the line or a more complex structural issue.")
        print("Manual inspection of line 17 in `val.jsonl` is recommended.")
    elif not fixed_attempted:
        print("\nNo automatic fix was applied. Manual inspection of line 17 in `val.jsonl` is recommended.")

else:
    print(f"Error: Line index {error_line_index + 1} is out of bounds for {VAL_FILE} which has {len(lines)} lines. The file might be shorter than expected.")


Original problematic line (row 17):
{"input": "<FINANCIAL_REPORT>\nOur actual results could significantly differ due to many risks, including those -- the risk factors in our SEC filings.\nAn audio replay will be made available on our website shortly after today's call.\nIt is now my pleasure to introduce Anant Bhalla.\nBefore we speak about second quarter results, I want to provide you with three strategy execution updates.\nFirst, we reached agreement with Brookfield on a reinsurance contract that covers both, a portion of our in force, and new business flow.\nWe have filed the agreement with our regulator for approval.\nWe look forward to receiving regulatory approval and closing on the reinsurance treaty.\nShortly after, we would expect the second anticipated equity investment from Brookfield to be completed.\nSecond, we have completed our share repurchase of 9.1 million shares since starting our buyback in the fourth quarter of last year.\nThis fully offset the impact of shares is