# VLSP Shared Task 2025

In [1]:
import unsloth
import numpy as np
import pandas as pd

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


Skipping import of cpp extensions due to incompatible torch version 2.9.1+cu126 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info
W1219 20:42:20.346000 11124 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


ü¶• Unsloth Zoo will now patch everything to make training faster!


## Data Preparation

In [2]:
import json
import re
from pathlib import Path
from typing import List, Dict, Tuple
from sklearn.model_selection import train_test_split

In [3]:
class MedicalDataPreprocessor:
    """Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu y t·∫ø cho VLSP Medical MT"""
    
    def __init__(self, data_dir: str = "vlsp-dataset-dedup"):
        self.data_dir = Path(data_dir)
        self.original_chars = set()
        self.filtered_chars = set()
    
    def load_parallel_data(self, src_file: str, tgt_file: str) -> List[Tuple[str, str]]:
        """ƒê·ªçc d·ªØ li·ªáu song ng·ªØ t·ª´ 2 file txt"""
        src_path = self.data_dir / src_file
        tgt_path = self.data_dir / tgt_file
        
        # Th·ª≠ nhi·ªÅu encoding kh√°c nhau
        encodings = ['utf-8']
        
        src_lines = None
        tgt_lines = None
        
        for encoding in encodings:
            try:
                with open(src_path, 'r', encoding=encoding, errors='replace') as f_src:
                    src_lines = [line.strip() for line in f_src]
                break
            except UnicodeDecodeError:
                continue
        
        for encoding in encodings:
            try:
                with open(tgt_path, 'r', encoding=encoding, errors='replace') as f_tgt:
                    tgt_lines = [line.strip() for line in f_tgt]
                break
            except UnicodeDecodeError:
                continue
        
        if src_lines is None or tgt_lines is None:
            raise ValueError(f"Kh√¥ng th·ªÉ ƒë·ªçc file v·ªõi c√°c encoding: {encodings}")
        
        assert len(src_lines) == len(tgt_lines), f"S·ªë d√≤ng src ({len(src_lines)}) v√† tgt ({len(tgt_lines)}) kh√¥ng kh·ªõp"
        
        # L·ªçc c√°c c·∫∑p c√¢u r·ªóng (c·∫£ 2 ph·∫£i kh√¥ng r·ªóng)
        pairs = [(s, t) for s, t in zip(src_lines, tgt_lines) if s and t]
        return pairs
    
    def create_instruction_format(self, pairs: List[Tuple[str, str]], 
                                   direction: str = "en2vi") -> List[Dict[str, str]]:
        """Chuy·ªÉn ƒë·ªïi sang format instruction cho LLM fine-tuning"""
        instructions = {
            "en2vi": "Translate the following medical text from English to Vietnamese.",
            "vi2en": "Translate the following medical text from Vietnamese to English."
        }
        
        instruction = instructions[direction]
        dataset = []
        
        for src, tgt in pairs:
            dataset.append({
                "instruction": instruction,
                "input": src,
                "output": tgt
            })
        
        return dataset

    def create_bidirectional_format(self, pairs: List[Tuple[str, str]]) -> List[Dict[str, str]]:
        """T·∫°o dataset bidirectional: nh√¢n ƒë√¥i d·ªØ li·ªáu cho c·∫£ EN‚ÜíVI v√† VI‚ÜíEN"""
        instructions = {
            "en2vi": "Translate the following medical text from English to Vietnamese.",
            "vi2en": "Translate the following medical text from Vietnamese to English."
        }
        
        dataset = []
        
        for en_text, vi_text in pairs:
            # M·∫´u EN‚ÜíVI
            dataset.append({
                "instruction": instructions["en2vi"],
                "input": en_text,
                "output": vi_text
            })
            
            # M·∫´u VI‚ÜíEN
            dataset.append({
                "instruction": instructions["vi2en"],
                "input": vi_text,
                "output": en_text
            })
        
        return dataset
    
    def prepare_training_data(self, train_src: str, train_tgt: str, 
                             val_split: float = 0.1,
                             max_samples: int = 25000,
                             save_dir: str = "./processed_data") -> Dict[str, int]:
        """Chu·∫©n b·ªã d·ªØ li·ªáu hu·∫•n luy·ªán: load, format v√† chia train/val"""
        print(f"  ƒêang load d·ªØ li·ªáu t·ª´ {train_src} v√† {train_tgt}...")
        pairs = self.load_parallel_data(train_src, train_tgt)
        
        # Gi·ªõi h·∫°n s·ªë l∆∞·ª£ng m·∫´u
        if max_samples > 0 and len(pairs) > max_samples:
            pairs = pairs[:max_samples]
            print(f"  ƒê√£ gi·ªõi h·∫°n xu·ªëng {max_samples} c·∫∑p c√¢u")
        
        print(f"  ƒê√£ load {len(pairs)} c·∫∑p c√¢u")
        
        # T·∫°o dataset
        print(f"  Chuy·ªÉn ƒë·ªïi sang format BIDIRECTIONAL (EN‚ÜîVI)...")
        dataset = self.create_bidirectional_format(pairs)
        print(f"  ƒê√£ nh√¢n ƒë√¥i d·ªØ li·ªáu: {len(pairs)} c·∫∑p ‚Üí {len(dataset)} m·∫´u")
        direction_suffix = "bidirectional"

        
        print(f"  Chia train/val v·ªõi t·ª∑ l·ªá {1-val_split:.1%}/{val_split:.1%}...")
        train_data, val_data = train_test_split(dataset, test_size=val_split, random_state=42)
        
        save_path = Path(save_dir)
        save_path.mkdir(parents=True, exist_ok=True)
        
        train_file = save_path / f"train_{direction_suffix}.json"
        val_file = save_path / f"val_{direction_suffix}.json"
        
        with open(train_file, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)
        
        with open(val_file, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)
        
        print(f"  ƒê√£ l∆∞u:")
        print(f"   - Train: {train_file} ({len(train_data)} m·∫´u)")
        print(f"   - Val: {val_file} ({len(val_data)} m·∫´u)")
        
        return {
            "train_size": len(train_data),
            "val_size": len(val_data),
            "total": len(dataset)
        }
    
    def prepare_test_data(self, test_src: str, test_tgt: str,
                         direction: str = "en2vi",
                         save_dir: str = "./processed_data") -> int:
        """Chu·∫©n b·ªã d·ªØ li·ªáu test"""
        print(f"  ƒêang load test data t·ª´ {test_src} v√† {test_tgt}...")
        pairs = self.load_parallel_data(test_src, test_tgt)
        print(f"‚úì ƒê√£ load {len(pairs)} c·∫∑p c√¢u test")
        
        dataset = self.create_instruction_format(pairs, direction)
        
        save_path = Path(save_dir)
        save_path.mkdir(parents=True, exist_ok=True)
        test_file = save_path / f"test_{direction}.json"
        
        with open(test_file, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, ensure_ascii=False, indent=2)
        
        print(f"üíæ ƒê√£ l∆∞u test: {test_file} ({len(dataset)} m·∫´u)")
        return len(dataset)
    
    def get_statistics(self, dataset: List[Dict[str, str]]) -> Dict:
        """Th·ªëng k√™ dataset"""
        input_lens = [len(d['input'].split()) for d in dataset]
        output_lens = [len(d['output'].split()) for d in dataset]
        
        return {
            "num_samples": len(dataset),
            "avg_input_len": sum(input_lens) / len(input_lens),
            "avg_output_len": sum(output_lens) / len(output_lens),
            "max_input_len": max(input_lens),
            "max_output_len": max(output_lens)
        }

## Fine-tuning v·ªõi Unsloth

In [4]:
from unsloth import FastLanguageModel
import json
import torch
from pathlib import Path
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig

In [5]:
class UnslothMedicalMTTrainer:
    """Fine-tune Qwen 2.5/3 models v·ªõi Unsloth cho VLSP Medical MT"""

    def __init__(self, model_name: str = "unsloth/Qwen3-1.7B-Instruct-bnb-4bit",
                 max_seq_length: int = 512):
        self.model_name = model_name
        self.max_seq_length = max_seq_length
        self.model = None
        self.tokenizer = None

    def load_model(self, lora_r: int = 16, lora_alpha: int = 32,
                   lora_dropout: float = 0.05):
        """Load model v·ªõi Unsloth v√† c·∫•u h√¨nh LoRA"""
        print(f"   ƒêang load model: {self.model_name}")

        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_name,
            max_seq_length=self.max_seq_length,
            dtype=None,  # Auto detect
            load_in_4bit=True,
        )

        print("   C·∫•u h√¨nh LoRA adapters...")
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=lora_r,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            bias="none",
            use_gradient_checkpointing=False,
            random_state=42,
            use_rslora=False,
            loftq_config=None,
        )

        print("   Model ƒë√£ s·∫µn s√†ng!")
        return self.model, self.tokenizer

    def format_prompt_chatml(self, instruction: str, input_text: str,
                            output_text: str = None) -> str:
        """Format prompt theo ChatML template c·ªßa Qwen 2.5 Instruct"""
        prompt = f"""<|im_start|>system
You are a helpful medical translation assistant.<|im_end|>
<|im_start|>user
{instruction}
{input_text}<|im_end|>
<|im_start|>assistant
"""
        if output_text:
            prompt += f"{output_text}<|im_end|>"

        return prompt

    def load_and_format_dataset(self, json_file: str) -> Dataset:
        """Load v√† format dataset t·ª´ file JSON"""
        print(f"   ƒêang load d·ªØ li·ªáu t·ª´ {json_file}...")

        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Format theo ChatML
        formatted_data = []
        for sample in data:
            prompt = self.format_prompt_chatml(
                instruction=sample['instruction'],
                input_text=sample['input'],
                output_text=sample['output']
            )
            formatted_data.append({"text": prompt})

        dataset = Dataset.from_list(formatted_data)
        print(f"   ƒê√£ load {len(dataset)} m·∫´u")

        return dataset

    def train(self, train_file: str, val_file: str,
              output_dir: str = "./vlsp_medical_mt",
              num_train_epochs: int = 3,
              batch_size: int = 4,
              gradient_accumulation_steps: int = 16,
              learning_rate: float = 2e-4,
              warmup_steps: int = 100):
        """Hu·∫•n luy·ªán model"""

        if self.model is None:
            raise ValueError("Ch∆∞a load model! G·ªçi load_model() tr∆∞·ªõc.")

        # Load datasets
        train_dataset = self.load_and_format_dataset(train_file)
        eval_dataset = self.load_and_format_dataset(val_file)

        print(f"\n   Th√¥ng tin hu·∫•n luy·ªán:")
        print(f"   - Train samples: {len(train_dataset)}")
        print(f"   - Val samples: {len(eval_dataset)}")
        print(f"   - Epochs: {num_train_epochs}")
        print(f"   - Batch size: {batch_size}")
        print(f"   - Gradient accumulation: {gradient_accumulation_steps}")
        print(f"   - Effective batch size: {batch_size * gradient_accumulation_steps}")
        print(f"   - Learning rate: {learning_rate}")

        use_bf16 = torch.cuda.get_device_capability()[0] >= 8
        print(f"   - Use_BF16: {use_bf16}")
        
        # Training arguments
        training_args = SFTConfig(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=warmup_steps,
            learning_rate=learning_rate,
            fp16=not use_bf16,
            bf16=use_bf16,
            logging_steps=10,
            eval_strategy="steps",
            eval_steps=1000,
            save_strategy="steps",
            save_steps=1000,
            save_total_limit=2,
            optim="adamw_torch",
            weight_decay=0.01,
            lr_scheduler_type="cosine",
            seed=42,
            report_to="none",
            dataset_text_field="text",
            max_length=self.max_seq_length,
            packing=False,
            group_by_length=True,
            dataset_num_proc=1
        )

        # Trainer
        trainer = SFTTrainer(
            model=self.model,
            processing_class=self.tokenizer,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            args=training_args,
        )

        print("\n   B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán...")
        trainer.train()

        print(f"\n   L∆∞u model v√†o {output_dir}")
        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        print("   Hu·∫•n luy·ªán ho√†n t·∫•t!")
        return trainer

    def inference(self, input_text: str, instruction: str = None,
                  max_new_tokens: int = 256) -> str:
        """D·ªãch m·ªôt c√¢u v·ªõi model ƒë√£ fine-tune"""
        if instruction is None:
            instruction = "Translate the following medical text from English to Vietnamese."
            

        prompt = self.format_prompt_chatml(instruction, input_text, output_text=None)

        FastLanguageModel.for_inference(self.model)

        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.3,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract only assistant response
        if "<|im_start|>assistant" in generated_text:
            response = generated_text.split("<|im_start|>assistant")[-1]
            response = response.split("<|im_end|>")[0].strip()
            return response

        return generated_text

    def batch_inference(self, test_file: str, output_file: str,
                       batch_size: int = 8):
        """D·ªãch to√†n b·ªô test set"""
        print(f"   ƒêang d·ªãch test set: {test_file}")

        with open(test_file, 'r', encoding='utf-8') as f:
            test_data = json.load(f)

        results = []
        FastLanguageModel.for_inference(self.model)

        for i in range(0, len(test_data), batch_size):
            batch = test_data[i:i+batch_size]

            for sample in batch:
                prompt = self.format_prompt_chatml(
                    sample['instruction'],
                    sample['input'],
                    output_text=None
                )

                inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.3,
                    top_p=0.9,
                    repetition_penalty=1.2,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )

                generated = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

                if "<|im_start|>assistant" in generated:
                    prediction = generated.split("<|im_start|>assistant")[-1]
                    prediction = prediction.split("<|im_end|>")[0].strip()
                else:
                    prediction = generated

                results.append({
                    "input": sample['input'],
                    "reference": sample['output'],
                    "prediction": prediction
                })

            if (i // batch_size + 1) % 10 == 0:
                print(f"   Processed {i+len(batch)}/{len(test_data)} samples...")

        # Save results
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"   ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o {output_file}")
        return results

## ƒê√°nh gi√° v√† xu·∫•t k·∫øt qu·∫£

In [6]:
import json
import re
import torch
from pathlib import Path
from typing import List, Dict, Tuple
from collections import Counter
import sacrebleu
from unsloth import FastLanguageModel

In [7]:
class MedicalMTEvaluator:
    """ƒê√°nh gi√° v√† ph√¢n t√≠ch l·ªói cho Medical MT"""

    def __init__(self, model_dir: str, max_seq_length: int = 256):
        self.model_dir = model_dir
        self.max_seq_length = max_seq_length
        self.model = None
        self.tokenizer = None

    def load_finetuned_model(self):
        """Load model ƒë√£ fine-tune"""
        print(f"   ƒêang load model t·ª´ {self.model_dir}")

        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_dir,
            max_seq_length=self.max_seq_length,
            dtype=None,
            load_in_4bit=True,
        )

        FastLanguageModel.for_inference(self.model)
        print("   Model ƒë√£ s·∫µn s√†ng cho inference")

    def controlled_inference(self, prompt: str, max_new_tokens: int = 256,
                           temperature: float = 0.3, top_p: float = 0.85,
                           repetition_penalty: float = 1.1) -> str:
        """Inference c·∫©n th·∫≠n v·ªõi c√°c tham s·ªë tr√°nh hallucination"""

        inputs = self.tokenizer(prompt, return_tensors="pt",
                               truncation=True, max_length=self.max_seq_length).to("cuda")

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,  # Th·∫•p h∆°n = ·ªïn ƒë·ªãnh h∆°n
                top_p=top_p,              # H·∫°n ch·∫ø sampling
                top_k=40,                 # Gi·ªõi h·∫°n top tokens
                repetition_penalty=repetition_penalty,  # Tr√°nh l·∫∑p t·ª´
                do_sample=True if temperature > 0 else False,
                num_beams=1,              # Greedy n·∫øu c·∫ßn deterministic
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract ch·ªâ ph·∫ßn assistant response
        if "<|im_start|>assistant" in generated:
            response = generated.split("<|im_start|>assistant")[-1]
            response = response.split("<|im_end|>")[0].strip()
        else:
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = response.split("assistant")[-1].strip() if "assistant" in response else response

        # Lo·∫°i b·ªè c√°c artifacts kh√¥ng mong mu·ªën
        response = self.clean_output(response)

        return response

    def clean_output(self, text: str) -> str:
        """L√†m s·∫°ch output: lo·∫°i b·ªè special tokens v√† artifacts"""
        # Lo·∫°i b·ªè special tokens c√≤n s√≥t
        text = re.sub(r'<\|.*?\|>', '', text)
        # Lo·∫°i b·ªè meta-comments nh∆∞ "Here is the translation:"
        text = re.sub(r'^(Here is|The translation is|Translated text)[:\s]+', '', text, flags=re.IGNORECASE)
        # Lo·∫°i trailing whitespace
        text = text.strip()
        return text

    def format_prompt(self, instruction: str, input_text: str) -> str:
        """Format prompt cho inference"""
        return f"""<|im_start|>system
You are a helpful medical translation assistant.<|im_end|>
<|im_start|>user
{instruction}
{input_text}<|im_end|>
<|im_start|>assistant
"""

    def translate_test_set(self, test_file: str, output_file: str,
                          direction: str = None,
                          batch_progress: int = 50):
        """D·ªãch to√†n b·ªô test set v·ªõi inference c·∫©n th·∫≠n"""
        
        print(f"  ƒêang d·ªãch test set: {test_file}")
        
        if direction is None:
            if "en2vi" in test_file:
                direction = "en2vi"
            elif "vi2en" in test_file:
                direction = "vi2en"
            else:
                direction = "en2vi"  # Default
                print(f"  Kh√¥ng ph√°t hi·ªán ƒë∆∞·ª£c direction t·ª´ filename, d√πng m·∫∑c ƒë·ªãnh: {direction}")
        
        instructions = {
            "en2vi": "Translate the following medical text from English to Vietnamese.",
            "vi2en": "Translate the following medical text from Vietnamese to English."
        }
        instruction = instructions[direction]
        
        print(f"  Direction: {direction}")
        print(f"  Instruction: {instruction}")
        
        with open(test_file, 'r', encoding='utf-8') as f:
            test_data = json.load(f)
        
        predictions = []
        references = []
        inputs = []
        
        for i, sample in enumerate(test_data):
            prompt = self.format_prompt(instruction, sample['input'])
            
            prediction = self.controlled_inference(
                prompt,
                max_new_tokens=256,
                temperature=0.3,
                top_p=0.85,
                repetition_penalty=1.1
            )
            
            predictions.append(prediction)
            references.append(sample['output'])
            inputs.append(sample['input'])
            
            if (i + 1) % batch_progress == 0:
                print(f"   ƒê√£ d·ªãch: {i+1}/{len(test_data)} samples")
        
        # L∆∞u k·∫øt qu·∫£
        results = []
        for inp, ref, pred in zip(inputs, references, predictions):
            results.append({
                "input": inp,
                "reference": ref,
                "prediction": pred
            })
        
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        
        print(f"  ƒê√£ l∆∞u predictions v√†o {output_file}")
        
        return predictions, references

    def calculate_bleu(self, predictions: List[str], references: List[str]) -> Dict:
        """T√≠nh BLEU score v·ªõi sacrebleu"""

        bleu = sacrebleu.corpus_bleu(predictions, [references])

        results = {
            "BLEU": bleu.score,
            "BLEU_1": bleu.precisions[0],
            "BLEU_2": bleu.precisions[1],
            "BLEU_3": bleu.precisions[2],
            "BLEU_4": bleu.precisions[3],
            "BP": bleu.bp,  # Brevity Penalty
            "ratio": bleu.sys_len / bleu.ref_len,
        }

        return results

    def error_analysis(self, results_file: str, num_examples: int = 20) -> Dict:
        """Ph√¢n t√≠ch l·ªói chi ti·∫øt"""

        print(f"\n   PH√ÇN T√çCH L·ªñI CHI TI·∫æT")
        print("="*70)

        with open(results_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # C√°c lo·∫°i l·ªói c·∫ßn ph√°t hi·ªán
        error_types = {
            "missing_medical_terms": 0,    # Thi·∫øu thu·∫≠t ng·ªØ y t·∫ø
            "wrong_medical_terms": 0,      # D·ªãch sai thu·∫≠t ng·ªØ
            "hallucination": 0,            # Th√™m th√¥ng tin kh√¥ng c√≥
            "incomplete": 0,               # D·ªãch thi·∫øu
            "repetition": 0,               # L·∫∑p t·ª´
            "grammar_issues": 0,           # L·ªói ng·ªØ ph√°p ti·∫øng Vi·ªát
        }

        # Common medical terms ƒë·ªÉ check
        medical_terms = [
            "patient", "symptom", "diagnosis", "treatment", "hypertension",
            "diabetes", "infection", "medication", "surgery", "chronic",
            "acute", "disease", "syndrome", "therapy", "blood pressure"
        ]

        error_examples = []

        for i, item in enumerate(data):
            inp = item['input'].lower()
            ref = item['reference'].lower()
            pred = item['prediction'].lower()

            # Ph√°t hi·ªán hallucination: prediction d√†i h∆°n reference qu√° nhi·ªÅu
            if len(pred.split()) > len(ref.split()) * 1.5:
                error_types["hallucination"] += 1
                if len(error_examples) < num_examples:
                    error_examples.append({
                        "type": "hallucination",
                        "input": item['input'],
                        "reference": item['reference'],
                        "prediction": item['prediction']
                    })

            # Ph√°t hi·ªán incomplete: prediction ng·∫Øn h∆°n qu√° nhi·ªÅu
            if len(pred.split()) < len(ref.split()) * 0.6:
                error_types["incomplete"] += 1
                if len(error_examples) < num_examples:
                    error_examples.append({
                        "type": "incomplete",
                        "input": item['input'],
                        "reference": item['reference'],
                        "prediction": item['prediction']
                    })

            # Ph√°t hi·ªán repetition
            words = pred.split()
            if len(words) != len(set(words)) and len(words) > 0:
                word_counts = Counter(words)
                if any(count > 3 for count in word_counts.values()):
                    error_types["repetition"] += 1

            # Ph√°t hi·ªán missing medical terms
            for term in medical_terms:
                if term in inp and term not in pred:
                    # Check xem c√≥ d·ªãch sang ti·∫øng Vi·ªát kh√¥ng
                    vi_translations = {
                        "patient": "b·ªánh nh√¢n",
                        "hypertension": "tƒÉng huy·∫øt √°p",
                        "diabetes": "ti·ªÉu ƒë∆∞·ªùng",
                        "symptom": "tri·ªáu ch·ª©ng",
                        "treatment": "ƒëi·ªÅu tr·ªã"
                    }
                    if term in vi_translations and vi_translations[term] not in pred:
                        error_types["missing_medical_terms"] += 1
                        break

        # T√≠nh t·ª∑ l·ªá l·ªói
        total = len(data)
        error_rates = {k: (v / total) * 100 for k, v in error_types.items()}

        print(f"\n   Th·ªëng k√™ l·ªói (tr√™n {total} samples):")
        for error_type, rate in error_rates.items():
            count = error_types[error_type]
            print(f"   - {error_type}: {count} ({rate:.2f}%)")

        print(f"\n   V√ç D·ª§ L·ªñI ƒêI·ªÇN H√åNH (Top {min(num_examples, len(error_examples))}):")
        for i, example in enumerate(error_examples[:num_examples]):
            print(f"\n[{i+1}] Lo·∫°i l·ªói: {example['type'].upper()}")
            print(f"Input: {example['input'][:150]}...")
            print(f"Reference: {example['reference'][:150]}...")
            print(f"Prediction: {example['prediction'][:150]}...")
            print("-" * 70)

        return {
            "error_counts": error_types,
            "error_rates": error_rates,
            "error_examples": error_examples
        }

    def evaluate_full_pipeline(self, test_file: str, output_dir: str = "./evaluation", 
                              direction: str = None):
        """Pipeline ƒë·∫ßy ƒë·ªß: translate + evaluate + analyze"""
        
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        # Auto-detect direction n·∫øu kh√¥ng ƒë∆∞·ª£c cung c·∫•p
        if direction is None:
            if "en2vi" in test_file:
                direction = "en2vi"
            elif "vi2en" in test_file:
                direction = "vi2en"
            else:
                direction = "en2vi"
        
        print(f"\nüìç Detected direction: {direction.upper()}")
        
        # 1. Translate
        predictions_file = f"{output_dir}/predictions.json"
        predictions, references = self.translate_test_set(
            test_file, 
            predictions_file,
            direction=direction
        )
        
        # 2. Calculate BLEU
        print(f"\n{'='*70}")
        print("üìà ƒê√ÅNH GI√Å BLEU SCORE")
        print("="*70)
        
        bleu_results = self.calculate_bleu(predictions, references)
        
        print(f"\nüéØ K·∫æT QU·∫¢ BLEU:")
        print(f"   - BLEU Score: {bleu_results['BLEU']:.2f}")
        print(f"   - BLEU-1: {bleu_results['BLEU_1']:.2f}")
        print(f"   - BLEU-2: {bleu_results['BLEU_2']:.2f}")
        print(f"   - BLEU-3: {bleu_results['BLEU_3']:.2f}")
        print(f"   - BLEU-4: {bleu_results['BLEU_4']:.2f}")
        print(f"   - Brevity Penalty: {bleu_results['BP']:.3f}")
        print(f"   - Length Ratio: {bleu_results['ratio']:.3f}")
        
        # L∆∞u BLEU results
        with open(f"{output_dir}/bleu_scores.json", 'w', encoding='utf-8') as f:
            json.dump(bleu_results, f, ensure_ascii=False, indent=2)
        
        # 3. Error Analysis
        error_analysis = self.error_analysis(predictions_file, num_examples=10)
        
        # L∆∞u error analysis
        with open(f"{output_dir}/error_analysis.json", 'w', encoding='utf-8') as f:
            json.dump(error_analysis, f, ensure_ascii=False, indent=2)
        
        print(f"\n‚úÖ ƒê√É L∆ØU K·∫æT QU·∫¢ V√ÄO: {output_dir}/")
        print(f"   - predictions.json")
        print(f"   - bleu_scores.json")
        print(f"   - error_analysis.json")
        
        return {
            "bleu": bleu_results,
            "error_analysis": error_analysis
        }

## Main Pipeline

In [8]:
import json
import time
from pathlib import Path
from datetime import datetime

In [9]:
def generate_report(config, data_stats, training_time, eval_results_en2vi, eval_results_vi2en, output_file):
    """T·∫°o b√°o c√°o t·ªïng h·ª£p d·∫°ng txt cho bidirectional model"""
    
    report = []
    report.append("=" * 80)
    report.append("VLSP 2025 MEDICAL MACHINE TRANSLATION - BIDIRECTIONAL REPORT")
    report.append("=" * 80)
    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")
    
    # Configuration
    report.append("-" * 80)
    report.append("1. CONFIGURATION")
    report.append("-" * 80)
    report.append(f"Model: {config['model_name']}")
    report.append(f"Training Mode: BIDIRECTIONAL (EN<->VI)")
    report.append(f"Max Sequence Length: {config['max_seq_length']}")
    report.append(f"LoRA Rank: {config['lora_r']}")
    report.append(f"LoRA Alpha: {config['lora_alpha']}")
    report.append(f"Learning Rate: {config['learning_rate']}")
    report.append(f"Batch Size: {config['batch_size']}")
    report.append(f"Gradient Accumulation Steps: {config['gradient_accumulation_steps']}")
    report.append(f"Effective Batch Size: {config['batch_size'] * config['gradient_accumulation_steps']}")
    report.append(f"Number of Epochs: {config['num_train_epochs']}")
    report.append("")
    
    # Data Statistics
    report.append("-" * 80)
    report.append("2. DATA STATISTICS")
    report.append("-" * 80)
    report.append(f"Original Parallel Pairs: {data_stats['original_pairs']}")
    report.append(f"Training Samples (after doubling): {data_stats['train_size']}")
    report.append(f"Validation Samples: {data_stats['val_size']}")
    report.append(f"Test Samples EN->VI: {data_stats['test_size_en2vi']}")
    report.append(f"Test Samples VI->EN: {data_stats['test_size_vi2en']}")
    report.append(f"Total Training Samples: {data_stats['total']}")
    report.append("")
    
    # Training Summary
    report.append("-" * 80)
    report.append("3. TRAINING SUMMARY")
    report.append("-" * 80)
    report.append(f"Training Time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")
    report.append("")
    
    # Evaluation Results EN->VI
    report.append("-" * 80)
    report.append("4. EVALUATION RESULTS - ENGLISH TO VIETNAMESE")
    report.append("-" * 80)
    bleu_en2vi = eval_results_en2vi['bleu']
    report.append(f"BLEU Score: {bleu_en2vi['BLEU']:.2f}")
    report.append(f"BLEU-1: {bleu_en2vi['BLEU_1']:.2f}")
    report.append(f"BLEU-2: {bleu_en2vi['BLEU_2']:.2f}")
    report.append(f"BLEU-3: {bleu_en2vi['BLEU_3']:.2f}")
    report.append(f"BLEU-4: {bleu_en2vi['BLEU_4']:.2f}")
    report.append(f"Brevity Penalty: {bleu_en2vi['BP']:.3f}")
    report.append(f"Length Ratio: {bleu_en2vi['ratio']:.3f}")
    report.append("")
    
    # Evaluation Results VI->EN
    report.append("-" * 80)
    report.append("5. EVALUATION RESULTS - VIETNAMESE TO ENGLISH")
    report.append("-" * 80)
    bleu_vi2en = eval_results_vi2en['bleu']
    report.append(f"BLEU Score: {bleu_vi2en['BLEU']:.2f}")
    report.append(f"BLEU-1: {bleu_vi2en['BLEU_1']:.2f}")
    report.append(f"BLEU-2: {bleu_vi2en['BLEU_2']:.2f}")
    report.append(f"BLEU-3: {bleu_vi2en['BLEU_3']:.2f}")
    report.append(f"BLEU-4: {bleu_vi2en['BLEU_4']:.2f}")
    report.append(f"Brevity Penalty: {bleu_vi2en['BP']:.3f}")
    report.append(f"Length Ratio: {bleu_vi2en['ratio']:.3f}")
    report.append("")
    
    # Average BLEU
    report.append("-" * 80)
    report.append("6. AVERAGE PERFORMANCE")
    report.append("-" * 80)
    avg_bleu = (bleu_en2vi['BLEU'] + bleu_vi2en['BLEU']) / 2
    report.append(f"Average BLEU Score: {avg_bleu:.2f}")
    report.append("")
    
    # Error Analysis EN->VI
    report.append("-" * 80)
    report.append("7. ERROR ANALYSIS - ENGLISH TO VIETNAMESE")
    report.append("-" * 80)
    error_en2vi = eval_results_en2vi['error_analysis']
    report.append("Error Distribution:")
    for error_type, count in error_en2vi['error_counts'].items():
        rate = error_en2vi['error_rates'][error_type]
        report.append(f"  - {error_type}: {count} samples ({rate:.2f}%)")
    report.append("")
    
    # Error Analysis VI->EN
    report.append("-" * 80)
    report.append("8. ERROR ANALYSIS - VIETNAMESE TO ENGLISH")
    report.append("-" * 80)
    error_vi2en = eval_results_vi2en['error_analysis']
    report.append("Error Distribution:")
    for error_type, count in error_vi2en['error_counts'].items():
        rate = error_vi2en['error_rates'][error_type]
        report.append(f"  - {error_type}: {count} samples ({rate:.2f}%)")
    report.append("")
    
    # Conclusion
    report.append("-" * 80)
    report.append("9. CONCLUSION")
    report.append("-" * 80)
    report.append("Bidirectional model has been successfully fine-tuned.")
    report.append(f"EN->VI BLEU Score: {bleu_en2vi['BLEU']:.2f}")
    report.append(f"VI->EN BLEU Score: {bleu_vi2en['BLEU']:.2f}")
    report.append(f"Average BLEU Score: {avg_bleu:.2f}")
    report.append("Single model can translate in both directions.")
    report.append("")
    report.append("=" * 80)
    
    # Write to file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))
    
    print(f"Report saved to {output_file}")

In [10]:
# def main():
#     """Pipeline ch√≠nh cho Bidirectional training (EN<->VI)"""
    
#     print("=" * 80)
#     print("VLSP 2025 MEDICAL MACHINE TRANSLATION - BIDIRECTIONAL PIPELINE")
#     print("=" * 80)
    
#     # Configuration
#     config = {
#         'model_name': 'unsloth/Qwen3-1.7B-Instruct-bnb-4bit',
#         'max_seq_length': 512,
#         'lora_r': 16,
#         'lora_alpha': 32,
#         'lora_dropout': 0,
#         'learning_rate': 1e-4,
#         'batch_size': 8,
#         'gradient_accumulation_steps': 2,
#         'num_train_epochs': 3,
#         'warmup_steps': 100,
#         'data_dir': '/kaggle/input/vlsp-data-clean/vlsp-dataset-dedup',
#         'processed_dir': '/kaggle/working/',
#         'output_dir': './vlsp_qwen2.5_3b_medical_bidirectional',
#         'eval_dir': './evaluation_results_bidirectional',
#     }
    
#     # Step 1: Data Preparation - BIDIRECTIONAL
#     print("\nStep 1: DATA PREPARATION - BIDIRECTIONAL MODE")
#     print("-" * 80)
    
#     preprocessor = MedicalDataPreprocessor(data_dir=config['data_dir'])
    
#     # Prepare bidirectional training data
#     data_stats = preprocessor.prepare_training_data(
#         train_src='train.en.txt',
#         train_tgt='train.vi.txt',
#         val_split=0.1,
#         max_samples=20000,
#         save_dir=config['processed_dir']
#     )
    
#     # Store original pairs count
#     original_pairs = data_stats['total'] // 2  # V√¨ ƒë√£ nh√¢n ƒë√¥i
#     data_stats['original_pairs'] = original_pairs
    
#     # Prepare test data for both directions
#     test_size_en2vi = preprocessor.prepare_test_data(
#         test_src='public_test.en.txt',
#         test_tgt='public_test.vi.txt',
#         direction='en2vi',
#         save_dir=config['processed_dir']
#     )
    
#     test_size_vi2en = preprocessor.prepare_test_data(
#         test_src='public_test.vi.txt',
#         test_tgt='public_test.en.txt',
#         direction='vi2en',
#         save_dir=config['processed_dir']
#     )
    
#     data_stats['test_size_en2vi'] = test_size_en2vi
#     data_stats['test_size_vi2en'] = test_size_vi2en
    
#     # Step 2: Model Training
#     print("\nStep 2: MODEL TRAINING - BIDIRECTIONAL")
#     print("-" * 80)
    
#     trainer = UnslothMedicalMTTrainer(
#         model_name=config['model_name'],
#         max_seq_length=config['max_seq_length']
#     )
    
#     trainer.load_model(
#         lora_r=config['lora_r'],
#         lora_alpha=config['lora_alpha'],
#         lora_dropout=config['lora_dropout']
#     )
    
#     train_start = time.time()
    
#     trainer.train(
#         train_file=f"{config['processed_dir']}/train_bidirectional.json",
#         val_file=f"{config['processed_dir']}/val_bidirectional.json",
#         output_dir=config['output_dir'],
#         num_train_epochs=config['num_train_epochs'],
#         batch_size=config['batch_size'],
#         gradient_accumulation_steps=config['gradient_accumulation_steps'],
#         learning_rate=config['learning_rate'],
#         warmup_steps=config['warmup_steps']
#     )
    
#     training_time = time.time() - train_start
    
#     # Step 3: Evaluation - Both Directions
#     print("\nStep 3: EVALUATION - BOTH DIRECTIONS")
#     print("-" * 80)
    
#     evaluator = MedicalMTEvaluator(
#         model_dir=config['output_dir'],
#         max_seq_length=config['max_seq_length']
#     )
    
#     evaluator.load_finetuned_model()
    
#     # Evaluate EN->VI
#     print("\nEvaluating EN->VI...")
#     eval_results_en2vi = evaluator.evaluate_full_pipeline(
#         test_file=f"{config['processed_dir']}/test_en2vi.json",
#         output_dir=f"{config['eval_dir']}/en2vi"
#     )
    
#     # Evaluate VI->EN
#     print("\nEvaluating VI->EN...")
#     eval_results_vi2en = evaluator.evaluate_full_pipeline(
#         test_file=f"{config['processed_dir']}/test_vi2en.json",
#         output_dir=f"{config['eval_dir']}/vi2en"
#     )
    
#     # Step 4: Generate Report
#     print("\nStep 4: GENERATE REPORT")
#     print("-" * 80)
    
#     report_file = f"{config['eval_dir']}/final_report_bidirectional.txt"
#     generate_report(config, data_stats, training_time, 
#                    eval_results_en2vi, eval_results_vi2en, report_file)
    
#     print("\n" + "=" * 80)
#     print("BIDIRECTIONAL PIPELINE COMPLETED SUCCESSFULLY")
#     print("=" * 80)
#     print(f"Model saved to: {config['output_dir']}")
#     print(f"Evaluation results saved to: {config['eval_dir']}")
#     print(f"Final report saved to: {report_file}")
#     print(f"EN->VI BLEU Score: {eval_results_en2vi['bleu']['BLEU']:.2f}")
#     print(f"VI->EN BLEU Score: {eval_results_vi2en['bleu']['BLEU']:.2f}")
#     avg_bleu = (eval_results_en2vi['bleu']['BLEU'] + eval_results_vi2en['bleu']['BLEU']) / 2
#     print(f"Average BLEU Score: {avg_bleu:.2f}")
#     print("=" * 80)

## RUN PIPELINE

In [11]:
# main()

In [None]:
print("=" * 80)
print("VLSP 2025 MEDICAL MACHINE TRANSLATION - BIDIRECTIONAL PIPELINE")
print("=" * 80)

# Configuration
config = {
    'model_name': 'unsloth/Qwen3-1.7B-unsloth-bnb-4bit',
    'max_seq_length': 512,
    'lora_r': 16,
    'lora_alpha': 32,
    'lora_dropout': 0.05,
    'learning_rate': 2e-4,
    'batch_size': 16,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 2,
    'warmup_steps': 100,
    'data_dir': 'vlsp-dataset-dedup',
    'processed_dir': '/processed_new',
    'output_dir': './vlsp_qwen3_1.7b_medical_bidirectional',
    'eval_dir': './evaluation_results_bidirectional_new',
}

# Step 1: Data Preparation - BIDIRECTIONAL
print("\nStep 1: DATA PREPARATION - BIDIRECTIONAL MODE")
print("-" * 80)

preprocessor = MedicalDataPreprocessor(data_dir=config['data_dir'])

# Prepare bidirectional training data
data_stats = preprocessor.prepare_training_data(
    train_src='train.en.txt',
    train_tgt='train.vi.txt',
    val_split=0.1,
    max_samples=350000,
    save_dir=config['processed_dir']
)

# Store original pairs count
original_pairs = data_stats['total'] // 2  # V√¨ ƒë√£ nh√¢n ƒë√¥i
data_stats['original_pairs'] = original_pairs

# Prepare test data for both directions
test_size_en2vi = preprocessor.prepare_test_data(
    test_src='public_test.en.txt',
    test_tgt='public_test.vi.txt',
    direction='en2vi',
    save_dir=config['processed_dir']
)

test_size_vi2en = preprocessor.prepare_test_data(
    test_src='public_test.vi.txt',
    test_tgt='public_test.en.txt',
    direction='vi2en',
    save_dir=config['processed_dir']
)

data_stats['test_size_en2vi'] = test_size_en2vi
data_stats['test_size_vi2en'] = test_size_vi2en

# Step 2: Model Training
print("\nStep 2: MODEL TRAINING - BIDIRECTIONAL")
print("-" * 80)

trainer = UnslothMedicalMTTrainer(
    model_name=config['model_name'],
    max_seq_length=config['max_seq_length']
)

trainer.load_model(
    lora_r=config['lora_r'],
    lora_alpha=config['lora_alpha'],
    lora_dropout=config['lora_dropout']
)

train_start = time.time()

trainer.train(
    train_file=f"{config['processed_dir']}/train_bidirectional.json",
    val_file=f"{config['processed_dir']}/val_bidirectional.json",
    output_dir=config['output_dir'],
    num_train_epochs=config['num_train_epochs'],
    batch_size=config['batch_size'],
    gradient_accumulation_steps=config['gradient_accumulation_steps'],
    learning_rate=config['learning_rate'],
    warmup_steps=config['warmup_steps']
)

training_time = time.time() - train_start

VLSP 2025 MEDICAL MACHINE TRANSLATION - BIDIRECTIONAL PIPELINE

Step 1: DATA PREPARATION - BIDIRECTIONAL MODE
--------------------------------------------------------------------------------
  ƒêang load d·ªØ li·ªáu t·ª´ train.en.txt v√† train.vi.txt...
  ƒê√£ load 344471 c·∫∑p c√¢u
  Chuy·ªÉn ƒë·ªïi sang format BIDIRECTIONAL (EN‚ÜîVI)...
  ƒê√£ nh√¢n ƒë√¥i d·ªØ li·ªáu: 344471 c·∫∑p ‚Üí 688942 m·∫´u
  Chia train/val v·ªõi t·ª∑ l·ªá 90.0%/10.0%...
  ƒê√£ l∆∞u:
   - Train: \processed_new\train_bidirectional.json (620047 m·∫´u)
   - Val: \processed_new\val_bidirectional.json (68895 m·∫´u)
  ƒêang load test data t·ª´ public_test.en.txt v√† public_test.vi.txt...
‚úì ƒê√£ load 2997 c·∫∑p c√¢u test
üíæ ƒê√£ l∆∞u test: \processed_new\test_en2vi.json (2997 m·∫´u)
  ƒêang load test data t·ª´ public_test.vi.txt v√† public_test.en.txt...
‚úì ƒê√£ load 2997 c·∫∑p c√¢u test
üíæ ƒê√£ l∆∞u test: \processed_new\test_vi2en.json (2997 m·∫´u)

Step 2: MODEL TRAINING - BIDIRECTIONAL
---------------------

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.9.9: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 12.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.9.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


   C·∫•u h√¨nh LoRA adapters...


Unsloth 2025.9.9 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


   Model ƒë√£ s·∫µn s√†ng!
   ƒêang load d·ªØ li·ªáu t·ª´ /processed_new/train_bidirectional.json...
   ƒê√£ load 620047 m·∫´u
   ƒêang load d·ªØ li·ªáu t·ª´ /processed_new/val_bidirectional.json...
   ƒê√£ load 68895 m·∫´u

   Th√¥ng tin hu·∫•n luy·ªán:
   - Train samples: 620047
   - Val samples: 68895
   - Epochs: 2
   - Batch size: 8
   - Gradient accumulation: 2
   - Effective batch size: 16
   - Learning rate: 0.0002
   - Use_BF16: True


Unsloth: Tokenizing ["text"]:   0%|          | 0/620047 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/68895 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.



   B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 620,047 | Num Epochs = 2 | Total steps = 77,506
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 17,432,576 of 1,738,007,552 (1.00% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
1000,1.037,1.279501
2000,1.0342,1.237849
3000,0.9868,1.217775
4000,1.0239,1.201229
5000,0.9804,1.18644
6000,0.938,1.176879
7000,0.9502,1.168248
8000,0.906,1.161756
9000,0.9737,1.156188


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


KeyboardInterrupt: 

In [None]:
# !zip -r vlsp_qwen2.5_3b_medical_bidirectional.zip vlsp_qwen2.5_3b_medical_bidirectional

In [None]:
print("\nStep 3: EVALUATION - BOTH DIRECTIONS")
print("-" * 80)

evaluator = MedicalMTEvaluator(
    model_dir=config['output_dir'],
    max_seq_length=config['max_seq_length']
)

evaluator.load_finetuned_model()

# Evaluate EN->VI
print("\nEvaluating EN->VI...")
eval_results_en2vi = evaluator.evaluate_full_pipeline(
    test_file=f"{config['processed_dir']}/test_en2vi.json",
    output_dir=f"{config['eval_dir']}/en2vi"
)

# Evaluate VI->EN
print("\nEvaluating VI->EN...")
eval_results_vi2en = evaluator.evaluate_full_pipeline(
    test_file=f"{config['processed_dir']}/test_vi2en.json",
    output_dir=f"{config['eval_dir']}/vi2en"
)

# Step 4: Generate Report
print("\nStep 4: GENERATE REPORT")
print("-" * 80)

report_file = f"{config['eval_dir']}/final_report_bidirectional.txt"
generate_report(config, data_stats, training_time, 
               eval_results_en2vi, eval_results_vi2en, report_file)

print("\n" + "=" * 80)
print("BIDIRECTIONAL PIPELINE COMPLETED SUCCESSFULLY")
print("=" * 80)
print(f"Model saved to: {config['output_dir']}")
print(f"Evaluation results saved to: {config['eval_dir']}")
print(f"Final report saved to: {report_file}")
print(f"EN->VI BLEU Score: {eval_results_en2vi['bleu']['BLEU']:.2f}")
print(f"VI->EN BLEU Score: {eval_results_vi2en['bleu']['BLEU']:.2f}")
avg_bleu = (eval_results_en2vi['bleu']['BLEU'] + eval_results_vi2en['bleu']['BLEU']) / 2
print(f"Average BLEU Score: {avg_bleu:.2f}")
print("=" * 80)


Step 3: EVALUATION - BOTH DIRECTIONS
--------------------------------------------------------------------------------
   ƒêang load model t·ª´ ./vlsp_qwen3_1.7b_medical_bidirectional
==((====))==  Unsloth 2025.9.9: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 12.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.9.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
   Model ƒë√£ s·∫µn s√†ng cho inference

Evaluating EN->VI...

üìç Detected direction: EN2VI
  ƒêang d·ªãch test set: /processed_new/test_en2vi.json
  Direction: en2vi
  Instruction: Translate the following medical text from English to Vietnamese.
   ƒê√£ d·ªãch: 50/2997 samples
   ƒê√£ d·ªãch: 100/2997 samples
   ƒê√£ d·ªãch: 150/2997 samples