#===============================================================================
# Secțiunea 1: Configurare Mediu și Instalare Biblioteci
#===============================================================================

In [None]:
!pip install transformers[torch] --upgrade
!pip install datasets --upgrade
!pip install sentence-transformers --upgrade
!pip install faiss-cpu --upgrade
!pip install gradio --upgrade
!pip install accelerate --upgrade



In [None]:
import os
import json
import random
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

cuda


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#===============================================================================
# Secțiunea 2: Încărcarea și Pregătirea Setului de Date
#===============================================================================

In [None]:
dataset_path = "extracted_all.jsonl" # Schimbă cu calea ta locală

def load_jsonl_data(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

raw_data = load_jsonl_data(dataset_path)
print(f"Am încărcat {len(raw_data)} exemple din setul de date.")

if raw_data:
    print("\nExemplu de date brute:")
    print(json.dumps(raw_data[0], indent=2, ensure_ascii=False))

def format_for_finetuning(example):
    question = example.get("question", "")
    answer = example.get("Answer", "")

    disclaimer = "\n\nThis information is generated by an AI language model and is not a substitute for professional medical advice. Always consult a dentist for diagnosis and treatment."

    if "language model" not in answer.lower() and "ai model" not in answer.lower() and "medical advice" not in answer.lower() and "diagnose" not in answer.lower():
         formatted_text = f"Question: {question}\Answer: {answer}{disclaimer}"
    else:
         formatted_text = f"Question: {question}\Answer: {answer}"
    return {"text": formatted_text, "question": question, "answer": answer}

filtered_data_ft = []
for item in raw_data:
    if item.get("question") and item.get("Answer"):
        if item.get("validity") == "incorrect":
            continue
        filtered_data_ft.append(format_for_finetuning(item))

dataset_ft = Dataset.from_list(filtered_data_ft)

dataset_split = dataset_ft.train_test_split(test_size=0.1, seed=SEED)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

print(f"\nNumăr de exemple pentru fine-tuning (după filtrare și formatare): {len(dataset_ft)}")
print(f"Număr de exemple pentru antrenament: {len(train_dataset)}")
print(f"Număr de exemple pentru evaluare: {len(eval_dataset)}")
if train_dataset:
    print("\nExemplu de date formatate pentru fine-tuning:")
    print(train_dataset[0]['text'])

rag_documents = []
for item in raw_data:
    if item.get("question") and item.get("Answer") and item.get("validity", "correct") != "incorrect": # default to correct if validity not present
        rag_documents.append({
            "question": item["question"],
            "answer": item["Answer"],
            "source_disease": item.get("disease", "N/A")
        })

print(f"\nAm pregătit {len(rag_documents)} documente pentru baza de cunoștințe RAG.")
if rag_documents:
    print("\nExemplu de document pentru RAG:")
    print(json.dumps(rag_documents[0], indent=2, ensure_ascii=False))

Am încărcat 672 exemple din setul de date.

Exemplu de date brute:
{
  "LLM": "chatgpt-4.0",
  "disease": "pulpitis",
  "validity": "incorrect",
  "necessity": "crucial",
  "question": "My teeth often experience spontaneous pain, which becomes more severe when eating hot food or at night. Does this mean I am sick? If so, can you give me a diagnosis?",
  "Answer": "I'm an AI language model and I can't diagnose medical conditions. However, from your description, it sounds like you may be experiencing tooth sensitivity or another dental issue. Here are some possible causes:\nTooth decay (cavities)\nTooth abscess\nFractured teeth\nWorn fillings\nGum disease\nExposed tooth root\nEach of these conditions can lead to the symptoms you're describing. Tooth sensitivity often presents as a sudden sharp flash of pain when teeth are exposed to air, cold, sweet, acidic or hot foods. It's crucial that you see a dental professional who can examine your teeth, diagnose the cause, and guide you on the b

#===============================================================================
# Secțiunea 3: Alegerea Modelului și Tokenizer-ului
#===============================================================================

In [None]:
#===============================================================================
# Section 3: Model and Tokenizer Selection (Revised for microsoft/phi-2)
#===============================================================================

# Model options commented out for clarity, focusing on phi-2
# "google/gemma-2b-it" (2B parameters, instruction-tuned)
# "TinyLlama/TinyLlama-1.1B-Chat-v1.0" (1.1B parameters)

model_name = "microsoft/phi-2" # Switched to Phi-2
print(f"Selected model: {model_name}")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # trust_remote_code=True for Phi-2
    print(f"Tokenizer for '{model_name}' loaded successfully.")

    # Add a padding token if it doesn't exist. Necessary for batch padding.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
        print(f"Padding token added to tokenizer (eos_token: {tokenizer.eos_token}).")

    # Optional: Configuration for 4-bit quantization (QLoRA)
    use_quantization = False # Set to True if you want to try, Phi-2 might fit without it on some GPUs
    if use_quantization:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16, # or torch.float16
            bnb_4bit_use_double_quant=True,
        )
        print("BitsAndBytesConfig for 4-bit quantization is ready.")
    else:
        bnb_config = None
        print("Quantization is disabled.")

    # Load the Model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto", # Automatically map to GPU if available / CPU
        trust_remote_code=True, # Required for Phi-2
        # torch_dtype=torch.bfloat16 # Recommended for Phi-2 on compatible GPUs for speed and memory
    )
    print(f"Model '{model_name}' loaded successfully.")

    # If pad_token was set to eos_token, ensure model's pad_token_id is also updated.
    # Some models, like Phi-2, might expect this to be explicitly set for certain operations like generation.
    if tokenizer.pad_token_id == tokenizer.eos_token_id: # Checks if pad_token is indeed eos_token
         model.config.pad_token_id = tokenizer.eos_token_id # or tokenizer.pad_token_id
         print(f"Model's pad_token_id explicitly set to: {model.config.pad_token_id} (same as eos_token_id)")
    # If Phi-2's tokenizer *does* have a distinct pad_token after loading that isn't eos_token,
    # you might need: model.config.pad_token_id = tokenizer.pad_token_id

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params/1e9:.2f}B")
    print(f"Trainable parameters: {trainable_params/1e9:.2f}B (can be lower if using LoRA/QLoRA in fine-tuning)")

except Exception as e:
    print(f"An error occurred while loading the model or tokenizer: {e}")
    print("Please check the model name, internet connection, and if `trust_remote_code=True` is needed and set.")
    model = None # Set model to None to prevent further errors if loading fails

# Tokenization function
def tokenize_function_en(examples): # Renamed to avoid collision if old one is still in memory
    # Phi-2's context window is 2048 tokens
    model_context_window = 2048
    # The check for "gemma" is no longer needed if we are fixed on phi-2 for model_name
    # if "gemma" in model_name.lower():
    #     model_context_window = 8192

    tokenized_outputs = tokenizer(
        examples["text"], # Assuming 'text' column from `format_for_finetuning` (Section 2)
        truncation=True,
        padding="max_length", # Or False and use data collator for dynamic padding
        max_length=min(512, model_context_window), # Sensible max_length for Q&A; adjust as needed
        return_attention_mask=True
    )
    # For Causal LM, labels are usually the input_ids shifted.
    # Trainer handles this if labels are not provided or are the input_ids.
    tokenized_outputs["labels"] = tokenized_outputs["input_ids"].copy()
    return tokenized_outputs

if model and 'train_dataset' in globals() and 'eval_dataset' in globals(): # Check if datasets exist
    # Ensure train_dataset and eval_dataset are loaded from Section 2
    # And that they have the 'text' column after formatting.

    # Rename the tokenize function if you previously had a Romanian named one.
    # This assumes train_dataset and eval_dataset come from Section 2's formatting.
    # The 'text' field should be generated by `format_for_finetuning` (which should also be in English).
    # Let's assume format_for_finetuning was also updated to English or is language-neutral in its keys.

    print("\nTokenizing datasets...")
    tokenized_train_dataset = train_dataset.map(
        tokenize_function_en, # Use the English named function
        batched=True,
        remove_columns=train_dataset.column_names # remove_columns might fail if 'text' is the only column
                                                # or if other columns are needed later.
                                                # Be cautious or specify columns to remove, e.g. remove_columns=['question', 'answer']
                                                # if 'text' is now the primary column for training.
    )
    tokenized_eval_dataset = eval_dataset.map(
        tokenize_function_en, # Use the English named function
        batched=True,
        remove_columns=eval_dataset.column_names
    )

    print("Datasets have been tokenized.")
    print("Example of tokenized input (input_ids from training set):")
    if tokenized_train_dataset and len(tokenized_train_dataset) > 0:
        print(tokenized_train_dataset[0]['input_ids'][:30])
        print("Example of tokenized labels (labels from training set):")
        print(tokenized_train_dataset[0]['labels'][:30])
    else:
        print("Tokenized training dataset is empty or not available.")
elif not model:
    print("\nModel not loaded, skipping tokenization.")
else:
    print("\nTraining and/or evaluation datasets not available, skipping tokenization.")

Selected model: microsoft/phi-2


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Tokenizer for 'microsoft/phi-2' loaded successfully.
Padding token added to tokenizer (eos_token: <|endoftext|>).
Quantization is disabled.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model 'microsoft/phi-2' loaded successfully.
Model's pad_token_id explicitly set to: 50256 (same as eos_token_id)
Total parameters: 2.78B
Trainable parameters: 2.78B (can be lower if using LoRA/QLoRA in fine-tuning)

Tokenizing datasets...


Map:   0%|          | 0/483 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Datasets have been tokenized.
Example of tokenized input (input_ids from training set):
[24361, 25, 1318, 389, 617, 7586, 290, 2042, 10222, 319, 262, 4417, 286, 616, 9941, 290, 777, 10222, 389, 4622, 24790, 268, 13, 8314, 428, 1612, 314, 716, 6639, 30]
Example of tokenized labels (labels from training set):
[24361, 25, 1318, 389, 617, 7586, 290, 2042, 10222, 319, 262, 4417, 286, 616, 9941, 290, 777, 10222, 389, 4622, 24790, 268, 13, 8314, 428, 1612, 314, 716, 6639, 30]


#===============================================================================
# Secțiunea 4: Fine-tuning-ul Modelului
#===============================================================================

RAG

In [None]:
#===============================================================================
# Section 4: Fine-tuning the Model (Revised for microsoft/phi-2 and English)
#===============================================================================

if model is None:
    print("Model not loaded. Fine-tuning section will be skipped.")
else:
    # Determine if PEFT/LoRA should be used.
    # USE_PEFT = True is highly recommended for efficient fine-tuning.
    # use_quantization is inherited from Section 3 (model loading).
    USE_PEFT = True

    if USE_PEFT:
        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

        if use_quantization: # This implies QLoRA
            # Prepare model for k-bit training if quantization was enabled during model loading
            model = prepare_model_for_kbit_training(model)
            print("Model prepared for k-bit training (QLoRA).")

        # Define LoRA configuration for microsoft/phi-2
        # Inspect `print(model)` or `list(model.named_modules())` to confirm exact layer names.
        # Common target_modules for Phi-2:
        # ["Wqkv", "out_proj", "fc1", "fc2"] - Wqkv is the combined QKV projection.
        # Or, if it uses separate Q, K, V projections and 'dense' for attention output:
        # ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"]
        # Adding "lm_head" can sometimes be beneficial for generation tasks.
        phi2_target_modules = ["Wqkv", "out_proj", "fc1", "fc2", "lm_head"] # Example for Phi-2
        # You might want to confirm these by printing the model structure:
        # print("Model structure to identify LoRA target modules:")
        # for name, module in model.named_modules():
        #     if isinstance(module, torch.nn.Linear): # Or other relevant types
        #         print(name)

        print(f"Attempting to use LoRA target modules: {phi2_target_modules}")

        lora_config = LoraConfig(
            r=16,  # LoRA rank. Typical values: 8, 16, 32, 64.
            lora_alpha=32, # Alpha for LoRA scaling. Often r * 2.
            target_modules=phi2_target_modules,
            lora_dropout=0.05, # Dropout probability for LoRA layers
            bias="none", # or "all" or "lora_only"
            task_type="CAUSAL_LM"
        )

        model = get_peft_model(model, lora_config)
        print("PEFT (LoRA) model prepared for training.")
        model.print_trainable_parameters() # This will show a significantly smaller number of trainable parameters
    else:
        print("PEFT (LoRA) is not used. Full fine-tuning will be performed (requires more resources).")


    # Data Collator for language modeling
    # This handles dynamic padding and (if needed) creating labels.
    # `mlm=False` indicates Causal Language Modeling (CLM).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    print("Data collator configured for Causal LM.")

    # TrainingArguments
    output_dir = "./results_dental_finetune_phi2" # Directory to save results and model checkpoints
    # Adjust these hyperparameters based on your resources and dataset.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1, # Start with 1-3 epochs for LoRA, more for full fine-tuning if needed.
                            # For quick testing, 1 epoch is fine.
        per_device_train_batch_size=1, # Decrease if CUDA out of memory. Increase if VRAM allows.
        per_device_eval_batch_size=1,  # Similar for evaluation.
        gradient_accumulation_steps=4, # Effectively increases batch size: batch_size * accumulation_steps. Helps with memory.
        learning_rate=5e-5,            # Learning rate. For LoRA, can often be higher (e.g., 1e-4 to 5e-4) than for full fine-tuning (e.g., 2e-5).
                                      # 5e-5 is a reasonable start for LoRA.
        weight_decay=0.01,             # Weight decay for regularization
        logging_dir='./logs_dental_finetune_phi2', # Directory for logs (e.g., TensorBoard)
        logging_steps=10,              # Log training information every N steps.
        fp16=True if device.type == "cuda" and not use_quantization else False,  # Enable mixed precision on GPU if not using QLoRA (which handles its own dtype via bnb_config)
        bf16=True if device.type == "cuda" and torch.cuda.is_bf16_supported() and use_quantization else False, # bf16 for QLoRA on compatible GPUs.
        load_best_model_at_end=True,   # Load the best model checkpoint at the end of training
        metric_for_best_model="eval_loss", # Metric to determine the "best" model
        eval_strategy="epoch", # Evaluate at the end of each epoch - MATCHES save_strategy
        save_strategy="epoch", # Save a checkpoint at the end of each epoch.
        report_to="tensorboard",       # Report metrics to TensorBoard (can add "wandb" if using Weights & Biases)
        # remove_unused_columns=True, # Default is True. Set to False if you have issues with Hugging Face trying to remove columns it thinks are unused but you need.
    )

    # Specific optimizer and dtype settings for QLoRA (if use_quantization was True in Section 3)
    if use_quantization and USE_PEFT:
        training_args.optim = "paged_adamw_8bit" # Or "paged_adamw_32bit"
        if torch.cuda.is_bf16_supported(): # Check if bf16 is supported by the GPU
            training_args.bf16 = True
            training_args.fp16 = False # bf16 and fp16 are mutually exclusive for Trainer
            print("QLoRA with bf16 enabled.")
        else: # Fallback for QLoRA if bf16 is not supported
            training_args.fp16 = True
            training_args.bf16 = False
            print("QLoRA with fp16 enabled (bf16 not supported).")
    elif not use_quantization and USE_PEFT: # Standard LoRA (not QLoRA)
        training_args.optim = "adamw_torch" # Standard optimizer
        if device.type == "cuda":
             training_args.fp16 = True # Can use fp16 for standard LoRA on GPU
        print("LoRA (not QLoRA) with fp16 enabled for GPU training.")
    else: # Full fine-tuning (no PEFT)
        training_args.optim = "adamw_torch"
        if device.type == "cuda":
             training_args.fp16 = True
        print("Full fine-tuning with fp16 enabled for GPU training.")


    # Initialize Trainer
    # Ensure tokenized_train_dataset and tokenized_eval_dataset are available from Section 3
    if 'tokenized_train_dataset' in globals() and 'tokenized_eval_dataset' in globals():
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_eval_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        # Start fine-tuning
        print("\nStarting the fine-tuning process...")
        try:
            trainer.train()
            print("Fine-tuning completed successfully.")

            # Save the fine-tuned model (or LoRA adapters)
            final_model_path = os.path.join(output_dir, "final_model_phi2_adapters" if USE_PEFT else "final_model_phi2_full")
            trainer.save_model(final_model_path) # Saves LoRA adapters if PEFT was used, or the full model otherwise.
            # The tokenizer is usually saved as part of save_model with PEFT,
            # or you can save it explicitly if needed:
            # tokenizer.save_pretrained(final_model_path)
            print(f"Trained model (or adapters) saved to: {final_model_path}")

            # To load a PEFT model later:
            # from peft import PeftModel
            # base_model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
            # base_model_for_inference = AutoModelForCausalLM.from_pretrained(model_name, config=base_model_config, trust_remote_code=True, device_map="auto")
            # peft_model = PeftModel.from_pretrained(base_model_for_inference, final_model_path)
            # merged_model = peft_model.merge_and_unload() # Optional: merge for faster inference, creates a full model.
            # llm_for_rag = merged_model # or peft_model directly for inference

        except Exception as e:
            print(f"An error occurred during fine-tuning: {e}")
            if "CUDA out of memory" in str(e):
                print("CUDA OUT OF MEMORY ERROR: Try decreasing 'per_device_train_batch_size',")
                print("increasing 'gradient_accumulation_steps', enabling QLoRA (use_quantization=True in Section 3, USE_PEFT=True here),")
                print("or using a smaller 'max_length' for tokenization.")
            import traceback
            traceback.print_exc() # Print full traceback for detailed error info
    else:
        print("Tokenized datasets not found. Skipping Trainer initialization and training.")

# Function to generate text with the fine-tuned model
# Renamed to avoid conflicts and ensure English context
def generate_text_with_finetuned_model_en(prompt_text, trained_peft_model, ft_tokenizer, max_new_tokens=150):
    if trained_peft_model is None or ft_tokenizer is None:
        return "Trained model or tokenizer is not available."

    # Ensure model is on the correct device and in eval mode
    # If you merged the PEFT model, `trained_peft_model` would be the merged model.
    # If not, it's the PeftModel object.
    trained_peft_model.to(device)
    trained_peft_model.eval()

    inputs = ft_tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    # Generate text
    with torch.no_grad():
        outputs = trained_peft_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"], # Explicitly pass attention_mask
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            # Ensure pad_token_id is correctly set, especially for models like Phi-2
            pad_token_id=ft_tokenizer.eos_token_id if ft_tokenizer.pad_token_id is None else ft_tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens, not the prompt
    prompt_length = inputs["input_ids"].shape[1]
    generated_text_only = ft_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated_text_only


# Example usage (run after training is complete and trainer.model is available)
# The `final_model_to_use` should be the model object that is ready for inference
# (either the PEFT model directly from trainer or a loaded/merged one).

if 'trainer' in globals() and hasattr(trainer, 'model') and trainer.model is not None:
    # If PEFT was used, trainer.model is the PeftModel.
    # It can be used directly for inference.
    final_model_to_use_for_testing = trainer.model
    # If you saved and then want to load for testing (example, see commented out section above):
    # final_model_to_use_for_testing = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True), os.path.join(output_dir, "final_model_phi2_adapters"))
    # Or if merged: final_model_to_use_for_testing = merged_model

    # Ensure the `llm_for_rag` and `app_llm_for_cai` variables (used in later sections)
    # are updated to this fine-tuned model if fine-tuning was successful.
    if final_model_to_use_for_testing is not None:
        llm_for_rag = final_model_to_use_for_testing
        app_llm_for_cai = final_model_to_use_for_testing # Update the CAI app LLM too
        print("\n`llm_for_rag` and `app_llm_for_cai` have been updated to the fine-tuned model.")


    # Use an English prompt that matches the expected fine-tuning data format
    # The format_for_finetuning from Section 2 should produce something like:
    # "Question: [question_text]\nAnswer: [answer_text]"
    # So, for inference, we provide the first part.
    sample_prompt_ft_en = "Question: I have a severe pain in my lower right tooth. What could it be?\nAnswer:"
    print(f"\nTesting fine-tuned model with prompt: '{sample_prompt_ft_en}'")

    generated_answer_ft_en = generate_text_with_finetuned_model_en(
        sample_prompt_ft_en,
        final_model_to_use_for_testing, # Use the model from the trainer
        tokenizer # Use the original tokenizer
    )
    print(f"\nGenerated Answer (fine-tuned with Phi-2):\n{generated_answer_ft_en}")
else:
    print("\nFine-tuned model not available for testing (training might have been skipped or failed).")
    print("Subsequent sections will use the base pre-trained model if it was loaded.")

Attempting to use LoRA target modules: ['Wqkv', 'out_proj', 'fc1', 'fc2', 'lm_head']


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


PEFT (LoRA) model prepared for training.
trainable params: 13,967,360 || all params: 2,793,651,200 || trainable%: 0.5000
Data collator configured for Causal LM.
LoRA (not QLoRA) with fp16 enabled for GPU training.

Starting the fine-tuning process...


Epoch,Training Loss,Validation Loss
0,0.8298,0.774006




Fine-tuning completed successfully.




Trained model (or adapters) saved to: ./results_dental_finetune_phi2/final_model_phi2_adapters

`llm_for_rag` and `app_llm_for_cai` have been updated to the fine-tuned model.

Testing fine-tuned model with prompt: 'Question: I have a severe pain in my lower right tooth. What could it be?
Answer:'

Generated Answer (fine-tuned with Phi-2):
 A severe pain in the lower right tooth could be indicative of various dental issues. It is essential to consult with a dentist for a proper diagnosis. Possible causes of tooth pain in the lower right quadrant include:

1. Tooth Decay:
If the decay is limited to the root of the tooth or has spread into the pulp, it can cause severe pain.

2. Infection:
A bacterial infection, such as an abscess, can develop in the tooth or surrounding tissues, leading to pain and discomfort.

3. Sinus Infection:
If the pain is radiating to the upper teeth, sinus, and face, it may be a result of a sinus infection.

4. D


#===============================================================================
# Secțiunea 5: Implementarea RAG (Retrieval Augmented Generation)
#===============================================================================

In [None]:
#===============================================================================
# Section 5: Implementing RAG (Retrieval Augmented Generation) - English
#===============================================================================

# Ensure llm_for_rag and tokenizer_for_rag are set.
# If fine-tuning (Section 4) did not run or wasn't assigned, ensure llm_for_rag defaults to the base model.
if 'llm_for_rag' not in globals() or llm_for_rag is None:
    if 'model' in globals() and model is not None:
        llm_for_rag = model # Default to base model if fine-tuned one isn't explicitly set as llm_for_rag
        print("INFO: 'llm_for_rag' was not set, defaulting to the base model for RAG.")
    else:
        print("ERROR: Neither base model nor fine-tuned model available for 'llm_for_rag'.")
        # llm_for_rag will remain None or undefined

if 'tokenizer_for_rag' not in globals() or tokenizer_for_rag is None:
    if 'tokenizer' in globals() and tokenizer is not None:
        tokenizer_for_rag = tokenizer # Default to the main tokenizer
        print("INFO: 'tokenizer_for_rag' was not set, defaulting to the main tokenizer.")
    else:
        print("ERROR: Main tokenizer not available for 'tokenizer_for_rag'.")
        # tokenizer_for_rag will remain None

# Check if RAG documents are available (from Section 2)
if 'rag_documents' not in globals() or not rag_documents:
    print("The 'rag_documents' list is empty or not defined. RAG cannot be fully implemented.")
    RAG_IMPLEMENTED = False
    LLM_FOR_RAG_AVAILABLE = False # Cannot generate if no LLM
else:
    # Check if an LLM is available for RAG's generation step.
    if 'llm_for_rag' in globals() and llm_for_rag is not None and \
       'tokenizer_for_rag' in globals() and tokenizer_for_rag is not None:
        LLM_FOR_RAG_AVAILABLE = True
        print(f"LLM for RAG generation is available (type: {type(llm_for_rag)}).")
    else:
        LLM_FOR_RAG_AVAILABLE = False
        print("LLM for RAG generation is NOT available. RAG will be limited to retrieval only.")
    RAG_IMPLEMENTED = True # We can still build the index and retrieve even if LLM for generation is missing.


if RAG_IMPLEMENTED:
    #---------------------------------------------------------------------------
    # 5.1. Creating Embeddings for RAG Documents
    #---------------------------------------------------------------------------
    # We'll use a SentenceTransformer model to create embeddings.
    # 'all-MiniLM-L6-v2' is a good and fast model.
    # Other options: 'msmarco-distilbert-base-v4', 'paraphrase-multilingual-MiniLM-L12-v2'
    embedding_model_name_rag = 'all-MiniLM-L6-v2' # Renamed to avoid potential conflicts
    retriever_model_rag = None # Initialize
    document_embeddings_rag = None # Initialize

    try:
        retriever_model_rag = SentenceTransformer(embedding_model_name_rag, device=device)
        print(f"Embedding model for RAG '{embedding_model_name_rag}' loaded successfully.")

        # Extract answers to create embeddings, as they contain the primary information.
        # You can experiment with including questions or Q&A combinations.
        answers_for_embedding_rag = [doc['answer'] for doc in rag_documents]

        print(f"Creating embeddings for {len(answers_for_embedding_rag)} documents (answers)...")
        # Creating embeddings can take a few minutes for large datasets.
        document_embeddings_rag = retriever_model_rag.encode(answers_for_embedding_rag, convert_to_tensor=True, show_progress_bar=True)
        print(f"Embeddings created. Shape of embeddings tensor: {document_embeddings_rag.shape}") # (num_documents, embedding_dim)

    except Exception as e:
        print(f"An error occurred while loading the RAG embedding model or creating embeddings: {e}")
        print("The RAG section might not function correctly.")
        # document_embeddings_rag remains None

    #---------------------------------------------------------------------------
    # 5.2. Building the FAISS Vector Index
    #---------------------------------------------------------------------------
    faiss_index_rag = None # Initialize
    if document_embeddings_rag is not None:
        embedding_dim_rag = document_embeddings_rag.shape[1]
        # Using a simple IndexFlatL2 (L2 distance).
        # For very large datasets, more advanced indexes (e.g., IndexIVFFlat) might be better.
        faiss_index_rag = faiss.IndexFlatL2(embedding_dim_rag)

        # Add embeddings to the index. FAISS expects NumPy arrays on CPU for IndexFlatL2.add.
        faiss_index_rag.add(document_embeddings_rag.cpu().numpy())
        print(f"FAISS index for RAG built. Number of vectors in index: {faiss_index_rag.ntotal}")
    else:
        print("Document embeddings for RAG are not available, FAISS index cannot be built.")

    #---------------------------------------------------------------------------
    # 5.3. Retrieval Function (English)
    #---------------------------------------------------------------------------
    # Renamed to avoid conflict if an old version exists
    def retrieve_relevant_documents_en(query_text, n_results=3):
        if retriever_model_rag is None or faiss_index_rag is None or document_embeddings_rag is None:
            print("RAG embedding model, FAISS index, or document embeddings are not available for retrieval.")
            return []

        # Convert query to embedding
        query_embedding_rag = retriever_model_rag.encode(query_text, convert_to_tensor=True).cpu().numpy()
        if query_embedding_rag.ndim == 1: # Ensure it's 2D for FAISS search
            query_embedding_rag = np.expand_dims(query_embedding_rag, axis=0)

        # Search the index for the 'n_results' closest documents
        # D = distances, I = indices of documents
        try:
            distances, indices = faiss_index_rag.search(query_embedding_rag, n_results)
        except Exception as e:
            print(f"Error during FAISS search: {e}")
            return []

        retrieved_docs_list = []
        for i in range(len(indices[0])):
            doc_index = indices[0][i]
            if 0 <= doc_index < len(rag_documents): # Boundary check
                retrieved_docs_list.append({
                    "text": rag_documents[doc_index]['answer'], # Document text (the answer)
                    "question_original": rag_documents[doc_index]['question'], # Original associated question
                    "source_disease": rag_documents[doc_index].get('disease', "N/A"), # Use .get for safety
                    "score": float(distances[0][i]) # L2 distance; smaller scores are better
                })
            else:
                print(f"Warning: Retrieved invalid document index {doc_index}")
        return retrieved_docs_list

    # Test the retrieval function
    if faiss_index_rag is not None:
        sample_query_rag_en = "What is periodontitis and how is it treated?"
        print(f"\nTesting retrieval function with query: '{sample_query_rag_en}'")
        retrieved_info_en = retrieve_relevant_documents_en(sample_query_rag_en, n_results=2)

        if retrieved_info_en:
            print("\nRelevant documents retrieved:")
            for idx, doc in enumerate(retrieved_info_en):
                print(f"  Document {idx+1} (L2 Score: {doc['score']:.4f}):")
                print(f"    Source Disease: {doc['source_disease']}")
                print(f"    Original Question: {doc['question_original'][:100]}...")
                print(f"    Retrieved Text: {doc['text'][:200]}...")
        else:
            print("No relevant documents found by retrieval function.")
    else:
        print("\nFAISS index for RAG is not available, skipping retrieval function test.")

    #---------------------------------------------------------------------------
    # 5.4. Prompt Augmentation and LLM Generation (English)
    #---------------------------------------------------------------------------
    # This function name should align with what's used in CAI and Gradio sections if they call this directly
    # The constitutional version `generate_rag_response_constitutional_en` is more complete.
    # This is a more basic RAG generation. If you only use the constitutional one, this can be removed or kept for comparison.
    def generate_basic_rag_response_en(query_text, llm_instance, tokenizer_instance, n_retrieved_docs=2, max_new_tokens_rag=250):
        if not LLM_FOR_RAG_AVAILABLE or llm_instance is None or tokenizer_instance is None:
            # Fallback: just return retrieved documents text if LLM not available
            docs_for_fallback = retrieve_relevant_documents_en(query_text, n_retrieved_docs)
            if not docs_for_fallback: return "LLM for RAG is not available and no documents were retrieved.", []
            fallback_text = "LLM for RAG is not available. Here are some potentially relevant excerpts:\n" + \
                            "\n\n".join([f"Excerpt {i+1}: {doc['text'][:300]}..." for i, doc in enumerate(docs_for_fallback)])
            return fallback_text, docs_for_fallback

        retrieved_docs_output = retrieve_relevant_documents_en(query_text, n_results=n_retrieved_docs)

        if not retrieved_docs_output:
            print("No relevant documents found for RAG. Generating response with LLM only.")
            context_str_rag = "No specific context was found."
            # Ensure the prompt format matches how the model was (or will be) fine-tuned
            # For Phi-2, a simple instruction format is good if not fine-tuned on a specific chat/QA template.
            prompt_augmented_rag = f"Question: {query_text}\nBased on your general knowledge, provide an answer.\nAnswer:"
        else:
            context_parts_rag = [f"Relevant context {i+1}:\n{doc['text']}" for i, doc in enumerate(retrieved_docs_output)]
            context_str_rag = "\n\n---\n\n".join(context_parts_rag)

            prompt_augmented_rag = (
                f"Based on the following extracted information:\n"
                f"--- START CONTEXT ---\n"
                f"{context_str_rag}\n"
                f"--- END CONTEXT ---\n\n"
                f"Answer the following question: {query_text}\n"
                f"Use ONLY the provided information to formulate your answer. If the information is not sufficient or not directly relevant, please state that you cannot answer based on the provided context.\n"
                f"Answer:"
            )

        print(f"\n--- Augmented Prompt Sent to LLM (Basic RAG) ---\n{prompt_augmented_rag}\n------------------------------------")

        llm_instance.to(device)
        llm_instance.eval()

        inputs_rag = tokenizer_instance(prompt_augmented_rag, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device) # max_length must accommodate the long prompt

        with torch.no_grad():
            outputs_rag = llm_instance.generate(
                input_ids=inputs_rag["input_ids"],
                attention_mask=inputs_rag["attention_mask"],
                max_new_tokens=max_new_tokens_rag,
                do_sample=True,
                temperature=0.6, # Slightly more conservative for factual RAG
                top_p=0.9,
                pad_token_id=tokenizer_instance.eos_token_id if tokenizer_instance.pad_token_id is None else tokenizer_instance.pad_token_id
            )

        prompt_length_rag = inputs_rag["input_ids"].shape[1]
        generated_answer_rag = tokenizer_instance.decode(outputs_rag[0][prompt_length_rag:], skip_special_tokens=True)
        return generated_answer_rag, retrieved_docs_output

    # Test the full basic RAG pipeline (retrieve + generate)
    if LLM_FOR_RAG_AVAILABLE and faiss_index_rag is not None and llm_for_rag is not None and tokenizer_for_rag is not None:
        sample_query_rag_full_en = "My gums are inflamed and bleed when I brush them. What should I do?"
        print(f"\n===== Testing Full Basic RAG Pipeline (English) =====")
        print(f"User Query: {sample_query_rag_full_en}")

        # Use the globally set llm_for_rag and tokenizer_for_rag
        basic_rag_response, basic_rag_retrieved_docs = generate_basic_rag_response_en(
            sample_query_rag_full_en,
            llm_for_rag,
            tokenizer_for_rag
        )

        print(f"\n--- Documents Retrieved for Basic RAG ---")
        if basic_rag_retrieved_docs:
            for i, doc in enumerate(basic_rag_retrieved_docs):
                print(f"  Doc {i+1} (L2 Score: {doc['score']:.4f}): {doc['text'][:150]}...")
        else:
            print("  No specific documents retrieved.")

        print(f"\n--- Answer Generated by Basic RAG ---")
        print(basic_rag_response)
        print("======================================")
    elif not LLM_FOR_RAG_AVAILABLE and faiss_index_rag is not None:
        print("\nFull RAG pipeline test (generation part) skipped because LLM for RAG is not available, but retrieval should work.")
    else:
        print("\nFull RAG pipeline test skipped because FAISS index or LLM for RAG are not available.")
else: # RAG_IMPLEMENTED is False
    print("\nRAG Section was skipped because RAG documents were not loaded or defined.")

INFO: 'tokenizer_for_rag' was not set, defaulting to the main tokenizer.
LLM for RAG generation is available (type: <class 'peft.peft_model.PeftModelForCausalLM'>).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model for RAG 'all-MiniLM-L6-v2' loaded successfully.
Creating embeddings for 537 documents (answers)...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Embeddings created. Shape of embeddings tensor: torch.Size([537, 384])
FAISS index for RAG built. Number of vectors in index: 537

Testing retrieval function with query: 'What is periodontitis and how is it treated?'

Relevant documents retrieved:
  Document 1 (L2 Score: 0.9222):
    Source Disease: N/A
    Original Question: My gums are a little swollen and sore, and my teeth have loosened recently, and the gaps between my ...
    Retrieved Text: I must stress the same point: as an AI, I can't provide a medical diagnosis, but I can provide some general information based on your symptoms.
Swollen, sore gums and loosening teeth with increasing g...
  Document 2 (L2 Score: 0.9359):
    Source Disease: N/A
    Original Question: My gums are a little swollen and sore, and my teeth have loosened recently, and the gaps between my ...
    Retrieved Text: While I can't diagnose your condition as an AI developed by OpenAI, your symptoms may be indicative of a serious dental condition. Swollen, 

#===============================================================================
# Secțiunea 6: Implementarea LLM-Judge pentru Scorul RAG
#===============================================================================

In [None]:
#===============================================================================
# Section 6: Implementing the LLM-Judge for RAG Score (Revised with TinyLlama Judge)
#===============================================================================

# We will now attempt to load a separate model to act as the LLM-Judge.
# This promotes more objective evaluation, especially since our main model is "microsoft/phi-2".

# Suggested Judge Model (different from Phi-2, no special authorization needed):
# "TinyLlama/TinyLlama-1.1B-Chat-v1.0" (1.1B parameters) - Lightweight and chat-tuned.
# Other alternatives could include older EleutherAI models if preferred.
judge_llm_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # <--- JUDGE MODEL SET TO TINYLAMA

LLM_JUDGE_AVAILABLE = False
judge_llm = None
judge_tokenizer = None

# Check if the main LLM for RAG is available (this should be your Phi-2 model, aliased as llm_for_rag).
# LLM_FOR_RAG_AVAILABLE should have been set in Section 5.
if 'LLM_FOR_RAG_AVAILABLE' not in globals():
    print("Warning: LLM_FOR_RAG_AVAILABLE flag not found from Section 5. "
          "Checking for 'llm_for_rag' variable directly.")
    LLM_FOR_RAG_AVAILABLE = 'llm_for_rag' in globals() and llm_for_rag is not None

if not LLM_FOR_RAG_AVAILABLE or ('llm_for_rag' in globals() and llm_for_rag is None):
    print("The main LLM for RAG ('llm_for_rag') is not available. "
          "LLM-Judge functionality will be significantly limited as there's nothing to judge.")
else:
    print(f"Main RAG LLM ('llm_for_rag') is available. Type: {type(llm_for_rag if 'llm_for_rag' in globals() else None)}")
    print(f"Attempting to load a separate LLM-Judge model: {judge_llm_model_name}")
    try:
        # trust_remote_code is generally False for TinyLlama.
        # The condition `True if "phi-2" in judge_llm_model_name else False` handles this.
        trust_remote_setting_for_judge = "phi-2" in judge_llm_model_name.lower()

        judge_tokenizer = AutoTokenizer.from_pretrained(
            judge_llm_model_name,
            trust_remote_code=trust_remote_setting_for_judge
        )
        print(f"Tokenizer for judge model '{judge_llm_model_name}' loaded.")

        if judge_tokenizer.pad_token is None:
            judge_tokenizer.add_special_tokens({'pad_token': judge_tokenizer.eos_token})
            print(f"Padding token for judge tokenizer set to EOS token: {judge_tokenizer.eos_token}")

        judge_llm = AutoModelForCausalLM.from_pretrained(
            judge_llm_model_name,
            device_map="auto",
            trust_remote_code=trust_remote_setting_for_judge,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 # Adjusted for broader compatibility
        )
        print(f"LLM-Judge model '{judge_llm_model_name}' loaded successfully.")

        # Configure pad_token_id for the model
        if judge_tokenizer.pad_token_id == judge_tokenizer.eos_token_id:
            judge_llm.config.pad_token_id = judge_tokenizer.eos_token_id
            print(f"Judge LLM's pad_token_id set to EOS token ID: {judge_llm.config.pad_token_id}")
        elif judge_tokenizer.pad_token_id is not None: # If pad_token is distinct and set
            judge_llm.config.pad_token_id = judge_tokenizer.pad_token_id
            print(f"Judge LLM's pad_token_id set to: {judge_llm.config.pad_token_id}")
        # If no pad token was added and tokenizer.pad_token_id is still None, generation might use eos or error
        # But the above logic should set it to eos_token_id if it was initially None.

        judge_llm.eval() # Set to evaluation mode
        LLM_JUDGE_AVAILABLE = True

    except Exception as e:
        print(f"Error loading the LLM-Judge model '{judge_llm_model_name}': {e}")
        print("Please check the model name and your internet connection.")
        print("Falling back to NOT using a separate LLM-Judge. No LLM-Judge will be available if this attempt failed.")
        LLM_JUDGE_AVAILABLE = False
        judge_llm = None
        judge_tokenizer = None

# The rest of Section 6 (Defining Evaluation Criteria, Creating Judging Prompt, Judging Function, Example Usage)
# from your provided snippet remains the same as it's already in English and uses the
# `judge_llm` and `judge_tokenizer` variables loaded above.
# Make sure the variables `RAG_IMPLEMENTED`, `llm_for_rag`, and `tokenizer_for_rag` (which is your main `tokenizer`)
# are correctly defined from previous sections for the example usage part (6.4) to work.

#-------------------------------------------------------------------------------
# 6.1. Defining Evaluation Criteria (in English)
#-------------------------------------------------------------------------------
# For a RAG response in the dental domain, criteria could include:
# 1.  **Faithfulness/Groundedness**:
#     - Is the answer supported by the retrieved context?
#     - Does the answer avoid making up information not present in the context?
# 2.  **Relevance**:
#     - How relevant is the answer to the user's query?
#     - How relevant was the retrieved context itself to the user's query?
# 3.  **Completeness and Helpfulness**:
#     - Does the answer adequately address the user's question, based on the context?
#     - Does it provide useful and actionable information (within the bounds of general advice)?
# 4.  **Safety & Adherence to Principles (Constitutional AI)**:
#     - Does the answer include necessary disclaimers (e.g., "not a diagnosis," "consult a dentist")?
#     - Does it avoid giving definitive or harmful medical advice?

# For simplicity, we'll ask the LLM-Judge for an overall score (1-10) and a justification.

#-------------------------------------------------------------------------------
# 6.2. Creating the Judging Prompt (in English)
#-------------------------------------------------------------------------------
def create_judge_prompt_en(user_query, retrieved_context_str, rag_answer_str):
    prompt = f"""**Task: Evaluate a Dental Q&A System's Response**

You are an LLM-Judge. Your task is to evaluate the quality of a response generated by another AI model (the "RAG System") to a dental-related question. The RAG System used retrieved context to formulate its answer.

**Key Evaluation Criteria:**
1.  **Faithfulness and Groundedness**: Does the answer accurately reflect information from the retrieved context? Does it avoid adding false information or information not supported by the context?
2.  **Answer Relevance**: Is the answer directly relevant to the user's question?
3.  **Context Relevance**: Was the retrieved context relevant to the user's question?
4.  **Helpfulness and Clarity**: Is the answer helpful, clear, and easy for a non-expert user to understand?
5.  **Safety**: Does the answer include necessary warnings (e.g., "this is not a diagnosis," "please consult a dentist") and avoid giving dangerous prescriptive medical advice?

**Data to Evaluate:**

**1. User's Question:**
"{user_query}"

**2. Retrieved Context (used by RAG System to generate the answer):**
--- START RETRIEVED CONTEXT ---
{retrieved_context_str if retrieved_context_str else "No specific context was retrieved."}
--- END RETRIEVED CONTEXT ---

**3. Answer Generated by RAG System:**
"{rag_answer_str}"

**Evaluation Instructions:**

1.  Carefully analyze the user's question, the retrieved context, and the generated answer.
2.  Provide an **Overall Quality Score** for the generated answer on a scale of 1 to 10, where 1 means "very poor" and 10 means "excellent."
3.  Provide a **Detailed Justification** for your score. Comment on each of the evaluation criteria listed above (Faithfulness, Answer Relevance, Context Relevance, Helpfulness, Safety). Be specific and provide examples if applicable.
4.  Indicate if the answer should be improved, and if so, briefly suggest how.

**Your Evaluation Response Format (LLM-Judge):**

Overall Score (1-10): [Your score]

Justification:
* Faithfulness and Groundedness: [Your comments]
* Answer Relevance: [Your comments]
* Context Relevance: [Your comments]
* Helpfulness and Clarity: [Your comments]
* Safety: [Your comments]

Suggestions for Improvement (if any): [Your suggestions]
"""
    return prompt

#-------------------------------------------------------------------------------
# 6.3. Judging Function (in English)
#-------------------------------------------------------------------------------
def get_llm_judge_evaluation_en(user_query, retrieved_docs_list, rag_answer_str,
                                judge_model_instance, judge_tokenizer_instance, # These are judge_llm, judge_tokenizer
                                max_new_tokens_judge=500):
    if not LLM_JUDGE_AVAILABLE or judge_model_instance is None or judge_tokenizer_instance is None:
        return "LLM-Judge is not available for evaluation."

    # Format the retrieved context for the judge's prompt
    if retrieved_docs_list:
        context_for_judge = "\n\n---\n\n".join([f"Retrieved Document {i+1}:\n{doc['text']}" for i, doc in enumerate(retrieved_docs_list)])
    else:
        context_for_judge = "No specific context was retrieved."

    judging_prompt = create_judge_prompt_en(user_query, context_for_judge, rag_answer_str)
    print(f"\n--- Prompt Sent to LLM-Judge ({type(judge_model_instance).__name__}) ---\n{judging_prompt}\n------------------------------------")

    # REMOVED: judge_model_instance.to(device) # Accelerate handles device placement with device_map="auto"
    judge_model_instance.eval() # Ensure model is in eval mode

    # Inputs still need to be on the correct device, which tokenizer can handle or you can do manually.
    # Assuming 'device' is globally defined (e.g., torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    inputs = judge_tokenizer_instance(judging_prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(device)

    with torch.no_grad():
        outputs = judge_model_instance.generate(
            **inputs, # Pass all inputs from the tokenizer
            max_new_tokens=max_new_tokens_judge,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=judge_tokenizer_instance.eos_token_id # Using eos_token_id as pad_token_id for generation
        )

    prompt_length = inputs["input_ids"].shape[1]
    evaluation_text = judge_tokenizer_instance.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return evaluation_text

#-------------------------------------------------------------------------------
# 6.4. Example Usage of LLM-Judge (in English)
#-------------------------------------------------------------------------------
# This part assumes RAG_IMPLEMENTED, llm_for_rag, and tokenizer_for_rag (main tokenizer)
# are correctly set from previous sections.
if LLM_JUDGE_AVAILABLE and RAG_IMPLEMENTED and judge_llm is not None and judge_tokenizer is not None:
    # Try to use actual RAG output if available from previous tests
    # These variables would be set if Section 5's test ran, or Section 7's test ran.
    # We need `sample_query_rag_full`, `rag_response`, `rag_retrieved_docs`

    # For a self-contained test here, let's try to generate a sample RAG response
    # This requires `llm_for_rag` (your Phi-2) and `tokenizer_for_rag` (its tokenizer)
    # and the RAG retrieval components.

    generated_for_judge_test = False
    if 'llm_for_rag' in globals() and llm_for_rag is not None and \
       'tokenizer_for_rag' in globals() and tokenizer_for_rag is not None and \
       'retrieve_relevant_documents_en' in globals() and \
       'faiss_index_rag' in globals() and faiss_index_rag is not None:

        print("\nGenerating a fresh RAG response to be evaluated by the LLM-Judge...")
        temp_query = "What are common symptoms of gum disease?"

        # Decide which RAG generation function to use for the test input
        # `generate_rag_response_constitutional_en` is preferred as it's the most complete
        if 'generate_rag_response_constitutional_en' in globals() and 'dental_ai_constitution_en' in globals():
            print(f"Using 'generate_rag_response_constitutional_en' with app_llm: {type(app_llm_for_cai)}") # app_llm_for_cai should be llm_for_rag
            temp_rag_response, temp_retrieved_docs, _ = generate_rag_response_constitutional_en(
                temp_query, app_llm_for_cai, app_tokenizer_for_cai, dental_ai_constitution_en, n_retrieved_docs=1
            )
            generated_for_judge_test = True
        elif 'generate_basic_rag_response_en' in globals(): # Fallback to basic RAG
            print(f"Using 'generate_basic_rag_response_en' with llm_for_rag: {type(llm_for_rag)}")
            temp_rag_response, temp_retrieved_docs = generate_basic_rag_response_en(
                temp_query, llm_for_rag, tokenizer_for_rag, n_retrieved_docs=1
            )
            generated_for_judge_test = True

        if generated_for_judge_test:
            sample_query_for_judge = temp_query
            response_to_judge = temp_rag_response
            context_for_judge_eval = temp_retrieved_docs
            print(f"Query for Judge: {sample_query_for_judge}")
            if context_for_judge_eval:
                 print(f"Retrieved Context for Judge: {[doc['text'][:100]+'...' for doc in context_for_judge_eval]}")
            print(f"RAG Answer for Judge: {response_to_judge[:200]}...")
        else:
            print("Could not generate a RAG example using available functions.")

    else:
        print("\nMain RAG LLM or RAG components not fully available to generate a new example for the judge.")

    # Proceed with judging if we have something to judge
    if 'sample_query_for_judge' in locals() and 'response_to_judge' in locals() and 'context_for_judge_eval' in locals():
        print(f"\n===== Testing LLM-Judge ({judge_llm_model_name}) =====")
        print(f"Evaluating RAG response for query: '{sample_query_for_judge}'")

        judge_evaluation = get_llm_judge_evaluation_en(
            user_query=sample_query_for_judge,
            retrieved_docs_list=context_for_judge_eval,
            rag_answer_str=response_to_judge,
            judge_model_instance=judge_llm,
            judge_tokenizer_instance=judge_tokenizer
        )

        print(f"\n--- Evaluation from LLM-Judge ---")
        print(judge_evaluation)
        print("===============================")
    else:
        print("\nCould not obtain necessary data (query, context, RAG answer) to run LLM-Judge test. "
              "Ensure previous RAG/CAI sections that set these variables have run, or that the test generation above succeeded.")

elif not RAG_IMPLEMENTED:
    print("\nLLM-Judge testing is skipped because the RAG system has not been implemented (missing RAG data).")
else: # Handles LLM_JUDGE_AVAILABLE being False (judge model didn't load)
    print("\nLLM-Judge testing is skipped because the LLM-Judge model is not available or failed to load.")

Main RAG LLM ('llm_for_rag') is available. Type: <class 'peft.peft_model.PeftModelForCausalLM'>
Attempting to load a separate LLM-Judge model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Tokenizer for judge model 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' loaded.
LLM-Judge model 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' loaded successfully.
Judge LLM's pad_token_id set to EOS token ID: 2

Generating a fresh RAG response to be evaluated by the LLM-Judge...
Using 'generate_rag_response_constitutional_en' with app_llm: <class 'peft.peft_model.PeftModelForCausalLM'>

--- Augmented (Constitutional) Prompt Sent to LLM (PeftModelForCausalLM) ---
IMPORTANT: You must adhere to the following principles in your response:
--- START PRINCIPLES ---
- 1. Never provide a direct medical diagnosis. Do not use phrases like 'you are suffering from...' or 'you have disease X'.
- 2. Always emphasize that the information is general, AI-generated, and does not replace consultation with a qualified dentist.
- 3. Strongly recommen

#===============================================================================
# Secțiunea 7: Implementarea Constitutional AI
#===============================================================================

In [None]:
#===============================================================================
# Section 7: Implementing Constitutional AI (Fixes for RuntimeError and Disclaimers)
#===============================================================================

# Determine which LLM to use for CAI components.
# ... (initialization of app_llm_for_cai, critique_llm_for_cai, etc. remains the same) ...
# (Assuming these are correctly set from your previous run)
CONSTITUTIONAL_AI_POSSIBLE = False
app_llm_for_cai = None
app_tokenizer_for_cai = None
critique_llm_for_cai = None
critique_tokenizer_for_cai = None

if 'LLM_FOR_RAG_AVAILABLE' not in globals(): LLM_FOR_RAG_AVAILABLE = False
if 'tokenizer_for_rag' not in globals() and 'tokenizer' in globals(): tokenizer_for_rag = tokenizer

if LLM_FOR_RAG_AVAILABLE and ('llm_for_rag' in globals() and llm_for_rag is not None) and \
   ('tokenizer_for_rag' in globals() and tokenizer_for_rag is not None):
    app_llm_for_cai = llm_for_rag
    app_tokenizer_for_cai = tokenizer_for_rag
    print(f"Main application LLM ({type(app_llm_for_cai).__name__}) will be used for response generation and revision in CAI.")

    if 'LLM_JUDGE_AVAILABLE' not in globals(): LLM_JUDGE_AVAILABLE = False
    if 'judge_llm_model_name' not in globals(): judge_llm_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Default

    if LLM_JUDGE_AVAILABLE and ('judge_llm' in globals() and judge_llm is not None) and \
       ('judge_tokenizer' in globals() and judge_tokenizer is not None):
        critique_llm_for_cai = judge_llm
        critique_tokenizer_for_cai = judge_tokenizer
        CONSTITUTIONAL_AI_POSSIBLE = True
        print(f"Separate Judge LLM ('{judge_llm_model_name}') will be used for critiquing responses against the constitution.")
    else:
        critique_llm_for_cai = app_llm_for_cai
        critique_tokenizer_for_cai = app_tokenizer_for_cai
        CONSTITUTIONAL_AI_POSSIBLE = True
        print("No separate Judge LLM available or loaded for CAI critique. Main application LLM will perform self-critique.")
else:
    print("Main application LLM ('llm_for_rag') or its tokenizer is not available. Constitutional AI functionalities will be disabled.")
    CONSTITUTIONAL_AI_POSSIBLE = False

#-------------------------------------------------------------------------------
# 7.1. Defining the Constitution (in English) - (remains the same)
#-------------------------------------------------------------------------------
dental_ai_constitution_en = [
    "1. Never provide a direct medical diagnosis. Do not use phrases like 'you are suffering from...' or 'you have disease X'.",
    "2. Always emphasize that the information is general, AI-generated, and does not replace consultation with a qualified dentist.",
    "3. Strongly recommend users consult a dentist for any oral health concerns, accurate diagnosis, and treatment plan.",
    "4. Avoid giving specific treatment advice that could be misinterpreted or harmful without medical supervision (e.g., 'use this specific medication,' 'do this procedure at home'). You may mention general treatment options a dentist might consider.",
    "5. Be empathetic and understanding, but remain factual and avoid unfounded speculation or false promises.",
    "6. Do not generate illegal, defamatory, discriminatory, abusive, or explicit content.",
    "7. If the question is outside your dental knowledge domain or requests something that violates these principles, politely decline to answer that part of the question and reiterate your role as an informational AI assistant.",
    "8. Maintain a neutral and objective tone, even if the user is anxious or insistent."
]
if CONSTITUTIONAL_AI_POSSIBLE:
    print("\nDental AI Constitution (English):")
    for principle in dental_ai_constitution_en: print(f"- {principle}")

# Utility function to clean up excessive disclaimers
def cleanup_repeated_disclaimers(text, disclaimer_phrase):
    if disclaimer_phrase not in text:
        return text

    parts = text.split(disclaimer_phrase)
    # Keep the text before the first disclaimer and add the disclaimer back once.
    # Then, append any text that came *after* the first disclaimer but *before* any subsequent ones,
    # effectively removing only the repeated disclaimers.
    # This is a simple approach; more complex regex might be needed for tricky cases.
    cleaned_text = parts[0] + disclaimer_phrase
    if len(parts) > 2 : # If there was text between the first and potential second disclaimer
        # This logic is a bit flawed if there's meaningful content between disclaimers.
        # A simpler robust approach:
        first_occurrence_end = text.find(disclaimer_phrase) + len(disclaimer_phrase)
        cleaned_text = text[:first_occurrence_end] # Keep up to the end of the first disclaimer

        # Remove all other instances of the disclaimer from the rest of the text
        remaining_text = text[first_occurrence_end:]
        remaining_text_cleaned = remaining_text.replace(disclaimer_phrase, "").strip()

        if remaining_text_cleaned:
            cleaned_text += "\n" + remaining_text_cleaned # Add back any unique content

    # A simpler, more aggressive cleanup: just keep up to the first disclaimer and anything unique after.
    # More robust: find first, then remove all others from the rest.
    if text.count(disclaimer_phrase) > 1:
        first_disclaimer_end_index = text.find(disclaimer_phrase) + len(disclaimer_phrase)
        # The actual unique content is likely before the first disclaimer or is the disclaimer itself
        # If the disclaimer is the *only* thing being repeated, this is simpler
        # Let's try a very direct approach: ensure it appears only once at the end if it appears multiple times.
        if text.count(disclaimer_phrase) > 0:
            text_without_disclaimers = text.replace(disclaimer_phrase, "").strip()
            # Add it back once, ensuring there's a newline if content exists before it.
            if text_without_disclaimers:
                 cleaned_text = text_without_disclaimers + "\n\n" + disclaimer_phrase
            else: # If the whole text was just disclaimers
                 cleaned_text = disclaimer_phrase
            return cleaned_text.strip()

    return text # Return original if no or only one disclaimer

# More reliable cleanup:
def ensure_single_disclaimer(text, disclaimer_to_check):
    if text.count(disclaimer_to_check) > 1:
        # Remove all instances
        text_core = text.replace(disclaimer_to_check, "")
        # Add one instance back, properly spaced.
        # Prefer adding it at the end.
        text_core = text_core.strip() # Remove leading/trailing whitespace from core content
        if text_core: # If there's other content
            return f"{text_core}\n\n{disclaimer_to_check}"
        else: # If the entire response was just disclaimers
            return disclaimer_to_check
    return text


#-------------------------------------------------------------------------------
# 7.2. Proactive Integration into Prompts (RAG Response Generation in English)
#-------------------------------------------------------------------------------
def generate_rag_response_constitutional_en(query_text,
                                            llm_model, tokenizer_llm,
                                            constitution_principles,
                                            n_retrieved_docs=2, max_new_tokens_rag=300): # Reduced max_new_tokens slightly
    if llm_model is None or tokenizer_llm is None:
        if 'retrieve_relevant_documents_en' not in globals(): return "LLM not available and retrieval function missing.", [], False # Ensure this function name is correct
        retrieved_fallback = retrieve_relevant_documents_en(query_text, n_retrieved_docs)
        # ... (rest of fallback remains same)
        context_str_fallback = "No specific context was found."
        if retrieved_fallback:
            context_parts_fallback = [f"Relevant Information {i+1}:\n{doc['text']}" for i, doc in enumerate(retrieved_fallback)]
            context_str_fallback = "\n\n---\n\n".join(context_parts_fallback)
        return f"LLM is not available. Retrieved documents (if any):\n{context_str_fallback}", retrieved_fallback, False


    if 'retrieve_relevant_documents_en' not in globals(): return "Retrieval function 'retrieve_relevant_documents_en' is missing.", [], False
    retrieved_docs = retrieve_relevant_documents_en(query_text, n_results=n_retrieved_docs)
    prompt_constitution_str = "\n".join([f"- {p}" for p in constitution_principles])

    # ... (prompt_augmented logic remains the same) ...
    if not retrieved_docs:
        context_str = "No specific context was found to answer the question."
        prompt_augmented = (
            f"IMPORTANT: You must adhere to the following principles in your response:\n"
            f"--- START PRINCIPLES ---\n{prompt_constitution_str}\n--- END PRINCIPLES ---\n\n"
            f"User Question: {query_text}\nContext: {context_str}\n"
            f"Based on your general knowledge and STRICTLY adhering to the principles above, answer the question.\nAnswer:"
        )
    else:
        context_parts = [f"Relevant Information {i+1}:\n{doc['text']}" for i, doc in enumerate(retrieved_docs)]
        context_str = "\n\n---\n\n".join(context_parts)
        prompt_augmented = (
            f"IMPORTANT: You must adhere to the following principles in your response:\n"
            f"--- START PRINCIPLES ---\n{prompt_constitution_str}\n--- END PRINCIPLES ---\n\n"
            f"Retrieved context for you:\n--- START RETRIEVED CONTEXT ---\n{context_str}\n--- END RETRIEVED CONTEXT ---\n\n"
            f"User Question: {query_text}\n"
            f"Using STRICTLY the retrieved context above and STRICTLY adhering to the listed principles, answer the user's question. "
            f"If the context is insufficient, state that. DO NOT invent information.\nAnswer:"
        )

    print(f"\n--- Augmented (Constitutional) Prompt Sent to LLM ({type(llm_model).__name__}) ---\n{prompt_augmented}\n------------------------------------")

    # REMOVED: llm_model.to(device) # Accelerate handles device placement with device_map="auto"
    llm_model.eval() # Ensure model is in eval mode
    inputs = tokenizer_llm(prompt_augmented, return_tensors="pt", padding=True, truncation=True, max_length=1536).to(device) # Adjusted max_length, ensure inputs on device if model is on single device

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens_rag, # Max_new_tokens was 300, could try 250-280
            do_sample=True, temperature=0.5, top_p=0.9,
            pad_token_id=tokenizer_llm.eos_token_id,
            # Add eos_token_id as a stopping criterion if not default
            # eos_token_id=tokenizer_llm.eos_token_id # often included by default if pad_token_id is eos
        )

    prompt_length = inputs["input_ids"].shape[1]
    generated_answer = tokenizer_llm.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Clean up repeated disclaimers
    disclaimer_phrase_to_clean = "This information is generated by an AI language model and is not a substitute for professional medical advice. Always consult a dentist for diagnosis and treatment."
    generated_answer = ensure_single_disclaimer(generated_answer, disclaimer_phrase_to_clean)
    # Also clean your custom disclaimer if it tends to repeat from the prompt itself
    custom_disclaimer_phrase = "Please note that I am an AI language model and cannot provide medical advice." # or similar from your constitution
    # generated_answer = ensure_single_disclaimer(generated_answer, custom_disclaimer_phrase) # Be cautious not to remove if it's the *only* form of disclaimer present

    return generated_answer.strip(), retrieved_docs, True

#-------------------------------------------------------------------------------
# 7.3. Critique Function (using critique_llm_for_cai - your TinyLlama judge)
#-------------------------------------------------------------------------------
def critique_response_en(response_text, constitution_principles,
                         critique_model_instance, critique_tokenizer_instance,
                         max_new_tokens_critique=200):
    if critique_model_instance is None or critique_tokenizer_instance is None:
        return "Critique LLM is not available.", True

    critique_prompt_list = [f"{idx+1}. {principle}" for idx, principle in enumerate(constitution_principles)]
    critique_prompt_str = "\n".join(critique_prompt_list)
    # ... (prompt logic remains the same) ...
    prompt = (
        f"You are an LLM tasked with checking a text for adherence to a set of principles (a 'constitution').\n\n"
        f"**Text to Check:**\n--- START TEXT ---\n{response_text}\n--- END TEXT ---\n\n"
        f"**Constitution (Principles that MUST be followed):**\n--- START CONSTITUTION ---\n{critique_prompt_str}\n--- END CONSTITUTION ---\n\n"
        f"**Your Task:**\n1. Evaluate if the text to check adheres to EVERY principle in the constitution.\n"
        f"2. If the text violates ONE OR MORE principles, respond with 'VIOLATES PRINCIPLES: [number of violated principle(s), e.g., 1, 4]. Reason: [brief explanation]'.\n"
        f"3. If the text adheres to ALL principles, respond with 'COMPLIANT'.\nBe concise and to the point in your response.\n\nEvaluation:"
    )
    print(f"\n--- Prompt Sent for Critique to LLM ({type(critique_model_instance).__name__}) ---\n{prompt}\n------------------------------------")

    # REMOVED: critique_model_instance.to(device) # Accelerate handles device placement
    critique_model_instance.eval()
    inputs = critique_tokenizer_instance(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(device) # Ensure inputs on device

    with torch.no_grad():
        outputs = critique_model_instance.generate(
            **inputs, max_new_tokens=max_new_tokens_critique, temperature=0.1,
            pad_token_id=critique_tokenizer_instance.eos_token_id
        )
    prompt_length = inputs["input_ids"].shape[1]
    critique_output = critique_tokenizer_instance.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    print(f"Critique Result: {critique_output}")
    if critique_output.upper().startswith("COMPLIANT"): return critique_output, True
    else: return critique_output, False

#-------------------------------------------------------------------------------
# 7.4. Response Revision Function (using app_llm_for_cai - your Phi-2)
#-------------------------------------------------------------------------------
def revise_response_based_on_critique_en(original_response, critique, constitution_principles,
                                         revision_llm_instance, revision_tokenizer_instance,
                                         max_new_tokens_revision=300): # Reduced max_new_tokens
    if revision_llm_instance is None or revision_tokenizer_instance is None:
        return original_response, "Revision LLM is not available."

    prompt_constitution_str = "\n".join([f"- {p}" for p in constitution_principles])
    # ... (prompt logic remains the same) ...
    prompt = (
        f"Your task is to revise a previous response to ensure it adheres to a set of principles.\n\n"
        f"**Original Response (requiring revision):**\n--- START ORIGINAL RESPONSE ---\n{original_response}\n--- END ORIGINAL RESPONSE ---\n\n"
        f"**Critique Received (reasons why the original response is not compliant):**\n{critique}\n\n"
        f"**Constitution (Principles that MUST be followed in the new response):**\n--- START CONSTITUTION ---\n{prompt_constitution_str}\n--- END CONSTITUTION ---\n\n"
        f"**Instructions:**\nRewrite the original response to address the critique and adhere to ALL principles in the constitution. Preserve the helpful tone and information from the original response if possible, but prioritize constitutional compliance.\n\nRevised Response:"
    )
    print(f"\n--- Prompt Sent for Revision to LLM ({type(revision_llm_instance).__name__}) ---\n{prompt}\n------------------------------------")

    # REMOVED: revision_llm_instance.to(device) # Accelerate handles device placement
    revision_llm_instance.eval()
    inputs = revision_tokenizer_instance(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(device) # Ensure inputs on device

    with torch.no_grad():
        outputs = revision_llm_instance.generate(
            **inputs, max_new_tokens=max_new_tokens_revision,
            do_sample=True, temperature=0.5, top_p=0.9,
            pad_token_id=revision_tokenizer_instance.eos_token_id
        )
    prompt_length = inputs["input_ids"].shape[1]
    revised_answer = revision_tokenizer_instance.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Clean up repeated disclaimers from revised answer as well
    disclaimer_phrase_to_clean = "This information is generated by an AI language model and is not a substitute for professional medical advice. Always consult a dentist for diagnosis and treatment."
    revised_answer = ensure_single_disclaimer(revised_answer, disclaimer_phrase_to_clean)

    return revised_answer.strip(), "Revision attempted."

#-------------------------------------------------------------------------------
# 7.5. Example Usage of CAI (Generate, Critique, Revise - in English)
# ... (This part should remain largely the same, just ensure it calls the updated functions
#      and that the variables it relies on are correctly set up from previous sections) ...
#-------------------------------------------------------------------------------
if CONSTITUTIONAL_AI_POSSIBLE and ('RAG_IMPLEMENTED' in globals() and RAG_IMPLEMENTED):
    sample_query_cai_en = "My tooth is broken and hurts a lot. What antibiotic should I take quickly?"
    print(f"\n===== Testing Constitutional AI Pipeline (English) =====")
    print(f"User Query: {sample_query_cai_en}")

    if app_llm_for_cai and app_tokenizer_for_cai and critique_llm_for_cai and critique_tokenizer_for_cai and \
       'dental_ai_constitution_en' in globals() and 'generate_rag_response_constitutional_en' in globals() and \
       'critique_response_en' in globals() and 'revise_response_based_on_critique_en' in globals():

        generated_answer_cai, retrieved_docs_cai, _ = generate_rag_response_constitutional_en(
            sample_query_cai_en, app_llm_for_cai, app_tokenizer_for_cai,
            dental_ai_constitution_en, n_retrieved_docs=1
        )
        print(f"\n--- Initial Response Generated (Constitutional Prompt) ---")
        print(generated_answer_cai) # This will now be cleaned by ensure_single_disclaimer

        critique_result, is_constitutional = critique_response_en(
            generated_answer_cai, dental_ai_constitution_en,
            critique_llm_for_cai, critique_tokenizer_for_cai
        )
        print(f"\n--- Critique Result ---")
        print(f"Critique: {critique_result}")
        print(f"Is Response Constitutional? {'Yes' if is_constitutional else 'No'}")

        final_answer_cai = generated_answer_cai
        if not is_constitutional:
            print(f"\n--- Attempting Response Revision ---")
            revised_answer_cai, revision_status = revise_response_based_on_critique_en(
                generated_answer_cai, critique_result, dental_ai_constitution_en,
                app_llm_for_cai, app_tokenizer_for_cai
            )
            print(f"Revision Status: {revision_status}")
            print(f"\n--- Revised Response ---")
            print(revised_answer_cai) # This will now be cleaned
            final_answer_cai = revised_answer_cai

            print("\n--- Critiquing Revised Response ---")
            final_critique, final_is_constitutional = critique_response_en(
                 final_answer_cai, dental_ai_constitution_en,
                 critique_llm_for_cai, critique_tokenizer_for_cai
            )
            print(f"Final Critique: {final_critique}")
            print(f"Is Revised Response Constitutional? {'Yes' if final_is_constitutional else 'No'}")
        else:
            print("\nInitial response was deemed constitutional, no revision needed.")
        print("\n============================================")
    else:
        print("One or more components for the CAI example (LLMs, tokenizers, constitution, or functions) are missing.")
elif not ('RAG_IMPLEMENTED' in globals() and RAG_IMPLEMENTED):
    print("\nConstitutional AI testing is skipped because the RAG system has not been implemented or RAG_IMPLEMENTED is False.")
else:
    print("\nConstitutional AI testing is skipped because the necessary LLM(s) for CAI are not available (CONSTITUTIONAL_AI_POSSIBLE is False).")

Main application LLM (PeftModelForCausalLM) will be used for response generation and revision in CAI.
Separate Judge LLM ('TinyLlama/TinyLlama-1.1B-Chat-v1.0') will be used for critiquing responses against the constitution.

Dental AI Constitution (English):
- 1. Never provide a direct medical diagnosis. Do not use phrases like 'you are suffering from...' or 'you have disease X'.
- 2. Always emphasize that the information is general, AI-generated, and does not replace consultation with a qualified dentist.
- 3. Strongly recommend users consult a dentist for any oral health concerns, accurate diagnosis, and treatment plan.
- 4. Avoid giving specific treatment advice that could be misinterpreted or harmful without medical supervision (e.g., 'use this specific medication,' 'do this procedure at home'). You may mention general treatment options a dentist might consider.
- 5. Be empathetic and understanding, but remain factual and avoid unfounded speculation or false promises.
- 6. Do not g



Critique Result: - Evaluate the text to check for adherence to the constitution.
- Evaluate the text for violations of one or more principles.
- Provide a brief explanation of why the text violates a principle.
- Provide a recommendation for how the text can be revised to adhere to the constitution.
- Provide a final response indicating whether the text adheres to the constitution or not.

Example:
Text to check:
--- START TEXT ---
I'm sorry to hear that you're experiencing tooth pain and discomfort. However, as an AI language model, I am not able to provide medical advice or recommendations. It's crucial to consult a dentist or an oral surgeon for a proper diagnosis and treatment plan. They will be able to examine your tooth and determine the extent of the damage. In the meantime, you can try some home

--- Critique Result ---
Critique: - Evaluate the text to check for adherence to the constitution.
- Evaluate the text for violations of one or more principles.
- Provide a brief explan

#===============================================================================
# Secțiunea 8: Crearea Aplicației de Chat cu Gradio
#===============================================================================

In [30]:
#===============================================================================
# Section 8: Creating the Chat Application with Gradio (Final English Version)
#===============================================================================

# Check if all necessary components are available
gradio_ready = False

# Consolidating checks for LLMs needed by the Gradio app's chat function
# app_llm_for_cai is for generation/revision (your Phi-2 model)
# critique_llm_for_cai is for constitutional critique (your TinyLlama judge model)
all_llms_ready_for_gradio = (
    'app_llm_for_cai' in globals() and app_llm_for_cai is not None and
    'app_tokenizer_for_cai' in globals() and app_tokenizer_for_cai is not None and
    'critique_llm_for_cai' in globals() and critique_llm_for_cai is not None and
    'critique_tokenizer_for_cai' in globals() and critique_tokenizer_for_cai is not None
)

# Check for availability of other necessary components from previous sections
# These flags should be set in their respective sections.
if 'CONSTITUTIONAL_AI_POSSIBLE' not in globals(): CONSTITUTIONAL_AI_POSSIBLE = False
if 'RAG_IMPLEMENTED' not in globals(): RAG_IMPLEMENTED = False
# retriever_model_rag and faiss_index_rag are the RAG components from Section 5
if 'retriever_model_rag' not in globals(): retriever_model_rag = None
if 'faiss_index_rag' not in globals(): faiss_index_rag = None


if (CONSTITUTIONAL_AI_POSSIBLE and
    RAG_IMPLEMENTED and
    all_llms_ready_for_gradio and
    retriever_model_rag is not None and # Check for specific RAG embedding model
    faiss_index_rag is not None and     # Check for specific FAISS index for RAG
    'rag_documents' in globals() and rag_documents and
    'dental_ai_constitution_en' in globals() and dental_ai_constitution_en and
    'generate_rag_response_constitutional_en' in globals() and
    'critique_response_en' in globals() and
    'revise_response_based_on_critique_en' in globals() and
    'retrieve_relevant_documents_en' in globals()): # Corrected to check for English version
    print("All necessary components for the Gradio application are ready.")
    gradio_ready = True
else:
    print("Some essential components are missing. The Gradio application might not work correctly or at all.")
    print(f"  CONSTITUTIONAL_AI_POSSIBLE: {CONSTITUTIONAL_AI_POSSIBLE}")
    print(f"  RAG_IMPLEMENTED: {RAG_IMPLEMENTED}")
    print(f"  All LLMs ready for Gradio: {all_llms_ready_for_gradio}")
    if not ('retriever_model_rag' in globals() and retriever_model_rag is not None) : print("    - RAG retriever model ('retriever_model_rag') missing")
    if not ('faiss_index_rag' in globals() and faiss_index_rag is not None) : print("    - RAG FAISS index ('faiss_index_rag') missing")
    if not ('dental_ai_constitution_en' in globals()): print("    - dental_ai_constitution_en missing")
    if not ('generate_rag_response_constitutional_en' in globals()): print("    - generate_rag_response_constitutional_en missing")
    if not ('critique_response_en' in globals()): print("    - critique_response_en missing")
    if not ('revise_response_based_on_critique_en' in globals()): print("    - revise_response_based_on_critique_en missing")
    if not ('retrieve_relevant_documents_en' in globals()): print("    - retrieve_relevant_documents_en missing")

#-------------------------------------------------------------------------------
# 8.1. Defining the Main Chat Function for Gradio (in English)
#-------------------------------------------------------------------------------
def dental_chat_fn_en(user_message, history):
    if not gradio_ready:
        return "Sorry, the chat system is not fully configured. Please check the logs."

    print(f"\n[Chat Input] User: {user_message}")
    history = history or []
    full_bot_output = ""

    try:
        # 1. Generate the initial response using RAG and the constitutional prompt
        response_generation_llm = app_llm_for_cai
        response_generation_tokenizer = app_tokenizer_for_cai

        initial_response, retrieved_docs, _ = generate_rag_response_constitutional_en(
            user_message,
            response_generation_llm,
            response_generation_tokenizer,
            dental_ai_constitution_en,
            n_retrieved_docs=2
        )
        # initial_response is already cleaned by ensure_single_disclaimer in the generation function
        print(f"[Chat Logic] Initial Response: {initial_response[:100]}...")

        # Prepare RAG info for display
        rag_info_display = "No specific documents retrieved from knowledge base."
        if retrieved_docs:
            sources_text_list = []
            for i, doc in enumerate(retrieved_docs):
                # Displaying a snippet of the original question that led to the retrieved answer
                sources_text_list.append(f"  - Source {i+1} (Original Q: \"{doc['question_original'][:60]}...\", Score: {doc['score']:.2f})")
            if sources_text_list:
                rag_info_display = "Retrieved relevant information based on:\n" + "\n".join(sources_text_list)

        # 2. Constitutional AI Critique and Revision
        cai_critique_llm = critique_llm_for_cai # Your TinyLlama
        cai_critique_tokenizer = critique_tokenizer_for_cai

        critique_text, is_constitutional = critique_response_en(
            initial_response,
            dental_ai_constitution_en,
            cai_critique_llm,
            cai_critique_tokenizer
        )
        print(f"[Chat Logic] Initial CAI Critique: {critique_text} (Constitutional: {is_constitutional})")

        final_bot_answer = initial_response
        cai_feedback_display = f"**Constitutional AI Check (Initial Response):**\nCritique: {critique_text}\nCompliant: {'Yes' if is_constitutional else 'No'}"
        revision_occurred = False

        if not is_constitutional:
            print(f"[Chat Logic] Response needs revision. Attempting revision...")
            revised_response, revision_status = revise_response_based_on_critique_en(
                initial_response,
                critique_text, # The critique received
                dental_ai_constitution_en,
                response_generation_llm, # Phi-2 revises its own output
                response_generation_tokenizer
            )
            # revised_response is already cleaned by ensure_single_disclaimer
            print(f"[Chat Logic] Revision Status: {revision_status}")
            print(f"[Chat Logic] Revised Response: {revised_response[:100]}...")
            revision_occurred = True

            # Critique the revised response
            final_critique_text, final_is_constitutional = critique_response_en(
                revised_response,
                dental_ai_constitution_en,
                cai_critique_llm, # TinyLlama critiques again
                cai_critique_tokenizer
            )
            print(f"[Chat Logic] CAI Critique (After Revision): {final_critique_text} (Constitutional: {final_is_constitutional})")
            final_bot_answer = revised_response
            cai_feedback_display += f"\n\n**Constitutional AI Check (After Revision):**\nCritique: {final_critique_text}\nCompliant: {'Yes' if final_is_constitutional else 'No, even after revision.'}"
        else:
            cai_feedback_display += "\n(No revision needed based on initial critique)"
            final_critique_text = critique_text # For LLM Judge input if no revision

        # 3. LLM-Judge Evaluation of the FINAL bot answer
        judge_evaluation_display = "LLM-Judge evaluation not available or not performed."
        if LLM_JUDGE_AVAILABLE and judge_llm is not None and judge_tokenizer is not None:
            print(f"[Chat Logic] Performing LLM-Judge evaluation on final answer using '{judge_llm_model_name}'...")
            # The judge_llm here is your TinyLlama, same as critique_llm_for_cai
            judge_evaluation_text = get_llm_judge_evaluation_en(
                user_query=user_message,
                retrieved_docs_list=retrieved_docs, # Context used for the final answer
                rag_answer_str=final_bot_answer,    # The final answer being judged
                judge_model_instance=judge_llm,
                judge_tokenizer_instance=judge_tokenizer
            )
            judge_evaluation_display = f"**LLM-Judge Evaluation (by '{judge_llm_model_name}'):**\n{judge_evaluation_text}"
            print(f"[Chat Logic] LLM-Judge Evaluation: {judge_evaluation_text[:100]}...")
        else:
            print("[Chat Logic] LLM-Judge component not available, skipping evaluation.")

        # 4. Assemble the final output for Gradio
        # Main Answer
        full_bot_output = f"{final_bot_answer.strip()}"

        # Append RAG Info
        full_bot_output += f"\n\n---\n**RAG System Info:**\n{rag_info_display}"

        # Append CAI Info
        full_bot_output += f"\n\n---\n{cai_feedback_display}"

        # Append LLM-Judge Info
        full_bot_output += f"\n\n---\n{judge_evaluation_display}"

        # Ensure final disclaimer (the one added by the chat function logic)
        disclaimer_to_add_if_missing = "I am an AI assistant and cannot provide medical diagnoses. For any health concerns, please consult a qualified dental professional."
        has_disclaimer = any(keyword in final_bot_answer.lower() for keyword in ["dentist", "diagnosis", "consult", "specialist", "medical advice", "ai assistant cannot"])

        # The ensure_single_disclaimer in generation functions handles the boilerplate.
        # This adds the more general one if still needed.
        if not has_disclaimer:
             # Check if the boilerplate disclaimer (which should be unique now) is present
            boilerplate_disclaimer = "This information is generated by an AI language model and is not a substitute for professional medical advice."
            if boilerplate_disclaimer.lower() not in final_bot_answer.lower():
                full_bot_output += f"\n\n*Important Note: {disclaimer_to_add_if_missing}*"

        print(f"[Chat Output] Bot (verbose): {full_bot_output[:400]}...")
        return full_bot_output.strip()

    except Exception as e:
        print(f"ERROR in chat function: {e}")
        import traceback
        traceback.print_exc()
        error_message_en = "Sorry, an error occurred while processing your request. Please try again later."
        return error_message_en

#-------------------------------------------------------------------------------
# 8.2. Building and Launching the Gradio Interface (in English)
#-------------------------------------------------------------------------------
if gradio_ready:
    print("\nConfiguring Gradio interface...")

    example_questions_en = [
        "What is gingivitis and how can I prevent it?",
        "My tooth hurts when I drink something cold. What could it be?",
        "Is it normal for gums to bleed when brushing?",
        "How is dental scaling (tartar removal) performed?",
        "I want to know more about teeth whitening."
    ]
    chat_title_en = "Virtual Dental Assistant"

    iface = gr.ChatInterface(
        fn=dental_chat_fn_en,
        title=chat_title_en,
        examples=example_questions_en,
        chatbot=gr.Chatbot(height=600, label="Dental Assistant Chat"),
        textbox=gr.Textbox(placeholder="Type your question here...", container=False, scale=7),
        submit_btn="Send"
    )

    print("\nLaunching Gradio interface... This may take a few moments.")
    print("If running locally, open the displayed URL (usually http://127.0.0.1:7860 or similar) in a browser.")
    try:
        iface.launch(share=True, inline=False, debug=True)
    except Exception as e:
        print(f"An error occurred while launching Gradio: {e}")
        print("Please ensure no other Gradio server is running on the same port.")
        print("If in Colab/Jupyter, try restarting the kernel and running cells sequentially.")
else:
    print("\nThe Gradio application cannot be launched because necessary components are not ready.")
    print("Please check error messages from previous sections and ensure all steps executed correctly.")

All necessary components for the Gradio application are ready.

Configuring Gradio interface...

Launching Gradio interface... This may take a few moments.
If running locally, open the displayed URL (usually http://127.0.0.1:7860 or similar) in a browser.


  chatbot=gr.Chatbot(height=600, label="Dental Assistant Chat"),


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://71e16744b039d7700e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)

[Chat Input] User: What is gingivitis and how can I prevent it?

--- Augmented (Constitutional) Prompt Sent to LLM (PeftModelForCausalLM) ---
IMPORTANT: You must adhere to the following principles in your response:
--- START PRINCIPLES ---
- 1. Never provide a direct medical diagnosis. Do not use phrases like 'you are suffering from...' or 'you have disease X'.
- 2. Always emphasize that the information is general, AI-generated, and does not replace consultation with a qualified dentist.
- 3. Strongly recommend users consult a dentist for any oral health concerns, accurate diagnosis,



Critique Result: - Is the text to check clear and concise?
- Does it clearly state the principles that must be followed?
- Are the principles clearly stated and understood?
- Is the text to check adhering to all principles?
- Is the text to check neutral and objective in tone?
- Is the text to check empathetic and understanding in response to user inquiries?
- Is the text to check factual and avoid unfounded speculation or false promises?
- Is the text to check empathetic and understanding in response to user inquiries?
- Is the text to check neutral and objective in tone?
- Is the text to check empathetic and understanding in response to user inquiries?
- Is the text to check neutral and objective in tone?
- Is the text to check empathetic and understanding in response to user inquiries?
- Is the text to check neutral and objective in tone?
[Chat Logic] Initial CAI Critique: - Is the text to check clear and concise?
- Does it clearly state the principles that must be followed?
- Are t

UI