In [None]:
%pip install torch

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



ERROR: Invalid requirement: 'torch,'

[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
%pip install sacremoses

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
   ---------------------------------------- 0.0/897.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/897.5 kB ? eta -:--:--
   -- ------------------------------------ 61.4/897.5 kB 825.8 kB/s eta 0:00:02
   ---------- ----------------------------- 235.5/897.5 kB 1.8 MB/s eta 0:00:01
   ---------------------------- ----------- 645.1/897.5 kB 3.7 MB/s eta 0:00:01
   ---------------------------------------- 897.5/897.5 kB 4.7 MB/s eta 0:00:00
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [11]:
import os
import json
import numpy as np
from typing import List, Dict, Any, Optional

import torch
import faiss

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from sentence_transformers import SentenceTransformer

In [12]:
###############################################################################
# 1. Data Loading & Chunking
###############################################################################
def load_qa_dataset(dataset_path: str) -> List[Dict[str, str]]:
    """
    Load a JSONL file where each line is an object:
      { "question": "text", "answer": "text" }
    Returns a list of dicts [{"question": ..., "answer": ...}, ...].
    """
    qa_pairs = []
    with open(dataset_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line.strip())
            qa_pairs.append({
                "question": data.get("question", ""),
                "answer": data.get("answer", "")
            })
    return qa_pairs

def chunk_qa_pairs(qa_data: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """
    Currently, we treat each QA pair as a single chunk.
    More sophisticated chunking logic could be implemented.
    """
    return qa_data

In [13]:
###############################################################################
# 2. Vector Indexing with FAISS
###############################################################################
def build_faiss_index(
    chunks: List[Dict[str, str]],
    embedder: SentenceTransformer,
    index_path: str = "faiss_index.index",
    metadata_path: str = "faiss_index_meta.json"
):
    """
    Creates FAISS index from chunk questions (or Q+A combined) using
    a SentenceTransformer model to generate embeddings.
    Saves the FAISS index and associated metadata as JSON.
    """
    # We'll embed the question or you could embed "question + answer" if you prefer
    texts_to_embed = [c["question"] for c in chunks]
    print(f"Generating embeddings for {len(texts_to_embed)} chunks...")
    embeddings = embedder.encode(texts_to_embed, convert_to_numpy=True, show_progress_bar=True)
    
    # Create FAISS index (L2 distance)
    embedding_dim = embeddings.shape[1]
    print(f"Embedding dimension: {embedding_dim}")
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(embeddings.astype(np.float32))

    # Save index
    faiss.write_index(index, index_path)
    
    # Save metadata
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    print(f"FAISS index built and saved to {index_path} and {metadata_path}.")


def load_faiss_index(
    index_path: str = "faiss_index.index",
    metadata_path: str = "faiss_index_meta.json"
):
    """
    Loads a FAISS index and its associated metadata.
    """
    index = faiss.read_index(index_path)
    with open(metadata_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    print("FAISS index and metadata loaded.")
    return index, metadata


def faiss_retrieve(
    query: str,
    index,
    metadata: List[Dict[str, Any]],
    embedder: SentenceTransformer,
    top_k: int = 3
) -> List[Dict[str, str]]:
    """
    Retrieves the top_k chunks from the FAISS index given a user query
    by embedding the query text and performing similarity search.
    """
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    query_embedding = query_embedding.astype(np.float32)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for i in range(top_k):
        chunk_id = indices[0][i]
        if 0 <= chunk_id < len(metadata):
            results.append(metadata[chunk_id])
    return results

In [40]:
###############################################################################
# 3. Fine-Tuning BioGPT
###############################################################################
class QADataset(torch.utils.data.Dataset):
    """
    A basic dataset for CausalLM training where each example is:
      "Question: {question}\nAnswer: {answer}"
    We will train in a language-modeling style.
    """
    def __init__(self, tokenizer, qa_pairs, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.texts = [
            f"Question: {qa['question']}\nAnswer: {qa['answer']}"
            for qa in qa_pairs
        ]
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        return text


def train_base_model(
    train_data: List[Dict[str, str]],
    model_name_or_path: str = "microsoft/biogpt",
    output_dir: str = "./finetuned_biogpt",
    num_train_epochs: int = 3,
    batch_size: int = 2
):
    """
    Fine-tunes the BioGPT model in a causal language modeling manner on QA pairs,
    and saves both model weights and tokenizer to output_dir.
    """
    # Load model + tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

    # Create dataset
    dataset = QADataset(tokenizer, train_data)

    def collate_fn(batch_texts):
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )
        encoded["labels"] = encoded["input_ids"].clone()
        return encoded

    training_args = TrainingArguments(
        output_dir=output_dir,
        max_steps=50,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        save_steps=10,
        save_total_limit=1,
        logging_steps=5,
        evaluation_strategy="no",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=collate_fn,
    )

    # Train and then save both model & tokenizer
    if len(train_data) > 0:
        trainer.train()
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"✅ Base model fine-tuned and saved to {output_dir}.")
    else:
        print("⚠️ No training data provided; skipping training.")

    return tokenizer, model

In [41]:
###############################################################################
# 4. RLHF Step (Placeholder)
###############################################################################
def perform_rlhf_training(
    base_model,
    feedback_data: List[Dict[str, Any]]
):
    """
    Placeholder for an RLHF procedure.
    In practice:
      - Train a reward model using feedback_data
      - Use PPO or other RL method to adjust base_model
    """
    if len(feedback_data) == 0:
        print("No feedback data provided. Skipping RLHF.")
        return base_model

    # Pseudocode:
    # 1. Train a reward model
    # 2. Run PPO with reward model
    # ...
    print("RLHF training step (placeholder). Returning base model unchanged.")
    return base_model

In [47]:
###############################################################################
# 5. Reranking & 6. Retrieval-Augmented Generation (RAG) + Safety Filter
###############################################################################
def rerank_chunks(chunks: List[Dict[str, str]], query: str) -> List[Dict[str, str]]:
    """
    Trivial reranking example. You might use a cross-encoder or other approach.
    Here, we'll just sort by length of 'answer' descending for demonstration.
    """
    sorted_chunks = sorted(chunks, key=lambda x: len(x["answer"]), reverse=True)
    return sorted_chunks


def rag_engine(
    query: str,
    chunks: List[Dict[str, str]],
    model,
    tokenizer,
    max_length: int = 256
) -> str:
    """
    Improved RAG approach:
      - Take top chunk (after reranking) for context
      - Prompt the model cleanly
      - Apply generation constraints
      - Return only the generated continuation (no prompt)
    """
    if not chunks:
        return "No relevant context found."

    top_chunk = chunks[0]
    prompt = (
        "You are a highly knowledgeable oral health expert.\n"
        "Below is a patient case excerpt. Using ONLY this information, answer the question briefly and accurately. Do NOT repeat the excerpt.\n\n"
        f"Patient excerpt:\n"
        f"  • Question: {top_chunk['question']}\n"
        f"  • Answer: {top_chunk['answer']}\n\n"
        f"User question: {query}\n"
        "Answer:"
    )

    # Tokenize prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    prompt_len = input_ids.shape[-1]

    # Generate with constraints
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            temperature=0.5,
            top_p=0.9,
            no_repeat_ngram_size=3,
            early_stopping=True,
            num_beams=2
        )

    # Slice off the prompt tokens, decode only the new tokens
    generated_ids = output_ids[0][prompt_len:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return answer


def toxicity_and_hallucination_filter(response: str) -> str:
    """
    Minimal placeholder. Real solutions might use a classifier or regex checks
    for harmful content or glaring hallucinations.
    """
    blocked_words = ["badword1", "badword2"]  # etc.
    lower_resp = response.lower()
    if any(bw in lower_resp for bw in blocked_words):
        return "Content removed due to policy."
    return response


###############################################################################
# 7. Response Generation & Feedback
###############################################################################
def generate_response(query: str, model, tokenizer, index, metadata, embedder) -> str:
    # 1) Retrieve top chunks via FAISS
    retrieved_chunks = faiss_retrieve(query, index, metadata, embedder, top_k=3)
    # 2) Rerank them
    reranked_chunks = rerank_chunks(retrieved_chunks, query)
    # 3) RAG
    raw_answer = rag_engine(query, reranked_chunks, model, tokenizer)
    # 4) Safety Filter
    final_answer = toxicity_and_hallucination_filter(raw_answer)
    return final_answer

def collect_feedback(user_id: str, query: str, response: str) -> Dict[str, Any]:
    """
    Store user feedback, e.g., rating or helpfulness, for future RLHF or analytics.
    """
    feedback_entry = {
        "user_id": user_id,
        "query": query,
        "response": response,
        "rating": None  # fill or override in real usage
    }
    # In real usage, you might log to a DB or logging system
    print(f"Collected feedback: {feedback_entry}")
    return feedback_entry

In [48]:
def ensure_tokenizer_in_dir(model_dir: str, base_model_name: str):
    """
    Makes sure that model_dir contains a tokenizer. 
    If not, load from base_model_name and save into model_dir.
    """
    required_files = ["tokenizer_config.json", "vocab.json", "tokenizer.json"]
    # This is a heuristic; some tokenizers use vocab.txt, merges.txt, etc.
    files = os.listdir(model_dir)
    if not any(fname in files for fname in required_files):
        print("⚠️  Tokenizer files not found in", model_dir)
        print("   Rescuing tokenizer from base model and saving into that folder...")
        base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        base_tokenizer.save_pretrained(model_dir)
    else:
        print("✅  Tokenizer files found in", model_dir)

def main():
    ###########################################################################
    # 1. Load QA Dataset
    ###########################################################################
    dataset_path = "./Open-Domain-Oral-Disease-QA-Dataset-main/extracted_all.jsonl"
    qa_data = load_qa_dataset(dataset_path)
    print(f"Loaded {len(qa_data)} QA pairs.")

    # 2. Chunking
    chunks = chunk_qa_pairs(qa_data)

    ###########################################################################
    # 3. Build (or load) FAISS Index
    ###########################################################################
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    index_path = "faiss_index.index"
    meta_path  = "faiss_index_meta.json"

    if not (os.path.exists(index_path) and os.path.exists(meta_path)):
        build_faiss_index(chunks, embedder, index_path, meta_path)
    else:
        print("FAISS index found; skipping rebuild.")
    index, metadata = load_faiss_index(index_path, meta_path)

    ###########################################################################
    # 4. Fine-tune or Load the base BioGPT
    ###########################################################################
    from transformers import AutoTokenizer, AutoModelForCausalLM

    model_dir       = "./finetuned_biogpt"
    base_model_name = "microsoft/biogpt"
    weights_path    = os.path.join(model_dir, "pytorch_model.bin")

    # If the fine-tuned weights aren't there, we must train (which also saves tokenizer).
    if not os.path.exists(weights_path):
        print("🔨 Fine-tuned weights not found—starting training now.")
        tokenizer, base_model = train_base_model(
            train_data=qa_data,
            model_name_or_path=base_model_name,
            output_dir=model_dir,
            num_train_epochs=3,
            batch_size=2
        )
    else:
        print("📂 Fine-tuned weights found; loading model + tokenizer…")
        # Rescue a missing tokenizer if needed
        ensure_tokenizer_in_dir(model_dir, base_model_name)

        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        tokenizer.pad_token = tokenizer.eos_token
        base_model = AutoModelForCausalLM.from_pretrained(model_dir)

    ###########################################################################
    # 5. RLHF Step (Placeholder)
    ###########################################################################
    feedback_data = []
    rl_model      = perform_rlhf_training(base_model, feedback_data)

    ###########################################################################
    # 6 & 7. Demo RAG with sample query
    ###########################################################################
    user_query  = "What are common symptoms of gum inflammation?"
    final_answer = generate_response(
        query=   user_query,
        model=   rl_model,
        tokenizer=tokenizer,
        index=   index,
        metadata=metadata,
        embedder=embedder
    )

    print("\nUser Query:", user_query)
    print("System Answer:", final_answer)

    # 8. Collect feedback
    collect_feedback("test_user", user_query, final_answer)


if __name__ == "__main__":
    main()

Loaded 672 QA pairs.
FAISS index found; skipping rebuild.
FAISS index and metadata loaded.
🔨 Fine-tuned weights not found—starting training now.




Step,Training Loss
5,2.144
10,0.7949
15,0.5899
20,0.2042
25,0.1417
30,0.0916
35,0.1482
40,0.0778
45,0.0894
50,0.0649


SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"