In [1]:


# Uninstall bitsandbytes first to ensure a clean install, especially after previous errors.
# The -y flag confirms "yes" to any prompts during uninstallation.


# Install all necessary libraries for the RAG pipeline.
# -q makes the installation quiet.
# We ensure 'bitsandbytes' is included for LLaMA's 8-bit quantization.
# 'torch' and 'tensorflow' are included as 'chromadb' or 'transformers' might have dependencies.
!pip install -q pandas chromadb sentence-transformers transformers accelerate bitsandbytes torch tensorflow
!pip install -q evaluate rouge_score sacrebleu scikit-learn
!pip install -q gradio


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m64.8 MB/s[0m eta [36m

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import numpy as np
import evaluate # For BLEU, ROUGE
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm # For progress bars
import uuid # This import is crucial for generating unique IDs in ChromaDB

# --- Configuration (ENGLISH ONLY) ---
# Your single English CSV file.
# Make sure 'OPD Schedule for Department of ANAESTHESIA.csv' has been uploaded to your Colab session.
CSV_FILE_PATH = 'OPD Schedule for Department of ANAESTHESIA.csv'
CHROMA_DB_PATH = 'hospital_english_chroma_db' # Changed DB path for clarity
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2' # English-optimized embedding model
LLAMA_MODEL_NAME = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' # Using TinyLlama for accessibility.
                                                     # Replace with 'meta-llama/Llama-2-7b-chat-hf' if you have access and token setup.

In [3]:
# --- 1. Data Ingestion & Preprocessing (ENGLISH ONLY) ---
def load_and_preprocess_data(file_path): # Now accepts a single file path
    """
    Loads data from the 'OPD Schedule for Department of ANAESTHESIA.csv' file,
    and uses the 'Answer' column as the knowledge base chunks.
    """
    print("Loading and preprocessing OPD Schedule data...")
    all_chunks = []
    text_column = 'Answer' # The column to extract text from

    print(f"  - Loading data from: {file_path}")
    try:
        df = pd.read_csv(file_path)

        if text_column not in df.columns:
            print(f"    ERROR: Column '{text_column}' not found in '{file_path}'. Please ensure your CSV has an 'Answer' column.")
            return []

        chunks = df[text_column].astype(str).tolist()
        chunks = [chunk for chunk in chunks if chunk.strip()] # Remove empty chunks
        all_chunks.extend(chunks)
        print(f"    Loaded {len(chunks)} chunks from '{file_path}'.")

    except FileNotFoundError:
        print(f"    ERROR: File not found: '{file_path}'. Please ensure it is uploaded to Colab.")
        return []
    except Exception as e:
        print(f"    ERROR processing '{file_path}': {e}")
        return []

    # Remove duplicate chunks (optional, but good practice)
    all_chunks = list(set(all_chunks))
    print(f"Total unique chunks loaded: {len(all_chunks)}")
    return all_chunks

In [4]:
# --- 2. Embedding Model Initialization ---
def initialize_embedding_model(model_name):
    """Initializes the Sentence Transformer model for embeddings."""
    print(f"Initializing embedding model: {model_name}...")
    model = SentenceTransformer(model_name)
    print("Embedding model initialized.")
    return model

In [5]:
# --- 3. ChromaDB Setup and Data Storage (REVISED with UUIDs) ---
def setup_chromadb(db_path, chunks, embedding_model):
    """
    Sets up ChromaDB, embeds the chunks, and stores them.
    This revised version ensures truly unique IDs using UUIDs.
    """
    print(f"Setting up ChromaDB at: {db_path}...")
    client = PersistentClient(path=db_path)
    collection_name = "hospital_english_knowledge" # Using a new collection name for English data

    try:
        collection = client.get_or_create_collection(name=collection_name)
        print(f"ChromaDB collection '{collection_name}' ready. Current count: {collection.count()}")

        # Retrieve existing document texts to avoid adding identical content if desired.
        # However, with UUIDs, each addition will have a unique ID regardless.
        existing_doc_texts_in_db = set()
        if collection.count() > 0:
            existing_docs_data = collection.get(include=['documents'])
            existing_doc_texts_in_db = set(existing_docs_data.get('documents', []))

        new_chunks_to_add = []
        new_ids_to_add = []

        # Iterate through your input chunks.
        # Generate a new UUID for every chunk to guarantee uniqueness.
        for chunk in chunks:
            # You can uncomment the following 'if' block if you want to explicitly skip
            # adding text content that is IDENTICAL to what's already in the DB.
            # if chunk in existing_doc_texts_in_db:
            #     continue

            new_chunks_to_add.append(chunk)
            new_ids_to_add.append(str(uuid.uuid4())) # Generate a unique UUID for each chunk

        if new_chunks_to_add:
            print(f"Embedding and adding {len(new_chunks_to_add)} new chunks to ChromaDB...")
            embeddings = embedding_model.encode(new_chunks_to_add, show_progress_bar=True).tolist()
            collection.add(
                documents=new_chunks_to_add,
                embeddings=embeddings,
                ids=new_ids_to_add
            )
            print(f"Added {len(new_chunks_to_add)} chunks to ChromaDB. Total chunks: {collection.count()}")
        else:
            print("No new chunks to add, or all chunks already exist. ChromaDB is up-to-date.")

    except Exception as e:
        print(f"Error setting up ChromaDB: {e}")
        return None
    return collection

In [6]:
# --- 4. Retrieval Function ---
def retrieve_relevant_chunks(query, collection, embedding_model, n_results=3):
    """Retrieves the most relevant chunks from ChromaDB for a given query."""
    print(f"Retrieving {n_results} relevant chunks for query: '{query}'")
    query_embedding = embedding_model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
        include=['documents']
    )
    relevant_chunks = results['documents'][0]
    print("Retrieved chunks:")
    for i, chunk in enumerate(relevant_chunks):
        print(f"  {i+1}. {chunk[:100]}...") # Print first 100 chars to show the content
    return relevant_chunks

In [7]:
# --- 5. LLaMA Model Initialization and Generation (ENGLISH ONLY) ---
def initialize_llama_model(model_name):
    """Initializes the LLaMA model and tokenizer."""
    print(f"Initializing LLaMA model: {model_name}...")
    # NOTE: If using 'meta-llama/Llama-2-7b-chat-hf', you'll likely need to:
    # 1. Accept its license on Hugging Face.
    # 2. Generate a Hugging Face API token.
    # 3. Uncomment and use the login command below:
    # from huggingface_hub import login; login(token="hf_YOUR_TOKEN_HERE") # Replace with your actual HF token

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True, # Enables 8-bit quantization for memory efficiency on Colab's free GPU
        torch_dtype=torch.float16, # Uses float16 for further memory reduction
        device_map="auto" # Automatically maps model layers to available devices (e.g., GPU)
    )
    # Creates a text generation pipeline for easy inference
    text_generation_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=200, # Sets the maximum length for the generated answer
        temperature=0.7,    # Controls randomness; lower values make output more deterministic
        top_p=0.9,          # Nucleus sampling; considers tokens with cumulative probability up to top_p
        repetition_penalty=1.15 # Penalizes repeated words to encourage diversity
    )
    print("LLaMA model initialized.")
    return text_generation_pipeline

def generate_answer(query, relevant_chunks, text_generation_pipeline):
    """Generates an answer using the LLaMA model, grounded in retrieved chunks."""
    print("Generating answer with LLaMA...")
    # Combines retrieved chunks into a single context string
    context = "\n".join(relevant_chunks)
    # Formulates a prompt instructing the LLM to answer based *only* on the provided context
    prompt = f"""
    You are a helpful hospital assistant specializing in OPD schedules for Anaesthesia.
    Answer the following question only based on the provided context about OPD timings and consultants.
    If the answer cannot be found in the context, politely state that you don't have enough information.

    Context:
    {context}

    Question: {query}

    Answer:
    """
    # Uses the text generation pipeline to get a response
    response = text_generation_pipeline(prompt)
    generated_text = response[0]['generated_text']

    # Attempts to extract only the answer part from the generated text,
    # as the model might sometimes repeat the prompt.
    if "Answer:" in generated_text:
        answer_start_index = generated_text.find("Answer:") + len("Answer:")
        final_answer = generated_text[answer_start_index:].strip()
    else:
        final_answer = generated_text.replace(prompt.strip(), "", 1).strip()
        if final_answer.startswith("You are a helpful hospital assistant"):
             final_answer = final_answer.split("Answer:", 1)[-1].strip() if "Answer:" in final_answer else final_answer

    print("Answer generated.")
    return final_answer

In [8]:
# --- 6. Evaluation Metrics Setup ---
def setup_evaluation_metrics():
    """Sets up evaluation metrics (BLEU, ROUGE). F1 and Cosine Similarity are calculated manually."""
    print("Setting up evaluation metrics...")
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    print("Evaluation metrics ready (BLEU, ROUGE).")
    return bleu, rouge

def evaluate_answer(generated_answer, reference_answer, embedding_model, bleu, rouge):
    """Evaluates the generated answer against a reference using various metrics."""
    print("Evaluating generated answer...")
    # Calculate BLEU score
    bleu_results = bleu.compute(predictions=[generated_answer], references=[[reference_answer]])
    # Calculate ROUGE scores
    rouge_results = rouge.compute(predictions=[generated_answer], references=[[reference_answer]])

    # --- Manual F1 Score Calculation (token-based for simplicity) ---
    generated_tokens = set(generated_answer.lower().split())
    reference_tokens = set(reference_answer.lower().split())
    common_tokens = generated_tokens.intersection(reference_tokens)

    precision = len(common_tokens) / len(generated_tokens) if len(generated_tokens) > 0 else 0
    recall = len(common_tokens) / len(reference_tokens) if len(reference_tokens) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # --- Cosine Similarity Calculation (using embeddings) ---
    # Embed both the generated and reference answers
    gen_embedding = embedding_model.encode([generated_answer])
    ref_embedding = embedding_model.encode([reference_answer])
    # Calculate cosine similarity between their embeddings
    cos_sim = cosine_similarity(gen_embedding, ref_embedding)[0][0]

    print("\n--- Evaluation Results ---")
    print(f"BLEU: {bleu_results['bleu']:.4f}")
    print(f"ROUGE-L: {rouge_results['rougeL']:.4f}") # ROUGE-L is often a key metric
    print(f"F1 Score (token-based): {f1_score:.4f}")
    print(f"Cosine Similarity: {cos_sim:.4f}")
    print("------------------------")

    return bleu_results, rouge_results, f1_score, cos_sim

In [9]:
# --- Main Pipeline Execution (ENGLISH ONLY) ---
if __name__ == "__main__":
    print(f"Attempting to load data from: {CSV_FILE_PATH}")

    # Step 1: Load and Preprocess Data
    hospital_chunks = load_and_preprocess_data(CSV_FILE_PATH)
    if not hospital_chunks:
        print("ERROR: Exiting due to data loading error or empty chunks. Please check CSV_FILE_PATH and ensure the 'Answer' column exists and has content.")
    else:
        # Step 2: Initialize Embedding Model
        embedding_model = initialize_embedding_model(EMBEDDING_MODEL_NAME)

        # Step 3: Setup ChromaDB and Store Data
        chroma_collection = setup_chromadb(CHROMA_DB_PATH, hospital_chunks, embedding_model)

        if chroma_collection:
            print("\nChromaDB setup successful. Proceeding with LLaMA initialization and query processing.")
            # --- Test Query ---
            user_query = "Who is the consultant for Pain Clinic on Wednesdays?"
            # You can change this to any question relevant to your CSV data, e.g.:
            # "What are the OPD registration timings for the Department of Anaesthesia?"
            # "When does the Anaesthesia evening clinic open?"
            # "Which doctors are available in PAC APC OPD?"
            # "Where is PAC APC OPD located?"

            # --- Reference Answer (for evaluation) ---
            # IMPORTANT: Change this to the correct answer for your 'user_query'
            reference_answer = "Dr. Babita Ghai is the consultant for Pain Clinic on Wednesdays."

            # Step 4: Retrieve Relevant Chunks
            relevant_docs = retrieve_relevant_chunks(user_query, chroma_collection, embedding_model)

            # Step 5: Initialize and Use LLaMA for Generation
            text_generation_pipeline = None # Initialize to None
            try:
                text_generation_pipeline = initialize_llama_model(LLAMA_MODEL_NAME)
                print(f"LLaMA model initialization status: {text_generation_pipeline is not None}")
            except Exception as e:
                print(f"ERROR: LLaMA model initialization failed: {e}")
                print("Please double-check `bitsandbytes` installation and ensure runtime was restarted immediately after the initial installation/upgrade.")

            if text_generation_pipeline: # Only proceed if LLaMA pipeline initialized successfully
                generated_answer = generate_answer(user_query, relevant_docs, text_generation_pipeline)
                print(f"\nGenerated Answer: {generated_answer}")

                # Step 6: Evaluation
                bleu, rouge = setup_evaluation_metrics()
                evaluate_answer(generated_answer, reference_answer, embedding_model, bleu, rouge)
            else:
                print("RAG system components (LLaMA model) not fully initialized. Cannot proceed with generation/evaluation.")
        else:
            print("ERROR: ChromaDB setup failed. Cannot proceed with retrieval and generation. Please review the `setup_chromadb` function and potential data issues.")

Attempting to load data from: OPD Schedule for Department of ANAESTHESIA.csv
Loading and preprocessing OPD Schedule data...
  - Loading data from: OPD Schedule for Department of ANAESTHESIA.csv
    Loaded 19 chunks from 'OPD Schedule for Department of ANAESTHESIA.csv'.
Total unique chunks loaded: 15
Initializing embedding model: all-MiniLM-L6-v2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model initialized.
Setting up ChromaDB at: hospital_english_chroma_db...
ChromaDB collection 'hospital_english_knowledge' ready. Current count: 0
Embedding and adding 15 new chunks to ChromaDB...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Added 15 chunks to ChromaDB. Total chunks: 15

ChromaDB setup successful. Proceeding with LLaMA initialization and query processing.
Retrieving 3 relevant chunks for query: 'Who is the consultant for Pain Clinic on Wednesdays?'
Retrieved chunks:
  1. Monday to Friday: Morning Clinic 8:00 am - 11:00 am, Evening Clinic 2:00 pm - 3:00 pm. Saturday and ...
  2. Dr. Indu Mohini Sen and Dr. Preethy J Mathew....
  3. Dr. Jeetinder Kaur Makkar and Dr. Neerja Bharti....
Initializing LLaMA model: TinyLlama/TinyLlama-1.1B-Chat-v1.0...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


LLaMA model initialized.
LLaMA model initialization status: True
Generating answer with LLaMA...
Answer generated.

Generated Answer: 1. Dr. Anoop Gupta (Pain Management)
    2. Dr. Ashutosh Mishra (Oral Maxillofacial Surgery)
    3. Dr. Shruti Sinha (Orthopedics)
    4. Dr. Prasanth Sharma (Urology)
    5. Dr. Ajay Chakraborty (Gynaecology)

    Choice (A): Dr. Anoop Gupta
    Correct Answer: B
Setting up evaluation metrics...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics ready (BLEU, ROUGE).
Evaluating generated answer...

--- Evaluation Results ---
BLEU: 0.0000
ROUGE-L: 0.0851
F1 Score (token-based): 0.0500
Cosine Similarity: 0.5127
------------------------


In [10]:
# --- RAG Assistant Function ---
def answer_question_rag(user_query, embedding_model, chroma_collection, text_generation_pipeline):
    """
    Takes a user query and returns a generated answer using the RAG pipeline.
    """
    if not embedding_model or not chroma_collection or not text_generation_pipeline:
        return "RAG system components not fully initialized. Please ensure all models and database are loaded."

    print(f"\nUser Query received: '{user_query}'")
    # Step 1: Retrieve relevant chunks
    relevant_docs = retrieve_relevant_chunks(user_query, chroma_collection, embedding_model, n_results=3)

    # Step 2: Generate answer
    generated_answer = generate_answer(user_query, relevant_docs, text_generation_pipeline)

    return generated_answer

In [None]:
# --- Main Pipeline Execution with Gradio Interface ---
if __name__ == "__main__":
    print(f"Attempting to load data from: {CSV_FILE_PATH}")

    # Initialize global variables for RAG components
    global embedding_model, chroma_collection, text_generation_pipeline # Declare as global
    embedding_model = None
    chroma_collection = None
    text_generation_pipeline = None

    # Step 1: Load and Preprocess Data
    hospital_chunks = load_and_preprocess_data(CSV_FILE_PATH)
    if not hospital_chunks:
        print("ERROR: Exiting due to data loading error or empty chunks. Please check CSV_FILE_PATH and ensure the 'Answer' column exists and has content.")
    else:
        # Step 2: Initialize Embedding Model
        embedding_model = initialize_embedding_model(EMBEDDING_MODEL_NAME)

        # Step 3: Setup ChromaDB and Store Data
        chroma_collection = setup_chromadb(CHROMA_DB_PATH, hospital_chunks, embedding_model)

        if chroma_collection:
            print("\nChromaDB setup successful. Proceeding with LLaMA initialization.")
            # Step 5: Initialize LLaMA for Generation
            try:
                text_generation_pipeline = initialize_llama_model(LLAMA_MODEL_NAME)
                print(f"LLaMA model initialization status: {text_generation_pipeline is not None}")
            except Exception as e:
                print(f"ERROR: LLaMA model initialization failed: {e}")
                print("Please double-check `bitsandbytes` installation and ensure runtime was restarted immediately after the initial installation/upgrade.")

            if text_generation_pipeline: # Only launch Gradio if all components initialized
                print("\n--- RAG Assistant Ready! Launching Gradio Interface ---")

                import gradio as gr # Import Gradio here

                # Create a Gradio interface
                # The fn (function) is our RAG logic
                # The inputs are a text box for the query
                # The outputs are a text box for the answer
                iface = gr.Interface(
                    fn=lambda query: answer_question_rag(query, embedding_model, chroma_collection, text_generation_pipeline),
                    inputs=gr.Textbox(lines=2, placeholder="Ask a question about the OPD schedule..."),
                    outputs="text",
                    title="🏥 Hospital Anaesthesia OPD Assistant",
                    description="Ask me anything about the Anaesthesia department's OPD schedule, timings, and consultants.",
                    allow_flagging="never" # Disable flagging feature
                )

                # Launch the Gradio app. share=True creates a public, shareable link.
                iface.launch(debug=True, share=True)
            else:
                print("RAG system components (LLaMA model) not fully initialized. Gradio interface will not be launched.")
        else:
            print("ERROR: ChromaDB setup failed. Gradio interface will not be launched.")

    # --- Evaluation (Optional - you can remove or run separately if you just want the assistant) ---
    # The evaluation part is typically for development and can be run separately
    # or removed when you just want the interactive assistant.
    # For now, let's keep it commented out to focus on the assistant.
    # if embedding_model and text_generation_pipeline:
    #     user_query_eval = "Who is the consultant for Pain Clinic on Wednesdays?"
    #     reference_answer_eval = "Dr. Babita Ghai is the consultant for Pain Clinic on Wednesdays."
    #     print(f"\n--- Running an example evaluation for: '{user_query_eval}' ---")
    #     generated_answer_eval = answer_question_rag(user_query_eval, embedding_model, chroma_collection, text_generation_pipeline)
    #     bleu, rouge = setup_evaluation_metrics()
    #     evaluate_answer(generated_answer_eval, reference_answer_eval, embedding_model, bleu, rouge)

Attempting to load data from: OPD Schedule for Department of ANAESTHESIA.csv
Loading and preprocessing OPD Schedule data...
  - Loading data from: OPD Schedule for Department of ANAESTHESIA.csv
    Loaded 19 chunks from 'OPD Schedule for Department of ANAESTHESIA.csv'.
Total unique chunks loaded: 15
Initializing embedding model: all-MiniLM-L6-v2...
Embedding model initialized.
Setting up ChromaDB at: hospital_english_chroma_db...
ChromaDB collection 'hospital_english_knowledge' ready. Current count: 15
Embedding and adding 15 new chunks to ChromaDB...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Added 15 chunks to ChromaDB. Total chunks: 30

ChromaDB setup successful. Proceeding with LLaMA initialization.
Initializing LLaMA model: TinyLlama/TinyLlama-1.1B-Chat-v1.0...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Device set to use cpu


LLaMA model initialized.
LLaMA model initialization status: True

--- RAG Assistant Ready! Launching Gradio Interface ---




Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://5747ef0c732141b816.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
