In [12]:
%%writefile app_gradio.py

import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import PeftModel
import os
import gc

# --- Load HF Token (Colab Secrets first, then environment variables) ---
def resolve_hf_token(environ=None):
    """Return a Hugging Face access token, or None when none can be found.

    Lookup order:
      1. The Colab secret named 'HF_TOKEN' (via google.colab.userdata).
      2. The HF_TOKEN, then HUGGING_FACE_HUB_TOKEN, entries of *environ*.

    The Colab lookup is best-effort: it can fail even on Colab (for example
    when this file is executed as a standalone script instead of inside the
    notebook kernel), so every failure is reported and the environment is
    consulted next instead of giving up.

    Args:
        environ: Mapping to read the fallback variables from. Defaults to
            os.environ; mainly overridable to make the lookup testable.

    Returns:
        The token with surrounding whitespace removed, or None. Empty and
        whitespace-only values are treated as "no token".
    """
    if environ is None:
        environ = os.environ

    token = None
    try:
        from google.colab import userdata
        token = userdata.get('HF_TOKEN')
        if token:
            print("HF_TOKEN retrieved successfully from Colab Secrets.")
        else:
            print("WARNING: Colab Secret 'HF_TOKEN' found but might be empty or notebook access is disabled.")
    except ImportError:
        print("INFO: 'google.colab userdata' not found. This is expected if not running in Google Colab.")
    except Exception as e:
        # Deliberately broad: any Colab-side failure must not prevent the fallback below.
        print(f"An unexpected error occurred trying to access Colab Secrets: {e}")

    token = (token or "").strip()
    if not token:
        for var_name in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
            candidate = (environ.get(var_name) or "").strip()
            if candidate:
                print(f"{var_name} retrieved from environment variables.")
                token = candidate
                break

    # Normalise "" to None so callers can pass the value straight to token=...
    return token or None


hf_token = resolve_hf_token()

if not hf_token:
    print("WARNING: Hugging Face token (HF_TOKEN) was not found. Model loading might fail if it's private/gated.")
# --- End Token Loading ---

# Only ERROR-level messages from the transformers logger are shown.
logging.set_verbosity(logging.ERROR)

# --- Configuration ---
# Hub repositories: the quantized base model and the LoRA adapters applied on top.
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
adapter_repo_id = "sujoy0011/kiit-llama2-lora-adapters"

# --- Global Variables ---
# Assigned by load_model_resources(); they remain None until it has run.
model = tokenizer = pipe = None
model_loaded_successfully = False  # Flag to track loading status
compute_dtype = torch.float16  # Default

# --- Model Loading Function ---
def _error_mentions_memory(message):
    """Return True when *message* refers to RAM or to running out of memory.

    "RAM" is matched as a whole word only, so unrelated words that merely
    contain those letters (e.g. "parameter", "framework") do not trigger the
    memory hint.
    """
    import re  # local import: only needed on the error path

    return re.search(r"\bRAM\b|out of memory", message, re.IGNORECASE) is not None


def load_model_resources():
    """Load the base model, tokenizer and LoRA adapters, then build the pipeline.

    Stages, in order:
      1. Check that CUDA is available and pick the compute dtype from the
         device capability.
      2. Load ``base_model_name`` with 4-bit (nf4, double-quant) quantization.
      3. Load the tokenizer from ``adapter_repo_id``, falling back to
         ``base_model_name`` if that fails.
      4. Wrap the base model with the LoRA adapters from ``adapter_repo_id``.
      5. Create the "text-generation" pipeline.

    Side effects:
        Assigns the module globals ``model``, ``tokenizer``, ``pipe`` and
        ``compute_dtype``, and sets ``model_loaded_successfully`` to True once
        every stage has succeeded.

    Returns:
        True on success.

    Raises:
        gr.Error: if CUDA is unavailable or any loading stage fails. For stage
            failures the underlying exception is chained as ``__cause__``.
    """
    global model, tokenizer, pipe, compute_dtype, model_loaded_successfully

    print("--- Starting Model Loading (Gradio/Colab) ---")

    if not torch.cuda.is_available():
        raise gr.Error("CUDA is not available. Please ensure you selected a GPU runtime in Colab.")

    # Determine compute dtype: capability major >= 8 selects bfloat16,
    # anything older falls back to float16.
    major, minor = torch.cuda.get_device_capability()
    if major >= 8:
        compute_dtype = torch.bfloat16
        dtype_msg = "bfloat16"
    else:
        compute_dtype = torch.float16
        dtype_msg = "float16"
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}, Capability: compute_{major}{minor}. Using compute dtype: {dtype_msg}")

    # Quantization Config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True,
    )

    # Load Base Model Quantized
    print(f"Loading base model: {base_model_name} quantized...")
    try:
        # NOTE(review): trust_remote_code=True allows repository-provided code to run;
        # it may be unnecessary for this base model - confirm before removing.
        model_base = AutoModelForCausalLM.from_pretrained(
            base_model_name, quantization_config=bnb_config, device_map="auto",
            trust_remote_code=True, token=hf_token, low_cpu_mem_usage=True
        )
        print("Base model loaded.")
    except Exception as e:
        details = str(e)
        lowered = details.lower()
        error_msg = f"Error loading base model: {e}"
        if _error_mentions_memory(details):
            error_msg += "\n\nHint: Colab RAM limit?"
        elif "authenticat" in lowered or "gated repo" in lowered:
            # Covers "authentication" as well as "authenticated" / gated-repo wording
            # (presumably what the Hub reports for missing access - TODO confirm).
            error_msg += "\n\nHint: Check HF Token/Login & Llama 2 terms."
        print(error_msg)
        raise gr.Error(f"Failed to load base model. Error: {e}") from e

    # Load Tokenizer (adapter repo first, base model as fallback)
    print(f"Loading tokenizer from adapter repo: {adapter_repo_id}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(adapter_repo_id, token=hf_token)
        print("Tokenizer loaded.")
    except Exception as e:
        print(f"Warning: Could not load tokenizer from adapter repo ({e}). Loading from base model.")
        try:
            tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
            print("Loaded tokenizer from base model instead.")
        except Exception as e_base_tok:
            print(f"Error loading tokenizer: {e_base_tok}")
            raise gr.Error(f"Failed to load tokenizer. Error: {e_base_tok}") from e_base_tok

    # Load LoRA Adapters
    print(f"Loading LoRA adapters from repo: {adapter_repo_id}...")
    try:
        model = PeftModel.from_pretrained(model_base, adapter_repo_id, token=hf_token, device_map='auto')
        print("LoRA adapters loaded successfully.")
    except Exception as e:
        details = str(e)
        error_msg = f"Error loading LoRA adapters from {adapter_repo_id}: {e}"
        if _error_mentions_memory(details):
            error_msg += "\n\nHint: RAM limit?"
        elif "adapter_config.json" in details:
            error_msg += f"\n\nHint: Check '{adapter_repo_id}'."
        print(error_msg)
        raise gr.Error(f"Failed to load LoRA adapters. Error: {e}") from e

    # Inference only: switch the wrapped model to evaluation mode.
    model.eval()

    # Set up Generation Pipeline
    print("Setting up text generation pipeline...")
    try:
        pipe = pipeline(
            "text-generation", model=model, tokenizer=tokenizer,
            torch_dtype=compute_dtype, device_map="auto"
        )
        print("Pipeline ready.")
    except Exception as e:
        print(f"Error creating pipeline: {e}")
        raise gr.Error(f"Failed to create generation pipeline. Error: {e}") from e

    print("--- Model loading and pipeline setup complete. ---")
    model_loaded_successfully = True  # Set flag only after every stage succeeded
    return True

# --- Initial Model Load ---
# Executed when the module is run or imported. A loading failure is reported on
# stdout instead of being raised, so the rest of the module still executes.
model_loaded_successfully = False
try:
    print("Attempting initial model load...")
    load_model_resources()
except Exception as load_exc:
    print(f"FATAL: Initial model loading failed: {load_exc}")

# --- Inference Function ---
def generate_kiit_response(user_prompt):
    """Generate the assistant's answer to one question about KIIT.

    The question is wrapped in a Llama-2 chat prompt (system prompt inside
    <<SYS>> tags, user text inside [INST] ... [/INST]) and passed to the global
    text-generation pipeline.

    Args:
        user_prompt: The user's question as a string.

    Returns:
        The generated answer as a string. This function does not raise for
        generation problems; instead it returns a human-readable message when
        the model is not loaded, the prompt is empty, nothing was generated,
        or generation raised an exception.
    """
    global pipe, tokenizer, model_loaded_successfully
    if not model_loaded_successfully or pipe is None or tokenizer is None:
        return "Error: Model resources not loaded. Check logs."

    # Nothing to answer: avoid sending an empty instruction to the model
    # (and avoid slicing None below).
    if not user_prompt or not user_prompt.strip():
        return "Please enter a question about KIIT."

    print(f"\nGenerating response for prompt: {user_prompt[:100]}...")
    system_prompt = "You are a helpful assistant knowledgeable about Kalinga Institute of Industrial Technology (KIIT). Provide concise and accurate information based on the user's question about KIIT."
    llama_prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt} [/INST]"

    try:
        sequences = pipe(
            llama_prompt, do_sample=True, top_k=10, num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id, max_new_tokens=350,
            # Ask for the completion only. Splitting the full text on '[/INST]'
            # would cut the answer short whenever the answer itself contains
            # that marker.
            return_full_text=False,
        )
        generated_text = sequences[0]['generated_text']
        # Defensive: if the prompt was echoed back anyway, drop exactly that prefix.
        if generated_text.startswith(llama_prompt):
            generated_text = generated_text[len(llama_prompt):]
        response_part = generated_text.strip()

        # eos_token may be None for some tokenizers; only strip it when defined.
        eos_token = tokenizer.eos_token
        if eos_token and response_part.endswith(eos_token):
            response_part = response_part[:-len(eos_token)].strip()

        if not response_part:
            response_part = "Sorry, I couldn't generate a specific response."
        print(f"Generated response: {response_part[:100]}...")
        return response_part
    except Exception as e:
        # Best-effort: surface the failure in the chat instead of crashing the UI callback.
        print(f"Error during text generation: {e}")
        return f"Sorry, an error occurred during generation: {e}"

# --- Gradio Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Heading and one-line description shown at the top of the page.
    gr.Markdown("# KIITGPT Chatbot")
    gr.Markdown("Chatbot powered by Llama-2-7B, fine-tuned with LoRA on KIIT information.")

    # Components, created in display order: history, input box, reset button.
    chatbot = gr.Chatbot(label="Chat History", height=350)
    msg = gr.Textbox(
        label="Your Question",
        placeholder="Type your question about KIIT here...",
    )
    clear = gr.Button("Clear Chat")

    def user(user_message, history):
        """Empty the textbox and append the question with a pending (None) answer."""
        extended_history = history + [[user_message, None]]
        return "", extended_history

    def bot(history):
        """Fill in the answer of the most recent turn and return the history."""
        latest_turn = history[-1]
        question = latest_turn[0]
        try:
            reply = generate_kiit_response(question)
        except gr.Error as e:
            reply = f"Error: {e}"
        except Exception as e:
            reply = f"Unexpected error: {e}"
        latest_turn[1] = reply
        return history

    # On submit: record the question first (unqueued), then produce the answer.
    submit_event = msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    )
    submit_event.then(fn=bot, inputs=chatbot, outputs=chatbot)

    # The button replaces the chat history with an empty list.
    clear.click(fn=lambda: [], inputs=None, outputs=chatbot, queue=False)

    # --- Footer ---
    footer_html = """
    <div style="text-align: center; margin-top: 20px; font-size: small; color: #888;">
        Made with ❤️ by <a href="https://www.linkedin.com/in/dutta-sujoy/" target="_blank" style="color: #007bff; text-decoration: none;">Sujoy</a>
    </div>
    """
    gr.HTML(footer_html)
    # --- End Footer ---


# --- Launch the Interface ---
if __name__ == "__main__":
    # The UI is started either way; the message only tells the operator whether
    # the model and pipeline are available.
    resources_ready = model_loaded_successfully and pipe is not None
    if resources_ready:
        print("\nModel loaded. Launching Gradio interface...")
    else:
        print("\nWARNING: Model not loaded correctly. UI launching, but generation will fail.")

    # share=True for Colab public URL
    demo.launch(debug=True, share=True)


Overwriting app_gradio.py


In [5]:
# Install the libraries imported by app_gradio.py (gradio, torch, transformers, peft)
# together with the other packages listed here.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases
# than the ones this notebook was last executed with - consider pinning.
!pip install -q gradio torch transformers accelerate bitsandbytes peft sentencepiece huggingface_hub importlib-metadata


In [6]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login for this notebook (renders a login widget).
# Paste your token (hf_...) with 'read' access (or 'write' if needed later).
# NOTE(review): app_gradio.py looks up HF_TOKEN on its own and is not handed the
# token entered here; presumably the login is stored where the Hugging Face
# libraries can find it - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Run the script produced by the %%writefile cell as a separate Python process.
# Model loading and the Gradio launch happen in that process, not in this kernel;
# its console output is streamed below.
!python app_gradio.py

2025-04-18 11:22:25.919177: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744975345.940034    9908 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744975345.946273    9908 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-18 11:22:25.967477: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
An unexpected error occurred trying to access Colab Secrets: 'NoneType' object has no attribute 'kernel'
Attempting i