In [None]:
!pip install gradio vllm transformers triton PyMuPDF Pillow sentence_transformers numpy typing open_clip_torch numpy faiss-cpu translatepy
!python -m spacy download en_core_web_sm


Collecting gradio
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting vllm
  Downloading vllm-0.6.4.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (10 kB)
Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting typing
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting open_clip_torch
  Downloading open_clip_torch-2.29.0-py3-none-any.whl.metadata (31 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting translatepy
  Downloading translatepy-2.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles

In [None]:
import torch
from vllm import LLM, SamplingParams
import spacy
from sentence_transformers import SentenceTransformer
from PIL import Image

def get_prompt_for_llava_next(question: str, modality: str):
    """Wrap a question in the LLaVA-NeXT instruction template.

    Args:
        question: Text to ask about the attached image.
        modality: Kind of attachment; must be "image".

    Returns:
        A ``(prompt, stop_token_ids)`` pair. ``stop_token_ids`` is always
        ``None``.

    Raises:
        AssertionError: If ``modality`` is anything other than "image".
    """
    assert modality == "image", "Only 'image' modality is supported."
    # The "<image>" placeholder goes on its own line, ahead of the question.
    return f"[INST] <image>\n{question} [/INST]", None

def initialize_llm():
    """Construct the vLLM engine for the LLaVA model.

    Progress and failures are reported with ``print``; exceptions raised by
    the constructor are swallowed.

    Returns:
        The constructed ``LLM`` instance, or ``None`` if construction raised.
    """
    engine_options = {
        "model": "llava-hf/llava-v1.6-mistral-7b-hf",
        "dtype": 'half',
        "max_model_len": 8192,
    }
    print("Loading LLaVA model...")
    try:
        engine = LLM(**engine_options)
        print("LLaVA model loaded successfully!")
        return engine
    except Exception as err:
        print(f"Error occurred while initializing LLaVA model: {str(err)}")
        return None

def initialize_spacy():
    """Load the ``en_core_web_sm`` spaCy pipeline.

    Progress and failures are reported with ``print``; exceptions raised while
    loading are swallowed.

    Returns:
        The loaded pipeline, or ``None`` if loading raised.
    """
    pipeline = None
    print("Loading SpaCy model...")
    try:
        pipeline = spacy.load("en_core_web_sm")
        print("SpaCy model loaded successfully!")
    except Exception as err:
        print(f"Error occurred while initializing SpaCy model: {str(err)}")
        return None
    return pipeline

def initialize_sentence_transformer():
    """Load the ``all-MiniLM-L6-v2`` SentenceTransformer.

    Progress and failures are reported with ``print``; exceptions raised while
    loading are swallowed.

    Returns:
        The ``SentenceTransformer`` instance, or ``None`` if loading raised.
    """
    model = None
    print("Loading SentenceTransformer...")
    try:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        print("SentenceTransformer loaded successfully!")
    except Exception as err:
        print(f"Error occurred while initializing SentenceTransformer: {str(err)}")
        return None
    return model

def initialize_models():
    """Run every model loader and hand back the results together.

    Returns:
        ``(llm, nlp, embedder)`` in that order. An element is ``None`` when
        its loader reported a failure.
    """
    # Tuple elements are evaluated left to right, so the loaders still run in
    # the order LLaVA -> spaCy -> SentenceTransformer.
    return (
        initialize_llm(),
        initialize_spacy(),
        initialize_sentence_transformer(),
    )

# Initialize models
print("Starting model initialization...")
GLOBAL_LLM, GLOBAL_NLP, GLOBAL_EMBEDDER = initialize_models()

# The LLaVA model is mandatory: every response is generated with it.
if GLOBAL_LLM is None:
    raise ValueError("Failed to initialize LLaVA model.")

# spaCy and the sentence embedder are only used for PDF retrieval, so a failure
# there does not abort the notebook -- but it must not be reported as success
# either (each loader returns None on failure).
_missing_optional_models = [
    label
    for label, model in (("SpaCy", GLOBAL_NLP), ("SentenceTransformer", GLOBAL_EMBEDDER))
    if model is None
]
if _missing_optional_models:
    print(
        f"Warning: failed to initialize {', '.join(_missing_optional_models)}; "
        "PDF question answering will not work."
    )
else:
    print("All models have been successfully initialized!")


Starting model initialization...
Loading LLaVA model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

INFO 12-02 20:26:12 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 12-02 20:26:12 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='llava-hf/llava-v1.6-mistral-7b-hf', speculative_config=None, tokenizer='llava-hf/llava-v1.6-mistral-7b-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=llava-hf/llav

tokenizer_config.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

INFO 12-02 20:26:13 selector.py:135] Using Flash Attention backend.
INFO 12-02 20:26:14 model_runner.py:1072] Starting to load model llava-hf/llava-v1.6-mistral-7b-hf...
INFO 12-02 20:26:15 weight_utils.py:243] Using model weights format ['*.safetensors']


model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.2k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-02 20:28:17 model_runner.py:1077] Loading model weights took 14.0785 GB
INFO 12-02 20:28:20 worker.py:232] Memory profiling results: total_gpu_memory=39.56GiB initial_memory_usage=14.62GiB peak_torch_memory=14.94GiB memory_usage_post_profile=14.66GiB non_torch_memory=0.57GiB kv_cache_size=20.09GiB gpu_memory_utilization=0.90
INFO 12-02 20:28:21 gpu_executor.py:113] # GPU blocks: 10288, # CPU blocks: 2048
INFO 12-02 20:28:21 gpu_executor.py:117] Maximum concurrency for 8192 tokens per request: 20.09x
INFO 12-02 20:28:23 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-02 20:28:23 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
I

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer loaded successfully!
All models have been successfully initialized!


In [1]:
import gradio as gr
from vllm.sampling_params import SamplingParams
import fitz  # PyMuPDF
import faiss
import torch
from PIL import Image
from translatepy import Translator

# Initialize the translator
translator = Translator()


def translate_text(text, src="auto", dest="en"):
    """Translate ``text`` from language ``src`` to language ``dest``.

    generate_response() calls this helper for Korean <-> English translation,
    but the notebook never defined it, so enabling "Use Translation" always
    failed with a NameError.

    Args:
        text: Text to translate.
        src: Source language name or code (e.g. "ko"); "auto" lets the
            translator detect it.
        dest: Destination language name or code (e.g. "en").

    Returns:
        The translated string. Empty or whitespace-only input is returned
        unchanged without contacting a translation service.
    """
    if not text or not text.strip():
        return text
    translation = translator.translate(text, destination_language=dest, source_language=src)
    # translatepy returns a result object; the translated string is `.result`.
    return translation.result

# 이미지 처리
def process_image(image):
    """Package an image and a fixed question as a LLaVA-NeXT request.

    Nothing is generated here: the function only builds the input dict that
    the caller passes to the model.

    Args:
        image: The image object to attach under ``multi_modal_data``.

    Returns:
        ``(inputs, stop_token_ids)`` on success, where ``inputs`` is a dict
        with the keys "prompt" and "multi_modal_data". On failure the first
        element is an error message string and the second is ``None``.
    """
    try:
        prompt, stop_token_ids = get_prompt_for_llava_next(
            "What is the content of this image?", "image"
        )
        request = {"prompt": prompt, "multi_modal_data": {"image": image}}
    except Exception as err:
        return f"Error processing image: {str(err)}", None
    return request, stop_token_ids

def process_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Either a filesystem path, or an object exposing the path as
            its ``.name`` attribute (the form the original code required).

    Returns:
        The page texts concatenated as ``"Page N:\\n<text>\\n\\n"`` blocks.
        If no page contains any text, the message
        ``"No text found in the PDF."`` is returned instead. If anything
        raises, the result is ``"Error processing PDF: <reason>"``.
    """
    try:
        # Accept both an upload object carrying `.name` and a bare path.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        # Open the PDF in a context manager so the document is always closed,
        # even when text extraction fails part-way through.
        with fitz.open(pdf_path) as doc:
            page_texts = [page.get_text("text") for page in doc]

        # Check the extracted text itself. Checking the assembled output (as
        # before) could never detect a text-less PDF, because the "Page N:"
        # headers alone made it non-empty.
        if not any(page_text.strip() for page_text in page_texts):
            return "No text found in the PDF."

        return "".join(
            f"Page {page_number}:\n" + page_text + "\n\n"
            for page_number, page_text in enumerate(page_texts, start=1)
        )
    except Exception as e:
        return f"Error processing PDF: {str(e)}"

# PDF 텍스트를 Chunk 단위로 나누기
def chunk_pdf_text(pdf_text):
    """Split PDF text into sentence chunks using the shared spaCy pipeline.

    Args:
        pdf_text: Text to split.

    Returns:
        A list of sentence strings with surrounding whitespace removed.
        Sentences that are empty after stripping are dropped, so blank lines
        between pages do not become retrievable "chunks". Empty or
        whitespace-only input yields ``[]``.

    Raises:
        RuntimeError: If the spaCy pipeline (``GLOBAL_NLP``) failed to load.
    """
    if not pdf_text or not pdf_text.strip():
        return []
    if GLOBAL_NLP is None:
        # initialize_spacy() returns None on failure; say so explicitly rather
        # than failing with "'NoneType' object is not callable".
        raise RuntimeError("SpaCy model is not loaded; cannot split PDF text into chunks.")

    doc = GLOBAL_NLP(pdf_text)
    stripped_sentences = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped_sentences if sentence]

# PDF Chunk 임베딩 생성 및 검색
def create_embeddings(chunks):
    """Encode text chunks with the shared sentence embedder.

    Args:
        chunks: The texts to embed.

    Returns:
        Whatever ``GLOBAL_EMBEDDER.encode`` returns for ``chunks``.
    """
    return GLOBAL_EMBEDDER.encode(chunks)

def index_chunks(embeddings):
    """Build a flat L2 FAISS index over the given embeddings.

    Args:
        embeddings: Non-empty sequence of equally sized vectors; the length of
            the first vector is used as the index dimension.

    Returns:
        The ``faiss.IndexFlatL2`` instance with all embeddings added.
    """
    vector_size = len(embeddings[0])
    flat_index = faiss.IndexFlatL2(vector_size)
    flat_index.add(embeddings)
    return flat_index

def retrieve_top_k(index, query, chunks, k=3):
    """Return the chunks most similar to ``query`` according to the index.

    Args:
        index: Search index exposing ``search(query_embedding, k)`` and
            returning ``(distances, indices)``.
        query: Query text; it is embedded with ``GLOBAL_EMBEDDER``.
        chunks: The texts the index was built from, in indexing order.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunks in the order reported by the index. Returns ``[]``
        when there are no chunks or ``k`` is not positive.
    """
    if not chunks or k <= 0:
        return []

    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with the label -1, and chunks[-1] would silently return the last
    # chunk as if it were a match.
    k = min(k, len(chunks))

    query_embedding = GLOBAL_EMBEDDER.encode([query])
    _, indices = index.search(query_embedding, k)

    # Defensive filter in case the index still reports padding or stale labels.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Gradio 응답 생성 함수
def generate_response(user_input, image, pdf, use_translation, history):
    """Answer one chat turn, optionally grounded in an uploaded image and/or PDF.

    Args:
        user_input: The user's question.
        image: Optional image. When given, the model first generates a
            description of it, which is added to the context.
        pdf: Optional uploaded PDF, passed to process_pdf().
        use_translation: When true, the question is translated ko -> en before
            prompting and the answer is translated en -> ko before returning.
        history: List of context strings from earlier turns. It is never
            modified in place.

    Returns:
        ``(response_text, updated_history)``. When the turn cannot be
        completed (PDF/image problem or any exception), ``response_text`` is
        the error message and the history is returned exactly as received.
    """
    # Build the turn on a copy: if anything fails half-way, the session state
    # must not keep a question without an answer or partially added context.
    turns = list(history or [])

    def remember(entry):
        """Add a context entry unless the identical entry is already stored."""
        if entry not in turns:
            turns.append(entry)

    try:
        # Translate the question only when the user asked for it.
        if use_translation:
            english_input = translate_text(user_input, src="ko", dest="en")
        else:
            english_input = user_input  # use the text as-is

        # PDF handling
        if pdf is not None:
            pdf_text = process_pdf(pdf)
            # process_pdf() reports problems as plain messages instead of
            # raising. Show them to the user (as is done for image errors)
            # rather than indexing the message as if it were document text.
            if pdf_text == "No text found in the PDF." or pdf_text.startswith("Error processing PDF: "):
                return pdf_text, history

            # The file input keeps its value between turns, so the same PDF
            # arrives again with every question; store its text only once.
            remember(f"PDF Content: {pdf_text}")

            chunks = chunk_pdf_text(pdf_text)
            if chunks:
                index = index_chunks(create_embeddings(chunks))
                retrieved_chunks = retrieve_top_k(index, english_input, chunks)
                turns.append(f"Relevant PDF Chunks: {retrieved_chunks}")

        # Image handling
        if image is not None:
            inputs, stop_token_ids = process_image(image)
            if isinstance(inputs, str):  # process_image() returned an error message
                return inputs, history

            caption_params = SamplingParams(
                temperature=0.2,
                max_tokens=128,
                stop_token_ids=stop_token_ids
            )
            outputs = GLOBAL_LLM.generate(inputs, sampling_params=caption_params)
            image_feedback = outputs[0].outputs[0].text.strip()
            remember(f"Image Description: {image_feedback}")

        # Add the current question after its supporting context.
        turns.append(f"User: {english_input}")

        # Combine everything gathered so far into a single text.
        history_text = "\n".join(turns)

        # Create prompt
        prompt = f"""
You are a professional AI assistant. Answer the following question based on the context provided:
{history_text}
Answer:
"""

        # Generate the answer with the LLaVA model.
        answer_params = SamplingParams(
            temperature=0.3,
            max_tokens=500
        )
        outputs = GLOBAL_LLM.generate({"prompt": prompt}, sampling_params=answer_params)
        english_response = outputs[0].outputs[0].text.strip()

        if use_translation:
            final_response = translate_text(english_response, src="en", dest="ko")
        else:
            final_response = english_response

        # The history always stores the English answer, matching the English
        # questions, so later prompts stay in one language.
        turns.append(f"AI: {english_response}")

        return final_response, turns
    except Exception as e:
        return f"An error occurred: {str(e)}", history

# Gradio 인터페이스 실행
def chat_interface():
    """Assemble the Gradio Blocks UI for the chatbot.

    The layout has a question box with a translation checkbox, optional image
    and PDF uploads, and a read-only response box. Pressing Enter in the
    question box and clicking the button both run generate_response() with
    the same inputs and outputs.

    Returns:
        The ``gr.Blocks`` object; it is not launched here.
    """
    with gr.Blocks() as interface:
        gr.Markdown("## 🤖 AI 챗봇 🤖")

        with gr.Row():
            user_input = gr.Textbox(
                label="Your Question",
                placeholder="Ask a question...",
                lines=1,
            )
            use_translation = gr.Checkbox(label="Use Translation", value=False)

        with gr.Row():
            image_input = gr.Image(label="Upload Image (optional)", type="pil")
            pdf_input = gr.File(label="Upload PDF (optional)", file_types=[".pdf"])

        with gr.Row():
            response_output = gr.Textbox(
                label="Chatbot Response",
                lines=15,
                max_lines=25,
                interactive=False,
            )

        # Per-session conversation history, threaded through every call.
        history_state = gr.State([])

        def wire(trigger):
            """Attach generate_response to an event with the shared inputs/outputs."""
            trigger(
                generate_response,
                inputs=[user_input, image_input, pdf_input, use_translation, history_state],
                outputs=[response_output, history_state],
            )

        # Same creation/registration order as before: submit, button, click.
        wire(user_input.submit)
        send_button = gr.Button("Generate Response")
        wire(send_button.click)

    return interface

# Run Gradio
if __name__ == "__main__":
    # Report whether the LLaVA engine object is truthy (i.e. was loaded)
    # before starting the UI.
    print(f"LLaVA Model Loaded: {bool(GLOBAL_LLM)}")
    # Build the Blocks app, keep a module-level handle to it, then launch it.
    chat_app = chat_interface()
    chat_app.launch()


ModuleNotFoundError: No module named 'gradio'

In [None]:
!pip install --upgrade vllm transformers


Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled transformers-4.46.2
Successfully installed transformers-4.46.3
