In [8]:
from langchain.chat_models import ChatVertexAI
from langchain.schema import HumanMessage
from langchain.schema.messages import SystemMessage
from langchain.schema.document import Document
from PIL import Image
import base64
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/Admin/Data/MultimodalRag_Web_app/serene-craft-464519-j1-963d43d7e20e.json"
# Load ảnh
with open("image.png", "rb") as f:
    image_bytes = f.read()
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

# Khởi tạo mô hình Gemini Vision
llm = ChatVertexAI(model_name="gemini-1.5-flash", temperature=0.2)

response = llm.invoke([
    SystemMessage(content="You are a helpful assistant that understands images."),
    HumanMessage(content=[
        {"type": "text", "text": "What’s happening in this image?"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
    ])
])

print(response.content)


PydanticUserError: `ChatVertexAI` is not fully defined; you should define `_LanguageModel`, then call `ChatVertexAI.model_rebuild()`.

For further information visit https://errors.pydantic.dev/2.11/u/class-not-fully-defined

In [None]:
import google.generativeai as genai
import base64
from PIL import Image
from io import BytesIO
import os

# Config API Key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Load ảnh
def image_to_bytes(path):
    with Image.open(path) as img:
        buf = BytesIO()
        img.save(buf, format='PNG')
        return buf.getvalue()

image_bytes = image_to_bytes("image.png")

# Khởi tạo Gemini Vision model
model = genai.GenerativeModel("gemini-1.5-flash")

# Gửi ảnh và câu hỏi
response = model.generate_content([
    {"mime_type": "image/png", "data": image_bytes},
    "What is in this image?"
])

print(response.text)


This image is a combined bar and line graph showing the average monthly temperature and rainfall.  The bars represent the average monthly temperature in degrees (the scale is not explicitly labeled but appears to be in Celsius or Fahrenheit), while the line shows the average monthly rainfall (likely in millimeters or inches, the scale on the right y-axis is 0 to 400).  The graph covers the twelve months of the year, from January to December.  The data suggests a warmer climate with higher rainfall peaking in the late summer/early autumn months.



In [3]:
import base64

import httpx
from langchain.chat_models import init_chat_model
import os
# Fetch audio data
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/Admin/Data/MultimodalRag_Web_app/serene-craft-464519-j1-963d43d7e20e.json"

with open("image.png", "rb") as f:
    image_bytes = f.read()
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

# Pass to LLM
llm = init_chat_model("google_genai:gemini-2.0-flash-001")

message = {
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "Describe the weather in this image:",
        },
        # highlight-start
        {
            "type": "image",
            "source_type": "base64",
            "data": image_b64,
            "mime_type": "image/png",
        },
        # highlight-end
    ],
}
response = llm.invoke([message])
print(response.text())

Based on the graph, the weather pattern is characterized by:

*   **Temperature:** Temperatures are lowest in January and February, gradually increasing to peak in July and August. After August, temperatures start to decline again towards the end of the year.
*   **Rainfall:** Rainfall is relatively low in the earlier months (January-March), increases to a peak around June, and then decreases towards the end of the year.
In short, the area experiences warm, wet summers and cool, dry winters.


In [None]:
import fitz  # PyMuPDF
import os

def extract_images_from_pdf(pdf_path, output_folder):
    doc = fitz.open(pdf_path)
    image_paths = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"page_{page_num+1}_img_{img_index+1}.{image_ext}"
            image_path = os.path.join(output_folder, image_filename)

            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)

            image_paths.append(image_path)

    doc.close()
    return image_paths

In [None]:
from google.cloud import storage

def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_path)

    # Trả về public URL
    return blob.public_url

In [None]:
import fitz  # PyMuPDF
import os
from google.cloud import storage
# from pdfplumber.pdf import PDF as pdfplumber_PDF


# =================== BƯỚC 1: TRÍCH XUẤT ẢNH TỪ PDF =================== #
def extract_images_from_pdf(pdf_path, output_folder):
    """
    Trích xuất tất cả ảnh trong file PDF và lưu vào thư mục local
    """
    doc = fitz.open(pdf_path)
    image_paths = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"page_{page_num+1}_img_{img_index+1}.{image_ext}"
            image_path = os.path.join(output_folder, image_filename)

            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)

            image_paths.append({
                'path': image_path,
                'page_num': page_num + 1,
                'img_index': img_index + 1
            })

    doc.close()
    return image_paths


# =================== BƯỚC 2: LẤY TEXT GẦN ẢNH NHẤT =================== #
def get_text_near_image(pdf_path, page_num, image_bbox):
    """
    Tìm text gần vị trí ảnh nhất để làm tên/caption
    image_bbox: [x0, y0, x1, y1] - tọa độ bounding box của ảnh
    """
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_num - 1)
    text_dict = page.get_text("dict")
    blocks = text_dict.get("blocks", [])

    nearby_texts = []
    for block in blocks:
        if "lines" in block and block["type"] == 0:  # Chỉ văn bản
            for line in block["lines"]:
                for span in line["spans"]:
                    span_rect = fitz.Rect(span["bbox"])
                    if image_bbox.intersects(span_rect):
                        nearby_texts.append(span["text"])

    doc.close()
    return " ".join(nearby_texts[:5]) or "No description found"


# =================== BƯỚC 3: UPLOAD VÀO GCS =================== #
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """
    Upload file lên Google Cloud Storage và trả về public URL
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_path)

    return blob.public_url


# =================== BƯỚC 4: HÀM CHẠY TOÀN BỘ QUY TRÌNH =================== #
def process_pdf(pdf_path, output_folder, gcs_bucket):
    """
    Chạy toàn bộ quy trình:
    - Trích xuất ảnh
    - Lấy mô tả gần ảnh nhất
    - Upload lên GCS
    - Trả về danh sách ảnh với mô tả và URL
    """
    os.makedirs(output_folder, exist_ok=True)

    print("[+] Bắt đầu trích xuất ảnh từ PDF...")
    image_info_list = extract_images_from_pdf(pdf_path, output_folder)

    results = []

    print(f"[+] Đang xử lý {len(image_info_list)} ảnh...")
    for idx, info in enumerate(image_info_list):
        image_path = info['path']
        page_num = info['page_num']
        img_index = info['img_index']

        try:
            # Giả sử không có bbox thực tế => dùng bbox mặc định (giữa trang)
            doc = fitz.open(pdf_path)
            page = doc.load_page(page_num - 1)
            image_bbox = page.rect  # giả định ảnh ở giữa trang
            doc.close()

            print(f"[+] Phân tích nội dung cho ảnh {idx+1}...")
            caption = get_text_near_image(pdf_path, page_num, image_bbox)
        except Exception as e:
            print(f"[!] Lỗi phân tích nội dung ảnh {idx+1}: {e}")
            caption = "No description found"

        try:
            print(f"[+] Upload ảnh {idx+1} lên GCS...")
            destination_blob = f"images/{os.path.basename(image_path)}"
            image_url = upload_to_gcs(gcs_bucket, image_path, destination_blob)

            results.append({
                "caption": caption,
                "gcs_url": image_url,
                "filename": os.path.basename(image_path)
            })
        except Exception as e:
            print(f"[!] Lỗi upload ảnh {idx+1}: {e}")

    return results


# =================== BƯỚC 5: CHẠY DEMO =================== #
if __name__ == "__main__":
    PDF_PATH = "sample.pdf"               # Đường dẫn đến file PDF của bạn
    OUTPUT_FOLDER = "./extracted_images"  # Thư mục chứa ảnh sau khi trích xuất
    GCS_BUCKET_NAME = "your-gcs-bucket"   # Tên bucket trên GCP

    result_list = process_pdf(PDF_PATH, OUTPUT_FOLDER, GCS_BUCKET_NAME)

    print("\n\n✅ Hoàn tất xử lý. Kết quả:")
    for res in result_list:
        print(f"- Tên ảnh: {res['filename']}")
        print(f"  Mô tả: {res['caption']}")
        print(f"  URL: {res['gcs_url']}\n")

In [2]:
import os
from google.cloud import storage
from google.api_core.exceptions import Conflict, NotFound
import fitz  # PyMuPDF

# Set biến môi trường xác thực GCP
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/Admin/Data/MultimodalRag_Web_app/serene-craft-464519-j1-963d43d7e20e.json"


# =================== HÀM: TẠO BUCKET NẾU CHƯA CÓ =================== #
def create_bucket_if_not_exists(bucket_name, location="asia-southeast1"):
    """
    Kiểm tra xem bucket có tồn tại không.
    Nếu chưa có → tự động tạo mới.
    """
    client = storage.Client()
    try:
        bucket = client.get_bucket(bucket_name)
        print(f"[+] Bucket '{bucket_name}' đã tồn tại.")
        return bucket
    except NotFound:
        print(f"[-] Bucket '{bucket_name}' chưa tồn tại. Đang tạo mới...")
        try:
            bucket = client.create_bucket(bucket_name, location=location)
            print(f"[+] Bucket '{bucket_name}' đã được tạo thành công!")
            return bucket
        except Conflict as e:
            print(f"[!] Lỗi: Có thể bucket đã được tạo bởi người khác hoặc tên bị trùng?")
            raise e
    except Exception as e:
        print(f"[!] Lỗi khi kiểm tra bucket: {e}")
        raise e


# =================== HÀM: UPLOAD ẢNH LÊN GCS =================== #
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """
    Upload file lên Google Cloud Storage và trả về public URL
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_path)
    return blob.public_url


# =================== HÀM: TRÍCH XUẤT ẢNH TỪ PDF =================== #
def extract_images_from_pdf(pdf_path, output_folder):
    """
    Trích xuất tất cả ảnh trong file PDF và lưu vào thư mục local
    """
    doc = fitz.open(pdf_path)
    image_paths = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"page_{page_num+1}_img_{img_index+1}.{image_ext}"
            image_path = os.path.join(output_folder, image_filename)

            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)

            image_paths.append({
                'path': image_path,
                'page_num': page_num + 1,
                'img_index': img_index + 1
            })

    doc.close()
    return image_paths


# =================== HÀM: TÌM TEXT GẦN ẢNH NHẤT =================== #
def get_text_near_image(pdf_path, page_num, image_bbox):
    """
    Tìm text gần vị trí ảnh nhất để làm caption/mô tả
    image_bbox: [x0, y0, x1, y1] - tọa độ bounding box của ảnh
    """
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_num - 1)
    text_dict = page.get_text("dict")
    blocks = text_dict.get("blocks", [])

    nearby_texts = []
    for block in blocks:
        if "lines" in block and block["type"] == 0:  # Chỉ văn bản thường
            for line in block["lines"]:
                for span in line["spans"]:
                    span_rect = fitz.Rect(span["bbox"])
                    if image_bbox.intersects(span_rect):
                        nearby_texts.append(span["text"])

    doc.close()
    return " ".join(nearby_texts[:5]) or "No description found"


# =================== HÀM: XỬ LÝ TOÀN BỘ FILE PDF =================== #
def process_pdf(pdf_path, output_folder, gcs_bucket):
    """
    Quy trình xử lý PDF:
    - Trích xuất ảnh
    - Tạo mô tả ảnh dựa trên nội dung xung quanh
    - Upload lên GCS
    - Trả về danh sách ảnh với mô tả và URL
    """
    os.makedirs(output_folder, exist_ok=True)

    print("[+] Bắt đầu trích xuất ảnh từ PDF...")
    image_info_list = extract_images_from_pdf(pdf_path, output_folder)

    results = []

    print(f"[+] Đang xử lý {len(image_info_list)} ảnh...")
    for idx, info in enumerate(image_info_list):
        image_path = info['path']
        page_num = info['page_num']

        try:
            doc = fitz.open(pdf_path)
            page = doc.load_page(page_num - 1)
            image_bbox = page.rect  # Giả định ảnh ở giữa trang
            doc.close()

            print(f"[+] Phân tích nội dung cho ảnh {idx+1}...")
            caption = get_text_near_image(pdf_path, page_num, image_bbox)
        except Exception as e:
            print(f"[!] Lỗi phân tích nội dung ảnh {idx+1}: {e}")
            caption = "No description found"

        try:
            print(f"[+] Upload ảnh {idx+1} lên GCS...")
            destination_blob = f"images/{os.path.basename(image_path)}"
            image_url = upload_to_gcs(gcs_bucket, image_path, destination_blob)

            results.append({
                "caption": caption,
                "gcs_url": image_url,
                "filename": os.path.basename(image_path)
            })
        except Exception as e:
            print(f"[!] Lỗi upload ảnh {idx+1}: {e}")

    return results


# =================== DEMO CHẠY THỬ =================== #
if __name__ == "__main__":
    PDF_PATH = "C:/Users/Admin/Data/MultimodalRag_Web_app/src/ml/tests/sample/2c98e99a08ec5392d50e60370d871319.pdf"              
    OUTPUT_FOLDER = "./extracted_images"  # Thư mục chứa ảnh sau khi trích xuất
    GCS_BUCKET_NAME = "my-multimodalrag-images"  # Thay bằng tên bucket thực tế

    # Bước 1: Tạo bucket nếu chưa có
    create_bucket_if_not_exists(GCS_BUCKET_NAME)

    # Bước 2: Xử lý PDF và upload ảnh lên GCS
    result_list = process_pdf(PDF_PATH, OUTPUT_FOLDER, GCS_BUCKET_NAME)

    print("\n\n✅ Hoàn tất xử lý. Kết quả:")
    for res in result_list:
        print(f"- Tên ảnh: {res['filename']}")
        print(f"  Mô tả: {res['caption']}")
        print(f"  URL: {res['gcs_url']}\n")

[-] Bucket 'my-multimodalrag-images' chưa tồn tại. Đang tạo mới...
[+] Bucket 'my-multimodalrag-images' đã được tạo thành công!
[+] Bắt đầu trích xuất ảnh từ PDF...
[+] Đang xử lý 4 ảnh...
[+] Phân tích nội dung cho ảnh 1...
[+] Upload ảnh 1 lên GCS...
[+] Phân tích nội dung cho ảnh 2...
[+] Upload ảnh 2 lên GCS...
[+] Phân tích nội dung cho ảnh 3...
[+] Upload ảnh 3 lên GCS...
[+] Phân tích nội dung cho ảnh 4...
[+] Upload ảnh 4 lên GCS...


✅ Hoàn tất xử lý. Kết quả:
- Tên ảnh: page_1_img_1.jpeg
  Mô tả: List   of   accolades
  URL: https://storage.googleapis.com/my-multimodalrag-images/images/page_1_img_1.jpeg

- Tên ảnh: page_1_img_2.jpeg
  Mô tả: List   of   accolades
  URL: https://storage.googleapis.com/my-multimodalrag-images/images/page_1_img_2.jpeg

- Tên ảnh: page_1_img_3.png
  Mô tả: List   of   accolades
  URL: https://storage.googleapis.com/my-multimodalrag-images/images/page_1_img_3.png

- Tên ảnh: page_8_img_1.png
  Mô tả: Privacy   policy About  
  URL: https://storage