### Setup

In [2]:
!pip install --no-deps -q bitsandbytes

from google.colab import drive

drive.mount('/content/drive')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [3]:
!pip install addict transformers==4.46.3 tokenizers==0.20.3 supervision open-clip-torch

Collecting addict
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting transformers==4.46.3
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers==0.20.3
  Downloading tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting supervision
  Downloading supervision-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting open-clip-torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m100.7 MB/s[

### OCR

In [4]:
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = 'deepseek-ai/DeepSeek-OCR'

In [5]:

from google.colab import drive

drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
"""
Scene Graph Pipeline: Encoder (Object Detection) + Decoder (Relationship Generation)
Processes images to detect objects, then generates relationships between them.
All logs saved to timestamped folder: log_YYMMDDHH
"""

import os
import json
import torch
import itertools
from datetime import datetime
from pathlib import Path
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from PIL import Image
import sys


# ============================================================================
# CONFIGURATION
# ============================================================================

# Input/Output Paths
os.chdir('/content/drive/MyDrive/AER1515_Assignment1/aer1515_project/')
IMAGE_INPUT_DIR = "./frame00180_test"  # Directory containing input images (jpg/png)
OUTPUT_BASE_DIR = "./frame00180_test/logs"  # Base directory for logs

# Model Configuration
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'

# Generation Parameters
MAX_NEW_TOKENS = 64
TEMPERATURE = 0.0

# Relationship Keywords for Extraction
RELATION_KEYWORDS = [
    "on top of", "under", "next to", "beside", "in front of",
    "behind", "inside", "above", "below", "around",
    "attached to", "leaning against"
]

# ============================================================================
# SETUP LOGGING DIRECTORY
# ============================================================================

def setup_log_directory():
    """Create timestamped log directory."""
    timestamp = datetime.now().strftime("%y%m%d%H")
    log_dir = Path(OUTPUT_BASE_DIR) / f"log_{timestamp}"
    log_dir.mkdir(parents=True, exist_ok=True)

    # Create subdirectories
    (log_dir / "encoder_outputs").mkdir(exist_ok=True)
    (log_dir / "decoder_outputs").mkdir(exist_ok=True)
    (log_dir / "visualizations").mkdir(exist_ok=True)

    return log_dir

# ============================================================================
# ENCODER: OBJECT DETECTION WITH BOUNDING BOXES
# ============================================================================
import re
import ast
import json

def detections_to_json(res: str):
    """
    Parse strings like:
      <|ref|>Cabinet<|/ref|><|det|>[[472, 0, 810, 380]]<|/det|>
    into a dict:
      {
        "object_0": {"id": 0, "object_tag": "Cabinet", "bbox": [472, 0, 810, 380]},
        ...
      }
    """
    pattern = r"<\|ref\|>(.*?)<\|/ref\|><\|det\|>(\[\[.*?\]\])<\|/det\|>"
    matches = re.findall(pattern, res, flags=re.DOTALL)

    data = {}
    for i, (name, bbox_str) in enumerate(matches):
        try:
            # Expecting something like [[x1, y1, x2, y2]]
            bbox = ast.literal_eval(bbox_str)[0]
        except (SyntaxError, ValueError, IndexError):
            # Skip malformed entries
            continue

        data[f"object_{i}"] = {
            "id": i,
            "object_tag": name.lower(),
            "bbox": bbox,
            # dummy 3D info, you can add:
            "bbox_extent": [],
            "bbox_center": [],
            "bbox_volume": -1,
        }
        # print(data)

    return data



class ObjectEncoder:
    """Detects objects in images and outputs bounding boxes."""

    def __init__(self, model_name, log_dir):
        self.log_dir = log_dir
        self.log_file = open(log_dir / "encoder_log.txt", "w")
        self.log("Initializing Object Encoder...")

        # Setup quantization config
        qc = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float
        )

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
        self.model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_safetensors=True,
            device_map="auto",
            quantization_config=qc,
            torch_dtype=torch.float
        )
        self.model = self.model.eval()
        self.log("Encoder model loaded successfully!")

    def log(self, message):
        """Log message to both console and file."""
        print(f"[ENCODER] {message}")
        self.log_file.write(f"{message}\n")
        self.log_file.flush()

    def process_image(self, image_path, output_name):
        """Process single image to detect objects with bounding boxes."""
        self.log(f"\nProcessing: {image_path}")

        prompt = "<image>\nIdentify all objects in the image and output them in bounding boxes."

        output_dir = self.log_dir / "encoder_outputs" / output_name
        output_dir.mkdir(exist_ok=True)

        try:
            log_buffer = StringIO()
            with contextlib.redirect_stdout(log_buffer), contextlib.redirect_stderr(log_buffer):
              with torch.no_grad():
                  res = self.model.infer(
                      self.tokenizer,
                      prompt=prompt,
                      image_file=str(image_path),
                      output_path=str(output_dir),
                      base_size=1024,
                      image_size=1024,
                      crop_mode=False,
                      save_results=True,
                      test_compress=False,
                      eval_mode=False
                  )

            self.log(f"✓ Successfully processed {image_path.name}")
            log_text = log_buffer.getvalue()
            # Save result text
            result_file = output_dir / "detection_result.txt"
            with open(result_file, "w") as f:
                f.write(log_text)
            parsed = detections_to_json(log_text)
            detected_obj = detections_to_json(log_text)

            result_json_file = output_dir / "detection_result.json"
            with open(result_json_file, "w") as f:
                json.dump(detections_to_json(log_text), f, indent=2)

            return res, output_dir, detections_to_json(log_text)

        except Exception as e:
            self.log(f"✗ Error processing {image_path.name}: {str(e)}")
            return None, None, {}

    def process_directory(self, image_dir):
        """Process all images in directory."""
        image_dir = Path(image_dir)
        image_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.png"))

        self.log(f"\nFound {len(image_files)} images to process")

        results = {}
        object_col = {}
        for img_file in image_files:
            output_name = img_file.stem
            res, output_dir, data = self.process_image(img_file, output_name)
            if res is not None:
                object_col.update(data)
                results[output_name] = {
                    "result": res,
                    "output_dir": output_dir
                }

        result_json_file =  image_dir/ "object_result.json"
        with open(result_json_file, "w") as f:
          json.dump(parsed, f, indent=2)
        self.log(f"\n✓ Encoder complete: {len(results)}/{len(image_files)} images processed")
        return results

    def close(self):
        """Close log file."""
        self.log_file.close()

# ============================================================================
# DECODER: RELATIONSHIP GENERATION
# ============================================================================
import contextlib
from io import StringIO
class RelationshipDecoder:
    """Generates relationships between detected objects."""

    def __init__(self, model_name, log_dir):
        self.log_dir = log_dir
        self.log_file = open(log_dir / "decoder_log.txt", "w")
        self.log("Initializing Relationship Decoder...")

        # Setup quantization config
        qc = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float
        )

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
        self.model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_safetensors=True,
            device_map="auto",
            quantization_config=qc,
            torch_dtype=torch.float
        )
        self.model = self.model.eval()
        self.log("Decoder model loaded successfully!")

    def log(self, message):
        """Log message to both console and file."""
        print(f"[DECODER] {message}")
        self.log_file.write(f"{message}\n")
        self.log_file.flush()

    def build_pair_prompt(self, obj_a, obj_b):
        """Build prompt for relationship generation between two objects."""
        return f"""You are a vision-language model reasoning about an indoor scene.

Object A:
- name: {obj_a['name']}
- tag: {obj_a['tag']}
- description: {obj_a['caption'] or "no extra description"}
- center (x, y, z): {obj_a['center']}
- extent (dx, dy, dz): {obj_a['extent']}

Object B:
- name: {obj_b['name']}
- tag: {obj_b['tag']}
- description: {obj_b['caption'] or "no extra description"}
- center (x, y, z): {obj_b['center']}
- extent (dx, dy, dz): {obj_b['extent']}

Question:
What is the relationship between Object A and Object B in the scene?
Please generate a single, natural language caption describing their relationship,
focusing on spatial or functional relationships. Be concise.
"""

    @torch.no_grad()
    def generate_caption(self, prompt):
        """Generate relationship caption using decoder-only mode."""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        out = self.model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            temperature=TEMPERATURE,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        generated = out[0]
        full_text = self.tokenizer.decode(generated, skip_special_tokens=True)

        # Strip original prompt
        if full_text.startswith(prompt):
            return full_text[len(prompt):].strip()
        return full_text.strip()

    def extract_relationship(self, caption):
        """Extract relationship keyword from caption."""
        text = caption.lower()
        for kw in RELATION_KEYWORDS:
            if kw in text:
                return kw
        return "related to"

    def process_objects(self, obj_json_path):
        """Process object JSON to generate relationships."""
        self.log(f"\nLoading objects from: {obj_json_path}")

        # Load objects
        with open(obj_json_path, "r") as f:
            obj_dict = json.load(f)

        # Convert to list
        objects = []
        for key, info in obj_dict.items():
            objects.append({
                "name": key,
                "id": info.get("id"),
                "tag": info.get("object_tag", ""),
                "caption": info.get("object_caption", ""),
                "center": info.get("bbox_center", None),
                "extent": info.get("bbox_extent", None),
            })

        self.log(f"Loaded {len(objects)} objects")

        # Generate relationships for all pairs
        edges = {}
        edge_id = 0

        total_pairs = len(list(itertools.combinations(objects, 2)))
        self.log(f"Generating relationships for {total_pairs} object pairs...")

        for i, (obj_a, obj_b) in enumerate(itertools.combinations(objects, 2)):
            if (i + 1) % 10 == 0:
                self.log(f"Progress: {i+1}/{total_pairs} pairs processed")

            prompt = self.build_pair_prompt(obj_a, obj_b)
            caption = self.generate_caption(prompt)
            relationship = self.extract_relationship(caption)

            edge_key = f"edge_{edge_id}"
            edges[edge_key] = {
                "edge_id": edge_id,
                "edge_description": caption,
                "num_detections": 1,
                "object_1_id": obj_a["id"],
                "object_1_tag": obj_a["tag"],
                "object_2_id": obj_b["id"],
                "object_2_tag": obj_b["tag"],
                "relationship": relationship,
            }

            self.log(f"{edge_key}: {obj_a['tag']} <-> {obj_b['tag']} | {relationship}")
            edge_id += 1

        # Save edges
        output_path = self.log_dir / "decoder_outputs" / "relationships.json"
        with open(output_path, "w") as f:
            json.dump(edges, f, indent=2)

        self.log(f"\n✓ Decoder complete: {len(edges)} relationships saved to {output_path}")
        return edges

    def close(self):
        """Close log file."""
        self.log_file.close()

# ============================================================================
# MAIN PIPELINE
# ============================================================================

def main():
    """Run complete pipeline: Encoder → Decoder."""
    print("=" * 80)
    print("SCENE GRAPH PIPELINE")
    print("=" * 80)

    # Setup logging
    log_dir = setup_log_directory()
    print(f"\nLog directory: {log_dir}")

    if True:
      # Check if input directory exists
      if not Path(IMAGE_INPUT_DIR).exists():
          print(f"\n✗ Error: Input directory not found: {IMAGE_INPUT_DIR}")
          print("Please create the directory and add your images (jpg/png)")
          return

      # === PHASE 1: ENCODER (Object Detection) ===
      print("\n" + "=" * 80)
      print("PHASE 1: OBJECT DETECTION (ENCODER)")
      print("=" * 80)

      encoder = ObjectEncoder(MODEL_NAME, log_dir)
      encoder_results = encoder.process_directory(IMAGE_INPUT_DIR)
      encoder.close()

      if not encoder_results:
          print("\n✗ No images were successfully processed by encoder")
          return

    if True:
          # === PHASE 2: DECODER (Relationship Generation) ===
          print("\n" + "=" * 80)
          print("PHASE 2: RELATIONSHIP GENERATION (DECODER)")
          print("=" * 80)

          # For now, use the provided object JSON
          # In a full pipeline, you'd parse encoder outputs to create this JSON

          obj_json_path = Path(IMAGE_INPUT_DIR / "detection_result.json")

          if not obj_json_path.exists():
              print(f"\n✗ Error: Object JSON not found: {obj_json_path}")
              print("Please ensure obj_json_r_mapping_stride10.json is in the working directory")
              return

          decoder = RelationshipDecoder(MODEL_NAME, log_dir)
          relationships = decoder.process_objects(obj_json_path)
          decoder.close()

    # === COMPLETE ===
    print("\n" + "=" * 80)
    print("PIPELINE COMPLETE!")
    print("=" * 80)
    print(f"\nAll outputs saved to: {log_dir}")
    print(f"  - Encoder outputs: {log_dir / 'encoder_outputs'}")
    print(f"  - Decoder outputs: {log_dir / 'decoder_outputs'}")
    print(f"  - Logs: encoder_log.txt, decoder_log.txt")

if __name__ == "__main__":
    main()

SCENE GRAPH PIPELINE

Log directory: frame00180_test/logs/log_25121001

PHASE 1: OBJECT DETECTION (ENCODER)
[ENCODER] Initializing Object Encoder...


You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.
Some weights of DeepseekOCRForCausalLM were not initialized from the model checkpoint at deepseek-ai/DeepSeek-OCR and are newly initialized: ['model.vision_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[ENCODER] Encoder model loaded successfully!
[ENCODER] 
Found 1 images to process
[ENCODER] 
Processing: frame00180_test/frame000180.jpg
[ENCODER] ✓ Successfully processed frame000180.jpg


NameError: name 'parsed' is not defined

Below is an instance run.