In [1]:
# Ensure you have a CUDA-compatible version of PyTorch installed.
!pip install -q ultralytics timm opencv-python Pillow

# Load CNN model

In [2]:
from ultralytics import YOLO

cnn = YOLO("models/cnn.pt")

# Load ViT model and move to GPU

In [3]:
import timm
import torch

# MODIFIED: Define the device to use (CUDA or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# MODIFIED: Create the model and move it to the defined device
vit = timm.create_model('eva02_base_patch14_224.mim_in22k', pretrained=False, num_classes=8).to(device)

# MODIFIED: Load the model weights, mapping them to the same device
vit.load_state_dict(torch.load("models/vit.pth", map_location=device))

# MODIFIED: Set to evaluation mode (no .cpu() needed)
vit.eval()

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Eva(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (rope): RotaryEmbeddingCat()
  (blocks): ModuleList(
    (0-11): 12 x EvaBlock(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): EvaAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): SwiGLU(
        (fc1_g): Linear(in_features=768, out_features=2048, bias=True)
        (fc1_x): Linear(in_features=76

In [4]:
from torchvision import transforms

vit_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Load labels

In [5]:
with open('models/labels.txt') as f:
    labels = {i: line.strip().split(' ', 1)[1] for i, line in enumerate(f)}

print(labels)

{0: 'Kek Lapis', 1: 'Kuih Kaswi Pandan', 2: 'Kuih Ketayap', 3: 'Kuih Lapis', 4: 'Kuih Seri Muka', 5: 'Kuih Talam', 6: 'Kuih Ubi Kayu', 7: 'Onde-Onde'}


# NEW: Live Webcam Inference on GPU

In [6]:
import cv2
from PIL import Image
import torch.nn.functional as F
from collections import Counter
import numpy as np

num_classes = 8

# Start webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

while True:
    # Read a frame from the webcam
    ret, frame = cap.read()
    if not ret:
        print("Error: Failed to capture frame.")
        break

    # --- 1. CNN Prediction (on GPU) ---
    # YOLO automatically uses the GPU if available
    cnn_results = cnn(frame, verbose=False)
    cnn_pred_classes = cnn_results[0].boxes.cls.tolist()

    if len(cnn_pred_classes) > 0:
        cnn_counts = Counter(cnn_pred_classes)
        cnn_probs = [cnn_counts.get(i, 0) for i in range(num_classes)]
        cnn_probs = [p / sum(cnn_probs) for p in cnn_probs]
        cnn_predicted_class_idx = int(max(cnn_counts, key=cnn_counts.get))
    else:
        cnn_probs = [0.0] * num_classes
        cnn_predicted_class_idx = -1

    # --- 2. ViT Prediction (on GPU) ---
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)
    
    # MODIFIED: Move the input tensor to the same device as the model
    input_tensor = vit_transform(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        vit_logits = vit(input_tensor)
    
    # Probabilities can be moved to CPU for subsequent logic
    vit_probs_tensor = F.softmax(vit_logits, dim=1).squeeze(0).cpu()
    vit_probs = vit_probs_tensor.tolist()
    vit_predicted_class_idx = int(torch.argmax(vit_probs_tensor).item())

    # --- 3. Combine Predictions (Ensemble) ---
    final_probs = [(c + v) / 2 for c, v in zip(cnn_probs, vit_probs)]

    if cnn_predicted_class_idx == vit_predicted_class_idx and cnn_predicted_class_idx != -1:
        final_idx = cnn_predicted_class_idx
    elif cnn_predicted_class_idx == -1 and vit_predicted_class_idx != -1:
        final_idx = vit_predicted_class_idx
    elif vit_predicted_class_idx == -1 and cnn_predicted_class_idx != -1:
        final_idx = cnn_predicted_class_idx
    elif cnn_predicted_class_idx != -1 and vit_predicted_class_idx != -1:
        final_idx = int(torch.tensor(final_probs).argmax().item())
    else:
        final_idx = -1

    final_label = labels.get(final_idx, 'Unknown')

    # --- 4. Display the result on the frame ---
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(frame, f'Prediction: {final_label}', (10, 30), font, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Live Inference (CUDA)', frame)

    # Exit loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# --- 5. Cleanup ---
cap.release()
cv2.destroyAllWindows()


error: OpenCV(4.11.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window.cpp:1301: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvShowImage'
