In [1]:
# Load IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2 asks autoreload to reload modules before each cell execution.
%autoreload 2

In [10]:
import cv2
import torch
import torchvision.transforms as transforms
from torchvision import models
from torchvision.models import ResNet18_Weights
import numpy as np
import json
import requests

# Real-time webcam image classification with a pretrained ResNet18 (CPU).
# Each captured frame is preprocessed, classified, and shown in an OpenCV
# window with the predicted label drawn on it. Press 'q' in the window to quit.

# Load a pretrained model (ResNet18 in this case)
model = models.resnet18(weights=ResNet18_Weights.DEFAULT)
model.eval()  # Set the model to evaluation mode

# Define the transformation pipeline for the frames
transform = transforms.Compose([
    transforms.ToPILImage(),                # Convert numpy array (OpenCV frame) to PIL image
    transforms.Resize((224, 224)),          # Resize to model input size
    transforms.ToTensor(),                  # Convert PIL image to Tensor
    transforms.Normalize(                   # Normalize as per model requirements
        mean=[0.485, 0.456, 0.406],         # Mean for ImageNet
        std=[0.229, 0.224, 0.225]           # Std deviation for ImageNet
    ),
])

# Load ImageNet class labels
url = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
# The timeout keeps the cell from hanging forever when the network is down;
# raise_for_status() surfaces HTTP errors directly instead of letting an
# error page fail later inside json.loads() with a confusing message.
response = requests.get(url, timeout=10)
response.raise_for_status()
imagenet_classes = json.loads(response.text)

# Open webcam feed
CAMERA_INDEX = 1  # my webcam is on index 1
cap = cv2.VideoCapture(CAMERA_INDEX)

try:
    # Report a wrong/unavailable camera index explicitly rather than as a
    # generic frame-read failure.
    if not cap.isOpened():
        print(f"Error: Could not open webcam at index {CAMERA_INDEX}.")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        # Preprocess the frame for classification
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
        input_tensor = transform(rgb_frame).unsqueeze(0)    # Add batch dimension

        # Perform inference (no gradients are needed for prediction)
        with torch.no_grad():
            outputs = model(input_tensor)
            _, predicted_idx = torch.max(outputs, 1)        # Index of the highest-scoring class
            predicted_class = imagenet_classes[predicted_idx.item()]  # Map index to class label

        # Flip the frame horizontally for a more intuitive (mirror-like) view.
        # This happens after classification, so the model sees the unflipped frame.
        frame = cv2.flip(frame, 1)

        # Display the prediction on the frame
        cv2.putText(frame, f"Predicted: {predicted_class}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show the frame
        cv2.imshow("Image Classification", frame)

        # Exit the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the webcam and close all windows, even when the loop is
    # aborted by an exception or a kernel interrupt; otherwise the camera
    # stays locked until the kernel is restarted.
    cap.release()
    cv2.destroyAllWindows()


In [11]:
import cv2
import torch
import torchvision.transforms as transforms
from torchvision import models
from torchvision.models import ResNet18_Weights
import numpy as np
import json
import requests

# Real-time webcam image classification with a pretrained ResNet18, running
# on the GPU when CUDA is available and on the CPU otherwise. Each captured
# frame is preprocessed, classified, and shown in an OpenCV window with the
# predicted label drawn on it. Press 'q' in the window to quit.

# Check if CUDA (GPU) is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pretrained ResNet18 model and move it to the appropriate device (GPU/CPU)
model = models.resnet18(weights=ResNet18_Weights.DEFAULT)
model.to(device)  # Move model to GPU (if available)
model.eval()  # Set the model to evaluation mode

# Define the transformation pipeline for the frames
transform = transforms.Compose([
    transforms.ToPILImage(),                # Convert numpy array (OpenCV frame) to PIL image
    transforms.Resize((224, 224)),          # Resize to model input size
    transforms.ToTensor(),                  # Convert PIL image to Tensor
    transforms.Normalize(                   # Normalize as per model requirements
        mean=[0.485, 0.456, 0.406],         # Mean for ImageNet
        std=[0.229, 0.224, 0.225]           # Std deviation for ImageNet
    ),
])

# Load ImageNet class labels
url = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
# The timeout keeps the cell from hanging forever when the network is down;
# raise_for_status() surfaces HTTP errors directly instead of letting an
# error page fail later inside json.loads() with a confusing message.
response = requests.get(url, timeout=10)
response.raise_for_status()
imagenet_classes = json.loads(response.text)

# Open webcam feed
CAMERA_INDEX = 1  # my webcam is on index 1, you may need to change this
cap = cv2.VideoCapture(CAMERA_INDEX)

try:
    # Report a wrong/unavailable camera index explicitly rather than as a
    # generic frame-read failure.
    if not cap.isOpened():
        print(f"Error: Could not open webcam at index {CAMERA_INDEX}.")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        # Preprocess the frame for classification
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
        input_tensor = transform(rgb_frame).unsqueeze(0)    # Add batch dimension

        # Move the input tensor to the same device as the model (GPU or CPU)
        input_tensor = input_tensor.to(device)

        # Perform inference (no gradients are needed for prediction)
        with torch.no_grad():
            outputs = model(input_tensor)
            _, predicted_idx = torch.max(outputs, 1)        # Index of the highest-scoring class
            predicted_class = imagenet_classes[predicted_idx.item()]  # Map index to class label

        # Flip the frame horizontally for a more intuitive (mirror-like) view.
        # This happens after classification, so the model sees the unflipped frame.
        frame = cv2.flip(frame, 1)

        # Display the prediction on the frame
        cv2.putText(frame, f"Predicted: {predicted_class}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show the frame
        cv2.imshow("Image Classification", frame)

        # Exit the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the webcam and close all windows, even when the loop is
    # aborted by an exception or a kernel interrupt; otherwise the camera
    # stays locked until the kernel is restarted.
    cap.release()
    cv2.destroyAllWindows()
