In [None]:
import cv2 as cv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [None]:
# CNN example
class CNN(nn.Module):
    """Small convolutional classifier for single-channel 48x48 images.

    Architecture: two 3x3 convolutions (48 filters each), each followed by
    ReLU and a 3x3 max-pool with stride 3, then three fully connected layers
    (768 -> 144 -> 48 -> ``num_classes``).

    With a 48x48 input the spatial size evolves as
    48 -> 46 (conv1) -> 15 (pool) -> 13 (conv2) -> 4 (pool), which is where
    the ``48 * 4 * 4`` input width of ``fc1`` comes from.

    Args:
        learning_rate: Stored on the instance as ``self.learning_rate``; not
            used by the network itself.
        batch_size: Stored on the instance as ``self.batch_size``; not used
            by the network itself.
        num_classes: Number of output classes (width of the final layer).
    """

    def __init__(self, learning_rate=0.001, batch_size=32, num_classes=24):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 48, kernel_size=3)   # 1st convolution
        self.conv2 = nn.Conv2d(48, 48, kernel_size=3)  # 2nd convolution
        self.pool = nn.MaxPool2d(3, 3)  # max over each 3x3 region, stride 3
        self.fc1 = nn.Linear(48 * 4 * 4, 144)
        self.fc2 = nn.Linear(144, 48)
        self.out = nn.Linear(48, num_classes)
        self.learning_rate = learning_rate
        self.batch_size = batch_size

    def forward_logits(self, x):
        """Return the raw (pre-softmax) class scores for ``x``.

        Use this with losses such as ``nn.CrossEntropyLoss`` that apply the
        softmax themselves.

        Raises:
            RuntimeError: If the feature map after the second pooling stage
                is not 48x4x4, i.e. the input was not a 48x48 image.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Without this check, reshape(-1, 768) would happily merge several
        # wrongly sized samples into one row and silently change the batch
        # size instead of failing.
        if tuple(x.shape[-3:]) != (48, 4, 4):
            raise RuntimeError(
                "CNN expects 1x48x48 inputs; feature map after pooling has "
                f"shape {tuple(x.shape[-3:])} instead of (48, 4, 4)"
            )
        x = x.reshape(-1, 48 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)

    def forward(self, x):
        """Return per-class probabilities (softmax over dim 1) for ``x``."""
        return F.softmax(self.forward_logits(x), dim=1)

In [None]:
def train(learning_rate=0.001, batch_size=32, num_epochs=5):
    """Train the CNN on MNIST (resized to 48x48) and print test accuracy.

    Downloads MNIST into ``./data`` if needed, trains with Adam and
    cross-entropy loss, prints the loss every 100 steps and the average loss
    per epoch, then prints the accuracy on the test split.

    Args:
        learning_rate: Step size passed to the Adam optimizer.
        batch_size: Number of samples per batch for both data loaders.
        num_epochs: Number of full passes over the training set.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Data transformation: resize to 48x48 to match the CNN's expected input.
    transform = transforms.Compose([
        transforms.Resize((48, 48)),
        transforms.ToTensor(),
    ])

    # Load MNIST dataset (grayscale images).
    train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
    test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Instantiate the model and swap the final layer for MNIST's 10 classes.
    model = CNN(learning_rate=learning_rate, batch_size=batch_size)
    model.out = nn.Linear(48, 10)
    # Move the parameters to the selected device *before* building the
    # optimizer so the optimizer tracks the on-device tensors.
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # CrossEntropyLoss applies (log-)softmax internally, so training needs the
    # raw logits rather than the softmax output of model.forward().
    def forward_without_softmax(x):
        x = model.pool(F.relu(model.conv1(x)))
        x = model.pool(F.relu(model.conv2(x)))
        x = x.view(-1, 48 * 4 * 4)
        x = F.relu(model.fc1(x))
        x = F.relu(model.fc2(x))
        return model.out(x)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (images, labels) in enumerate(train_loader):
            # Batches come from the loader on the CPU; put them next to the model.
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()                      # clear gradients from the previous step
            outputs = forward_without_softmax(images)  # raw logits
            loss = criterion(outputs, labels)          # compute the loss
            loss.backward()                            # backpropagate to get gradients
            optimizer.step()                           # update the weights
            running_loss += loss.item()
            if (batch_idx + 1) % 100 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")
        print(f"Epoch [{epoch+1}] Average Loss: {running_loss/len(train_loader):.4f}")

    # Evaluation on the test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            predicted = forward_without_softmax(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

if __name__ == '__main__':
    train()


In [None]:
# OpenCV functions
# Edge detection
def _read_image(path):
    """Read the image at ``path`` with OpenCV, raising if it cannot be read."""
    image = cv.imread(path)
    if image is None:
        # cv.imread reports failure by returning None instead of raising,
        # which otherwise surfaces later as a cryptic OpenCV assertion error.
        raise FileNotFoundError(f"Could not open or read the image from {path}")
    return image


def _show_until_keypress(window_name, image):
    """Display ``image`` in a window and close that window after a key press."""
    cv.imshow(window_name, image)
    try:
        cv.waitKey(0)
    finally:
        # Close the window even if the wait is interrupted.
        cv.destroyWindow(window_name)


def show_canny(img):
    """Show the Canny edge map (thresholds 100 and 200) of the image file ``img``.

    Args:
        img: Path of the image file; also used as the window title.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    canny = cv.Canny(_read_image(img), 100, 200)
    _show_until_keypress(img, canny)

# Show image
def show_img(img):
    """Show the image file ``img`` unchanged until a key is pressed.

    Args:
        img: Path of the image file; also used as the window title.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    _show_until_keypress(img, _read_image(img))

# Grayscale
def show_grayscale(img):
    """Show a grayscale version of the image file ``img`` until a key is pressed.

    Args:
        img: Path of the image file; also used as the window title.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    gray = cv.cvtColor(_read_image(img), cv.COLOR_BGR2GRAY)
    _show_until_keypress(img, gray)

def show_video(source="video.mp4", fallback_fps=30):
    """Play a video in a window at (approximately) its native frame rate.

    Playback stops at the end of the stream, when a frame cannot be read, or
    when the user presses 'q'.

    Args:
        source: Video file path, or a camera index such as 0 for the default
            webcam.
        fallback_fps: Frame rate to assume when the source does not report a
            usable FPS value.
    """
    import cv2

    cap = cv2.VideoCapture(source)
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video source {source!r}")
            return

        # Get the original frame rate (FPS). Sources that do not know their
        # frame rate can report 0 (or NaN), which would otherwise cause a
        # division by zero below.
        reported_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = int(reported_fps) if reported_fps > 0 else 0
        print(f"Original FPS: {fps}")
        if fps <= 0:
            fps = fallback_fps

        # Delay between frames in milliseconds. Keep it at least 1 because
        # waitKey(0) means "wait forever".
        frame_delay = max(1, int(1000 / fps))

        while cap.isOpened():
            ret, frame = cap.read()

            if not ret:
                print("End of video stream or cannot fetch frame.")
                break

            # Show the frame
            cv2.imshow("Video", frame)

            # Wait for 'q' key or for the frame delay
            if cv2.waitKey(frame_delay) & 0xFF == ord('q'):
                break
    finally:
        # Release resources even if playback is interrupted or raises.
        cap.release()
        cv2.destroyAllWindows()


In [None]:
# OpenCV to PIL Image
import cv2
from PIL import Image
import numpy as np

def opencv_to_pil(opencv_image):
    """Build a PIL image from an OpenCV image.

    The input is passed through a BGR -> RGB colour conversion before being
    handed to ``Image.fromarray``.

    Args:
        opencv_image: Image array as produced by OpenCV (assumed to be in
            BGR channel order).

    Returns:
        The resulting ``PIL.Image.Image`` object.
    """
    return Image.fromarray(cv2.cvtColor(opencv_image, cv2.COLOR_BGR2RGB))

# Example usage: read an image from disk, convert it, then display and save it.
# Point 'image_path' at your own image file.
image_path = 'path/to/your/image.jpg'
opencv_image = cv2.imread(image_path)

if opencv_image is None:
    # The file could not be read, so there is nothing to convert.
    print(f"Error: Could not open or read the image from {image_path}")
else:
    pil_image = opencv_to_pil(opencv_image)

    # The PIL image can now be displayed ...
    pil_image.show()

    # ... or written back to disk.
    pil_image.save("path/to/save/pil_image.png")

In [None]:
import cv2
import numpy as np

# Read the image file with OpenCV and stop early if that failed.
image = cv2.imread('image.jpg')
if image is None:
    raise Exception("Could not open or find the image")

# Build a NumPy array from the loaded image.
numpy_image = np.array(image)

# Report the array's shape first, then its data type.
for array_property in (numpy_image.shape, numpy_image.dtype):
    print(array_property)

In [None]:
# Case study

import cv2
import torch
import torchvision.transforms as transforms
from torchvision import models
import numpy as np

# Load a pre-trained ResNet18 model and set it to evaluation mode.
# NOTE(review): recent torchvision releases deprecate `pretrained=True` in
# favour of the `weights=` argument; kept as-is because the installed version
# is not visible from here — confirm and migrate if appropriate.
model = models.resnet18(pretrained=True)
model.eval()

# Define the image preprocessing steps expected by ResNet18
preprocess = transforms.Compose([
    transforms.ToPILImage(),               # Convert OpenCV image (NumPy array) to PIL image
    transforms.Resize(256),                # Resize the image so the shorter side is 256 pixels
    transforms.CenterCrop(224),            # Crop the center 224x224 portion
    transforms.ToTensor(),                 # Convert PIL image to Tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # Normalize using ImageNet's mean and std
                         std=[0.229, 0.224, 0.225])
])

# Load the class names from the file (one label per line)
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f]

# Start video capture using OpenCV (0 is usually the default webcam)
cap = cv2.VideoCapture(0)

# The try/finally guarantees the webcam and the window are released even when
# inference raises or the cell is interrupted; otherwise the camera can stay
# locked until the kernel is restarted.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame from BGR (OpenCV default) to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Preprocess the frame and add a batch dimension
        input_tensor = preprocess(rgb_frame)
        input_batch = input_tensor.unsqueeze(0)

        # Run the model in inference mode
        with torch.no_grad():
            output = model(input_batch)

        # Find the predicted class index (as a plain int) and its label
        predicted_idx = output.argmax(dim=1).item()
        predicted_label = classes[predicted_idx]

        # Overlay the predicted label on the frame
        cv2.putText(frame, predicted_label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 0), 2, cv2.LINE_AA)

        # Display the annotated frame
        cv2.imshow('Real-Time Classification', frame)

        # Press 'q' to exit the loop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
