In [1]:
# Just so that you don't have to restart the notebook with every change.
%load_ext autoreload
%autoreload 2 

In [2]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
import torch

from PIL import Image
from collections import deque, Counter
from common import utils
from ultralytics import YOLO
from transformers import AutoImageProcessor

# Useful constants
# Every directory below is resolved against the kernel's current working
# directory (os.getcwd()), so the notebook has to be started from the folder
# that contains these sub-directories.
CURRENT_DIR = os.getcwd()
IMAGES_DIR = os.path.join(CURRENT_DIR, "images")
VIDEOS_DIR = os.path.join(CURRENT_DIR, "videos")
CHORD_CLASSIFIER_MODEL_DIR = os.path.join(CURRENT_DIR, "chord-classifier-model")
FRETBOARD_RECOGNIZER_MODEL_DIR = os.path.join(CURRENT_DIR, "fretboard-recognizer-model")

# Look up the weight file (.safetensors or .pt) and the JSON config of each model.
# NOTE(review): judging by the recorded cell output, find_files yields a single
# path per lookup and something falsy when nothing matches -- confirm against
# common/utils.
chord_clf_model_path = utils.find_files(CHORD_CLASSIFIER_MODEL_DIR, [".safetensors", ".pt"])
chord_clf_config_path = utils.find_files(CHORD_CLASSIFIER_MODEL_DIR, [".json"])
fretboard_rec_model_path = utils.find_files(FRETBOARD_RECOGNIZER_MODEL_DIR, [".safetensors", ".pt"])
fretboard_rec_config_path = utils.find_files(FRETBOARD_RECOGNIZER_MODEL_DIR, [".json"])

# Report which of the four files were located. The entries of `names` are
# listed in the same order as the positional paths (models first, then
# configs); keep both lists in sync when editing.
# NOTE(review): the recorded output only shows "found at ..." / "not found"
# messages; whether a missing file can also raise is not visible here.
utils.ensure_files_exist(
    chord_clf_model_path,
    fretboard_rec_model_path,
    chord_clf_config_path,
    fretboard_rec_config_path,
    names=[
        "Chord Classifier model",
        "Fretboard Recognizer model",
        "Chord Classifier config",
        "Fretboard Recognizer config",
    ],
)

  from .autonotebook import tqdm as notebook_tqdm


Chord Classifier model found at /home/dhimitriosduka/Documents/UdS/SoSe 2024/High-Level Computer Vision/Assignments/hlcv/Project/src/video-to-chords-pipeline/chord-classifier-model/model.safetensors
Fretboard Recognizer model found at /home/dhimitriosduka/Documents/UdS/SoSe 2024/High-Level Computer Vision/Assignments/hlcv/Project/src/video-to-chords-pipeline/fretboard-recognizer-model/yolov9c_trained_with_head.pt
Chord Classifier config found at /home/dhimitriosduka/Documents/UdS/SoSe 2024/High-Level Computer Vision/Assignments/hlcv/Project/src/video-to-chords-pipeline/chord-classifier-model/config.json
Fretboard Recognizer config not found


In [7]:
# Load Chord Classifier model
# NOTE(review): assumes utils.load_model returns a torch.nn.Module-like object;
# .eval() is called so the classifier runs in inference mode -- confirm.
chord_clf_model = utils.load_model(chord_clf_model_path, config_path=chord_clf_config_path)
chord_clf_model.eval()

# Load Fretboard Recognizer model
# custom_class=YOLO presumably makes load_model build the detector through
# ultralytics.YOLO -- confirm against common/utils.
# NOTE(review): the setup cell reported "Fretboard Recognizer config not found",
# so config_path may be empty here; verify that load_model tolerates that.
fretboard_rec_model = utils.load_model(fretboard_rec_model_path, config_path=fretboard_rec_config_path, custom_class=YOLO)

print("Models loaded successfully.")

Models loaded successfully.


In [19]:
def process_video(
        video_path,
        chord_clf_model=None,
        feature_extractor=None,
        fretboard_rec_model=None
):
    """Classify the chord shown in every frame of a video.

    Each frame is cropped to the fretboard with ``utils.extract_box_object``,
    preprocessed by ``feature_extractor`` and classified by ``chord_clf_model``.
    Besides the per-frame label, a majority vote over consecutive,
    non-overlapping windows of ``fps`` frames (about one second) is printed.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        chord_clf_model: Callable model; its output must expose ``.logits`` and
            its ``.config.id2label`` must map class indices to chord names.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=..., return_tensors="pt")``; the result
            is unpacked as keyword arguments into ``chord_clf_model``.
        fretboard_rec_model: Detector passed through to
            ``utils.extract_box_object``.

    Returns:
        list: The predicted chord label of every frame that was read, in frame
        order. Empty when the video cannot be opened or contains no frames.
    """
    cap = cv2.VideoCapture(video_path)

    # try/finally guarantees the capture handle is released even when the loop
    # is interrupted (e.g. KeyboardInterrupt on a long run) or inference raises.
    try:
        if not cap.isOpened():
            print(f"Could not open video: {video_path}")
            return []

        # Get video properties
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        print(f"Video FPS: {fps}")

        # Some containers report an FPS that truncates to 0 (or is negative).
        # A zero-length window would stay empty forever and make the majority
        # vote index into an empty list, so fall back to a one-frame window.
        window_size = max(fps, 1)
        recent_classifications = deque(maxlen=window_size)

        all_chords = []

        current_frame = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            current_frame += 1

            # OpenCV decodes frames as BGR; convert to RGB before further use.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Crop the frame to the detected fretboard region.
            # NOTE(review): what extract_box_object returns when no fretboard
            # is detected is not visible here -- confirm it never yields None.
            fretboard_crop = utils.extract_box_object(
                rgb_frame, fretboard_rec_model, class_name="fretboard", conf=0.25, expand_percent=15
            )

            # Preprocess the crop and run the classifier without tracking gradients.
            inputs = feature_extractor(images=fretboard_crop, return_tensors="pt")
            with torch.no_grad():
                outputs = chord_clf_model(**inputs)
                probabilities = F.softmax(outputs.logits, dim=-1)

            # Get the predicted class
            predicted_class_idx = probabilities.argmax(-1).item()
            predicted_class = chord_clf_model.config.id2label[predicted_class_idx]
            print(f"Frame {current_frame}: Predicted class: {predicted_class}")

            all_chords.append(predicted_class)
            recent_classifications.append(predicted_class)

            # Once a full window is collected, report its majority label and
            # start a fresh (non-overlapping) window.
            if len(recent_classifications) == window_size:
                print(recent_classifications)
                most_common_class = Counter(recent_classifications).most_common(1)[0][0]
                print(f"Frame {current_frame}: Most common classification in last {window_size} frames: {most_common_class}")
                recent_classifications.clear()

            # Periodic progress report; the total is only shown when the
            # container reports a usable frame count.
            if current_frame % 100 == 0:
                if total_frames > 0:
                    print(f"Processed {current_frame}/{total_frames} frames")
                else:
                    print(f"Processed {current_frame} frames")
    finally:
        cap.release()

    print("Video processing complete.")
    print(f"All chords detected: {all_chords}")
    return all_chords

In [20]:
# Relative path: the clip is looked up in the kernel's working directory.
# NOTE(review): VIDEOS_DIR is defined in the setup cell but not used here --
# confirm where R.mp4 is expected to live.
video_path = "R.mp4"

# Image preprocessor fetched by its Hugging Face id.
# NOTE(review): assumes the chord classifier expects DINOv2-large
# preprocessing -- confirm against the classifier's training setup.
feature_extractor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")

# Run the frame-by-frame pipeline with the models loaded in the earlier cell.
chords = process_video(
    video_path,
    chord_clf_model=chord_clf_model,
    feature_extractor=feature_extractor,
    fretboard_rec_model=fretboard_rec_model 
)

# Bare last expression so the notebook displays the per-frame chord list.
chords

Video FPS: 30

0: 640x640 1 person, 5 fretboards, 673.3ms
Speed: 0.0ms preprocess, 673.3ms inference, 5.6ms postprocess per image at shape (1, 3, 640, 640)
Frame 1: Predicted class: G

0: 640x640 1 person, 1 tie, 4 fretboards, 679.1ms
Speed: 0.0ms preprocess, 679.1ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 640)
Frame 2: Predicted class: D

0: 640x640 1 person, 1 tie, 4 fretboards, 651.5ms
Speed: 0.0ms preprocess, 651.5ms inference, 5.3ms postprocess per image at shape (1, 3, 640, 640)
Frame 3: Predicted class: C

0: 640x640 1 person, 1 tie, 4 fretboards, 675.4ms
Speed: 0.0ms preprocess, 675.4ms inference, 3.2ms postprocess per image at shape (1, 3, 640, 640)
Frame 4: Predicted class: C

0: 640x640 1 person, 4 fretboards, 670.6ms
Speed: 0.0ms preprocess, 670.6ms inference, 3.3ms postprocess per image at shape (1, 3, 640, 640)
Frame 5: Predicted class: C

0: 640x640 1 person, 1 tie, 6 fretboards, 668.0ms
Speed: 0.0ms preprocess, 668.0ms inference, 5.3ms postprocess pe

KeyboardInterrupt: 