In [1]:
import pretty_midi
import math
import cv2
import os
import matplotlib.pyplot as plt
import random
import numpy as np
%matplotlib inline

import torch
import torchvision.transforms as transforms

from model import Classifier

In [2]:
# # Create a PrettyMIDI object
# cello_c_chord = pretty_midi.PrettyMIDI()

# # Create an Instrument instance (the variable is named `cello`, but it uses the piano program)
# piano_program = pretty_midi.instrument_name_to_program('Piano')
# cello = pretty_midi.Instrument(program=piano_program)

# # Iterate over note names, which will be converted to note number later
# for note_name in ['C5', 'E5', 'G5']:
#     # Retrieve the MIDI note number for this note name
#     note_number = pretty_midi.note_name_to_number(note_name)
#     # Create a Note instance, starting at 0s and ending at .5s
#     note = pretty_midi.Note(
#         velocity=100, pitch=note_number, start=0, end=.5)
#     # Add it to our cello instrument
#     cello.notes.append(note)
# # Add the cello instrument to the PrettyMIDI object
# cello_c_chord.instruments.append(cello)
# # Write out the MIDI data
# cello_c_chord.write('cello-C-chord.mid')

In [3]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
# load model
model = Classifier(encoder='resnet18')
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a CUDA session still loads on a CPU-only machine.
state_dict = torch.load('resnet18_30.pt', map_location=device)
model.load_state_dict(state_dict)
model.to(device)
# Inference mode (affects layers such as BatchNorm); as the last expression
# of the cell this also displays the model structure.
model.eval()

Classifier(
  (encoder): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_

In [5]:
# Preprocessing applied to every frame before it is fed to the model:
# array -> PIL image -> 224x224 resize -> tensor -> per-channel normalisation.
# NOTE(review): frames read by cv2 are BGR while ToPILImage does no channel
# swap — presumably the model was trained on the same ordering; confirm.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

In [13]:
# Open the video file
# NOTE(review): absolute, machine-specific path — consider making it relative
# to a configurable data directory so the notebook runs elsewhere.
VIDEO_PATH = '/home/dimang/Documents/PhD/Courses/02501 - Advanced deep learning in computer vision/AISynthesizer/src/simple_cnn/video/track_2.mid.mp4'
video_capture = cv2.VideoCapture(VIDEO_PATH)

# Check if the video file opened successfully. Raise instead of calling
# exit(): inside a notebook exit() shuts the kernel down rather than just
# stopping this cell.
if not video_capture.isOpened():
    raise IOError("Error: Couldn't open the video file.")

# Iterate through frames, collecting one rounded prediction array per frame
frame_count = 0
predictions = []
try:
    while True:
        # Read the next frame; ret is False once the stream is exhausted
        ret, frame = video_capture.read()
        if not ret:
            break

        cv2.imshow('Frame', frame)

        with torch.no_grad():
            # Add the batch dimension the model expects: (C, H, W) -> (1, C, H, W)
            output = model(transform(frame).to(device).unsqueeze(dim=0))
            sigmoid_output = torch.sigmoid(output).cpu()
            # Convert explicitly to NumPy before rounding instead of relying
            # on np.round's implicit handling of torch tensors.
            predicted = np.round(sigmoid_output.numpy())
            predictions.append(predicted)

        # Count the frame as soon as it has been processed so that
        # frame_count always equals len(predictions), even when quitting early.
        frame_count += 1

        # Wait for the 'q' key to exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
finally:
    # Release the video capture object and close any open windows, even if
    # inference raised part-way through the video.
    video_capture.release()
    cv2.destroyAllWindows()

print("Total frames:", frame_count)


Total frames: 125


In [12]:
# For each frame, print the class indices whose rounded prediction equals 1
# (an empty array means nothing was predicted as active in that frame).
for frame_pred in predictions:
    active_mask = np.array(frame_pred) == 1
    # Axis 0 is the batch dimension; axis 1 holds the class index.
    class_indices = np.nonzero(active_mask)[1]
    print(class_indices)

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[2]
[]
[2]
[2]
[2]
[2]
[2]
[2]
[]
[2]
[2]
[2]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[56]
[56]
[56]
[56]
[56]
[56]
[56]
[56]
[56]
[56]
[56]
[56]
[56]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[80]
[80]
[80]
[80]
[80]
[80]
[80]
[80]
[80]
[80]
[80]
[80]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [None]:


# Assumes the Classifier class has been imported from model.py (see the imports cell at the top).

def process_video_to_midi(video_path, model, device, output_path='output.mid',
                          threshold=0.5, lowest_pitch=21):
    """Run the classifier on every frame of a video and write the result as MIDI.

    Each frame is preprocessed, passed through ``model``, and squashed with a
    sigmoid. Every class whose score is at least ``threshold`` becomes one
    note lasting exactly one frame (``1 / fps`` seconds), so a key detected
    over several consecutive frames is written as several back-to-back notes.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        model: Callable taking a ``(1, C, H, W)`` tensor and returning
            per-class logits indexed as ``(batch, class)``.
        device: Torch device the frame tensor is moved to before inference.
        output_path: Where the MIDI file is written. Defaults to
            ``'output.mid'``, the previously hard-coded location.
        threshold: Minimum sigmoid score for a class to count as active.
        lowest_pitch: MIDI pitch assigned to class index 0 (21 is A0).
            Adjust based on your specific key mapping.

    Returns:
        None. If the video cannot be opened, an error message is printed and
        no MIDI file is written.

    Raises:
        ValueError: If the video does not report a positive frame rate, since
            note timing cannot be derived without it.
    """
    # Define transformations for the video frames
    # NOTE(review): cv2 delivers BGR frames and no channel swap happens here —
    # presumably this matches how the model was trained; confirm.
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),  # Ensure this matches your model input size
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    # Load the video; the try/finally guarantees the capture is released on
    # every exit path, including exceptions raised during inference.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return

        # Some containers/backends report 0 (or NaN) for the FPS property;
        # "not x > 0" rejects zero, negative values and NaN in one check.
        frame_rate = cap.get(cv2.CAP_PROP_FPS)
        if not frame_rate > 0:
            raise ValueError(
                f"Could not determine a valid frame rate for {video_path!r} "
                f"(got {frame_rate})"
            )
        frame_duration = 1.0 / frame_rate

        # Prepare MIDI file
        midi_file = pretty_midi.PrettyMIDI()
        piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
        piano = pretty_midi.Instrument(program=piano_program)

        # Process each frame
        frame_index = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Transform the frame for model prediction
            frame_tensor = transform(frame).unsqueeze(0).to(device)
            with torch.no_grad():
                output = model(frame_tensor)
                predicted = torch.sigmoid(output).cpu().numpy()

            # Derive timestamps from the frame index rather than accumulating
            # 1/fps, which would let floating-point error drift over long videos.
            start_time = frame_index * frame_duration
            end_time = (frame_index + 1) * frame_duration

            # Axis 1 of the prediction holds the class (key) index
            key_indices = np.where(predicted >= threshold)[1]
            for idx in key_indices:
                # int() converts the NumPy integer to a plain Python int
                note_number = int(idx) + lowest_pitch
                note = pretty_midi.Note(
                    velocity=100, pitch=note_number, start=start_time, end=end_time)
                piano.notes.append(note)

            frame_index += 1
    finally:
        # Release resources
        cap.release()

    # Add the piano instrument to the MIDI object and save the MIDI file
    midi_file.instruments.append(piano)
    midi_file.write(output_path)

# Example of usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Classifier(num_classes=91, encoder='resnet18', pretrained=True)
# Load the trained checkpoint; without it the freshly constructed model is
# used for inference without the trained weights.
# NOTE(review): assumes 'resnet18_30.pt' was trained with num_classes=91 —
# load_state_dict raises on a shape mismatch; confirm against training.
model.load_state_dict(torch.load('resnet18_30.pt', map_location=device))
model.to(device)
model.eval()

# Placeholder: replace with the path of the video to transcribe
video_path = 'path_to_your_video.mp4'
process_video_to_midi(video_path, model, device)
