In [20]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import math
import numpy as np
import re
from shapely.geometry import Polygon, LineString, Point
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import CocoDetection
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision.transforms as T
from PIL import Image
from torch.optim import SGD, Adam, Adadelta
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision import transforms

### load json annotation dictionary

In [2]:
# Load JSON data into a dictionary
with open('./data/ds2_dense/deepscores_train.json') as file:
    data1 = json.load(file)
with open('./data/ds2_dense/deepscores_test.json') as file:
    data2 = json.load(file)

In [58]:
# convert to pandas
# train_labels = pd.DataFrame( data1['categories']).T
train_images = pd.DataFrame( data1['images'])
train_obboxs = pd.DataFrame( data1['annotations']).T
# test_labels = pd.DataFrame( data2['categories']).T
test_images = pd.DataFrame( data2['images'])
test_obboxs = pd.DataFrame( data2['annotations']).T

### prepare the labels - I adjusted the json slightly so use `new_labels.csv`

In [89]:
# read in the new mapping
train_labels = pd.read_csv('new_labels.csv')
# make a df of the unique labels with their names
unique_labels = train_labels[['label', 'name']]
unique_labels = unique_labels.drop_duplicates(subset=['label'])
unique_labels = unique_labels.sort_values(by=['label']).reset_index(drop=True)
unique_labels.head(4)

Unnamed: 0,label,name
0,1,brace
1,2,ledgerLine
2,3,repeatDot
3,4,segno


### prepare the image/annotations

In [6]:
train_images.rename(columns={'id': 'img_id'}, inplace=True)
test_images.rename(columns={'id': 'img_id'}, inplace=True)
test_images.head(3)

Unnamed: 0,img_id,filename,width,height,ann_ids
0,1,lg-75827152-aug-lilyjazz-.png,1960,2772,"[160131, 160132, 160133, 160134, 160135, 16013..."
1,5,lg-210359136-aug-lilyjazz--page-14.png,1960,2772,"[503778, 503779, 503780, 503781, 503782, 50378..."
2,6,lg-366136986510816260-aug-gutenberg1939-.png,1960,2772,"[769765, 769766, 769767, 769768, 769769, 76977..."


In [7]:
# remap the class labels
class_mapping = dict(zip(test_labels['old_id'], test_labels['labels']))

# Define a function to replace each cat_id list with corresponding class names
def map_cat_ids_to_classes(cat_ids):
    return list(set([class_mapping.get(str(cat_id)) for cat_id in cat_ids]))[0]

# Apply this function to the cat_id column in train and test obboxs DataFrames
train_obboxs['labels'] = train_obboxs['cat_id'].apply(map_cat_ids_to_classes)
test_obboxs['labels'] = test_obboxs['cat_id'].apply(map_cat_ids_to_classes)

# Function to extract duration and relative position from comments
def extract_info(comment):
    duration = re.search(r'duration:(\d+);', comment)
    rel_position = re.search(r'rel_position:(-?\d+);', comment)
    return [int(duration.group(1)) if duration else None, int(rel_position.group(1)) if rel_position else None]
    
# Apply the function to create new columns
train_obboxs[['duration', 'rel_position']] = train_obboxs['comments'].apply(extract_info).tolist()
test_obboxs[['duration', 'rel_position']] = test_obboxs['comments'].apply(extract_info).tolist()
# set items with no duration to 0
train_obboxs['duration'] = train_obboxs['duration'].replace(np.nan,0)
test_obboxs['duration'] = test_obboxs['duration'].replace(np.nan,0)
# create a mask for the rel_position to mark where the rel_position is relevent
train_obboxs['rel_position_mask'] = train_obboxs['rel_position'].notna().astype(int)
test_obboxs['rel_position_mask'] = test_obboxs['rel_position'].notna().astype(int)
# set items with no rel_position to 50 (nothing has a position this high)
# we may need to reapproach this with a KNN inference
train_obboxs['rel_position'] = train_obboxs['rel_position'].replace(np.nan,50)
test_obboxs['rel_position'] = test_obboxs['rel_position'].replace(np.nan,50)

# clean up
train_obboxs.reset_index(inplace=True)
test_obboxs.reset_index(inplace=True)
train_obboxs.drop(['cat_id','comments'], axis=1, inplace=True)
test_obboxs.drop(['cat_id','comments'], axis=1, inplace=True)
train_obboxs.rename(columns={'index': 'ann_id'}, inplace=True)
test_obboxs.rename(columns={'index': 'ann_id'}, inplace=True)
train_obboxs['ann_id'] = train_obboxs['ann_id'].astype(int)
test_obboxs['ann_id'] = test_obboxs['ann_id'].astype(int)
train_obboxs['area'] = train_obboxs['area'].astype(int)
test_obboxs['area'] = test_obboxs['area'].astype(int)
train_obboxs['img_id'] = train_obboxs['img_id'].astype(int)
test_obboxs['img_id'] = test_obboxs['img_id'].astype(int)
test_obboxs.iloc[100:104]

Unnamed: 0,ann_id,a_bbox,o_bbox,area,img_id,labels,duration,rel_position,rel_position_mask
100,101,"[1466.0, 338.0, 1467.0, 413.0]","[1467.0, 413.0, 1467.0, 338.0, 1466.0, 338.0, ...",152,1180,52.0,0.0,50.0,0
101,102,"[1500.0, 211.0, 1520.0, 228.0]","[1522.0, 224.00001525878906, 1517.0, 209.00001...",271,1180,37.0,8.0,-3.0,1
102,103,"[1500.0, 318.0, 1520.0, 335.0]","[1523.0, 325.0, 1512.0, 314.0, 1497.5, 328.5, ...",275,1180,35.0,8.0,4.0,1
103,104,"[1519.0, 136.0, 1520.0, 217.0]","[1520.0, 217.0, 1520.0, 136.0, 1519.0, 136.0, ...",164,1180,52.0,0.0,50.0,0


## join the tables together to get one big table with the complete info for every annotation

In [8]:
train_data = pd.merge(train_obboxs, train_images, on='img_id', how='inner')
test_data = pd.merge(test_obboxs, test_images, on='img_id', how='inner')
train_data.drop('ann_ids', axis=1, inplace=True)
test_data.drop('ann_ids', axis=1, inplace=True)
train_data.head(4)

Unnamed: 0,ann_id,a_bbox,o_bbox,area,img_id,labels,duration,rel_position,rel_position_mask,filename,width,height
0,1020,"[116.0, 139.0, 2315.0, 206.0]","[2315.0, 206.0, 2315.0, 139.0, 116.0, 139.0, 1...",18945,679,165.0,0.0,50.0,0,lg-877777775968732096-aug-gonville--page-3.png,2431,3439
1,1021,"[116.0, 309.0, 2315.0, 376.0]","[2315.0, 376.0, 2315.0, 309.0, 116.0, 309.0, 1...",19223,679,165.0,0.0,50.0,0,lg-877777775968732096-aug-gonville--page-3.png,2431,3439
2,1022,"[1880.0, 561.0, 1911.0, 564.0]","[1911.0, 564.0, 1911.0, 561.0, 1880.0, 561.0, ...",120,679,2.0,0.0,50.0,0,lg-877777775968732096-aug-gonville--page-3.png,2431,3439
3,1023,"[1883.0, 578.0, 1911.0, 580.0]","[1911.0, 580.0, 1911.0, 578.0, 1883.0, 578.0, ...",27,679,2.0,0.0,50.0,0,lg-877777775968732096-aug-gonville--page-3.png,2431,3439


### compute the yolo bounding boxes

In [9]:
def corners_to_yolo(bbox, img_width, img_height):
    polygon = Polygon([(bbox[i], bbox[i + 1]) for i in range(0, len(bbox), 2)])
    min_rect = polygon.minimum_rotated_rectangle

    # Check if the minimum rotated rectangle is a point
    if isinstance(min_rect, Point):
        # Handle the case where the shape is a point by creating a small box around it
        x, y = min_rect.x, min_rect.y
        min_rect = Polygon([(x-1, y-1), (x+1, y-1), (x+1, y+1), (x-1, y+1)])
        return 'invalid'

    # check if symbol is a line and add 1 px padding if so (almost always stems)
    elif isinstance(min_rect, LineString):
        # Handle the case where the shape is a line by padding
        x_coords, y_coords = zip(*min_rect.coords)
        min_x, max_x = min(x_coords), max(x_coords)
        min_y, max_y = min(y_coords), max(y_coords)
        min_rect = Polygon([(min_x-1, min_y-1), (max_x+1, min_y-1), (max_x+1, max_y+1), (min_x-1, max_y+1)])

    corners = np.array(min_rect.exterior.coords)
    edge1 = np.linalg.norm(corners[0] - corners[1])
    edge2 = np.linalg.norm(corners[1] - corners[2])
    width = max(edge1, edge2)
    height = min(edge1, edge2)
    center = min_rect.centroid.coords[0]
    center_x = center[0]
    center_y = center[1]
    angle = np.rad2deg(np.arctan2(corners[1][1] - corners[0][1], corners[1][0] - corners[0][0]))

    center_x /= img_width
    center_y /= img_height
    width /= img_width
    height /= img_height

    return [center_x, center_y, width, height, angle]
    
# Function to convert corners to YOLO format for each row in the DataFrame
def apply_corners_to_yolo(row):
    return corners_to_yolo(row['o_bbox'], row['width'], row['height'])

In [10]:
# Add a column with bounding boxes in (center x, center y, W, H, R)*normalized format
train_data['yolo_bbox'] = train_data.apply(apply_corners_to_yolo, axis=1)
test_data['yolo_bbox'] = test_data.apply(apply_corners_to_yolo, axis=1)
# drop invalid boxes
train_data = train_data[train_data['yolo_bbox']!='invalid']
test_data = test_data[test_data['yolo_bbox']!='invalid']
train_data.head(1)

Unnamed: 0,ann_id,a_bbox,o_bbox,area,img_id,labels,duration,rel_position,rel_position_mask,filename,width,height,yolo_bbox
0,1020,"[116.0, 139.0, 2315.0, 206.0]","[2315.0, 206.0, 2315.0, 139.0, 116.0, 139.0, 1...",18945,679,165.0,0.0,50.0,0,lg-877777775968732096-aug-gonville--page-3.png,2431,3439,"[0.5, 0.05015993021227101, 0.904566022213081, ..."


In [11]:
train_data_agg = train_data.groupby('filename').agg({
    'ann_id': lambda x: list(x),
    'a_bbox': lambda x: list(x),
    'o_bbox': lambda x: list(x),
    'area': lambda x: list(x),
    'duration': lambda x: list(x),
    'rel_position': lambda x: list(x), 
    'labels': lambda x: list(x),
    'img_id': 'first',  # assuming all entries per image have the same img_id
    'width': 'first',   # assuming all entries per image have the same width
    'height': 'first'  # assuming all entries per image have the same height
}).reset_index()

In [12]:
train_data_agg.head(1)

Unnamed: 0,filename,ann_id,a_bbox,o_bbox,area,duration,rel_position,labels,img_id,width,height
0,lg-101766503886095953-aug-beethoven--page-1.png,"[632316, 632317, 632318, 632319, 632320, 63232...","[[233.0, 376.0, 1866.0, 443.0], [233.0, 833.0,...","[[1866.0, 443.0, 1866.0, 376.0, 233.0, 376.0, ...","[13425, 9986, 73, 75, 12613, 10958, 105, 269, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.0, 0.0, ...","[50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, -2....","[165.0, 165.0, 2.0, 2.0, 165.0, 165.0, 52.0, 3...",142,1960,2772


## Prepare the Torch dataset object

In [40]:
class MusicScoreDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None):
        """
        Args:
            dataframe (DataFrame): Pandas DataFrame containing annotations.
            root_dir (string): Directory with all the images.
            transforms (callable, optional): Optional transform to be applied on a sample.
        """
        self.annotations = dataframe
        self.root_dir = root_dir
        # Set default transforms if none are provided
        if transform is None:
            self.transform = transforms.Compose([
                transforms.ToTensor(),  # Convert images to tensors
                # transforms.Normalize(mean=[0.485], std=[0.229])  # Adjust if your image is not RGB
            ])
        else:
            self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, self.annotations['filename'].iloc[idx])
        image = Image.open(img_name).convert("L") # use grayscale
        if self.transform:
            image = self.transform(image)
        boxes = torch.as_tensor(self.annotations['a_bbox'].iloc[idx], dtype=torch.float32)
        durations = torch.as_tensor(self.annotations['duration'].iloc[idx], dtype=torch.float32)
        rel_positions = torch.as_tensor(self.annotations['rel_position'].iloc[idx], dtype=torch.float32)
        labels = torch.as_tensor(self.annotations['labels'].iloc[idx], dtype=torch.int64)
        image_id = torch.tensor([self.annotations['img_id'].iloc[idx]], dtype=torch.int64)
        area = torch.as_tensor([self.annotations['area'].iloc[idx]], dtype=torch.float32)
        iscrowd = torch.zeros((len(boxes),), dtype=torch.int64)  # Assuming no crowd

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["durations"] = durations
        target["rel_positions"] = rel_positions
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        return image, target

In [41]:
dataset = MusicScoreDataset(train_data_agg, './data/ds2_dense/images/')

In [42]:
dataset.__getitem__(0)

(tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]),
 {'boxes': tensor([[ 233.,  376., 1866.,  443.],
          [ 233.,  833., 1866.,  899.],
          [1633.,  587., 1666.,  589.],
          ...,
          [ 732., 2267.,  752., 2284.],
          [ 751., 2187.,  752., 2272.],
          [ 752., 2188.,  769., 2234.]]),
  'labels': tensor([165, 165,   2,   2, 165, 165,  52,  35, 100,  37,  52,  41,  42,  35,
           52,  52,  52,  35,  37,  37,  35,  58,  52,  52, 146,  52,  52,  35,
           37,  37,  37,  58,  52, 100,  37,  52,  37,  35,  35,  37,  52, 100,
           37,  58,  52, 146,  58,  52,  42,  37,  58,  52,  37,  35,  35, 100,
          100,  99,  58,  52,  52,  37,  35,  35,  35,  52,  37,  52,  52,  41,
          100,  99,  58,  52,  35,  35,  35,  41,  58,

## prepare the model- we will use faster r-cnn resnet50

In [43]:
def get_model(num_classes):
    # Load a model pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    
    # Replace the classifier with a new one, that has
    # num_classes which is user-defined
    num_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(num_features, num_classes)
    
    return model

## train the model

In [44]:
def train_model(model, data_loader, optimizer, num_epochs=10):
    model.train()
    # note there is no loss function defined because the model deos it for us
    for epoch in range(num_epochs):
        for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} of {num_epochs}, Loss: {losses.item()}")

In [45]:
# Setup
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_classes = len(unique_labels) + 1  # Define the number of classes including background
model = get_model(num_classes).to(device)
data_loader = DataLoader(MusicScoreDataset(train_data_agg, './data/ds2_dense/images/'), batch_size=2, shuffle=True)
optimizer = SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

In [46]:
# Start training
train_model(model, data_loader, optimizer)

RuntimeError: stack expects each tensor to be equal size, but got [458, 4] at entry 0 and [564, 4] at entry 1