In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import math
import numpy as np
import re
from shapely.geometry import Polygon, LineString, Point
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import CocoDetection
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision.transforms as T
from torch.optim import SGD, Adam, Adadelta
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision import transforms
from torch.utils.data._utils.collate import default_collate
import torchvision
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.transforms import functional as F
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import random
from math import radians, cos, sin
import cv2
import ast

### load json annotation dictionary

In [2]:
# Parse the train and test annotation files into dictionaries (data1 = train, data2 = test)
with open('./data/ds2_dense/deepscores_train.json') as train_file, \
        open('./data/ds2_dense/deepscores_test.json') as test_file:
    data1 = json.load(train_file)
    data2 = json.load(test_file)

In [3]:
# Build one DataFrame per JSON section, for each split.
# 'annotations' is transposed so that each annotation becomes a row.
# train_labels = pd.DataFrame( data1['categories']).T
# test_labels = pd.DataFrame( data2['categories']).T
train_images, train_obboxs = (
    pd.DataFrame(data1['images']),
    pd.DataFrame(data1['annotations']).T,
)
test_images, test_obboxs = (
    pd.DataFrame(data2['images']),
    pd.DataFrame(data2['annotations']).T,
)

### prepare the labels - I adjusted the json slightly so use `new_labels.csv`

In [4]:
# Load the adjusted label mapping (one row per original category id)
labels_csv = 'new_labels.csv'
raw_labels = pd.read_csv(labels_csv)
raw_labels.head()

Unnamed: 0,old_index,old_id,name,dataset,color,label
0,0,1,brace,deepscores,1,1
1,1,137,brace,muscima++,1,1
2,2,2,ledgerLine,deepscores,2,2
3,3,138,ledgerLine,muscima++,2,2
4,4,3,repeatDot,deepscores,7,3


In [5]:
# One row per distinct label (first name encountered wins), ordered by label value
unique_labels = (
    raw_labels.loc[:, ['label', 'name']]
    .drop_duplicates(subset=['label'])
    .sort_values(by=['label'])
    .reset_index(drop=True)
)
unique_labels.head(4)

Unnamed: 0,label,name
0,1,brace
1,2,ledgerLine
2,3,repeatDot
3,4,segno


### prepare the images/annotations

In [6]:
# Call the image key 'img_id' in both image tables (the annotation tables are joined on it later)
image_key_rename = {'id': 'img_id'}
train_images = train_images.rename(columns=image_key_rename)
test_images = test_images.rename(columns=image_key_rename)
test_images.head(3)

Unnamed: 0,img_id,filename,width,height,ann_ids
0,1,lg-75827152-aug-lilyjazz-.png,1960,2772,"[160131, 160132, 160133, 160134, 160135, 16013..."
1,5,lg-210359136-aug-lilyjazz--page-14.png,1960,2772,"[503778, 503779, 503780, 503781, 503782, 50378..."
2,6,lg-366136986510816260-aug-gutenberg1939-.png,1960,2772,"[769765, 769766, 769767, 769768, 769769, 76977..."


#### The data has multiple labels for each object, but we only need one. Some of the object classes were redundant, so I mapped these to the same label in the new_labels.csv file. Some of the labels are very similar but carry slightly different contextual information, so in cases where this occurs we will select the label with the most information. I have ordered the new_labels.csv file so that the higher-precedence item has a higher label value. Therefore, all we need to do is select the label with the highest number.

In [7]:
# remap the class labels: original category id (as a string) -> consolidated label
class_mapping = {
    old_id: new_label
    for old_id, new_label in zip(raw_labels['old_id'].astype(str), raw_labels['label'])
}

# Define a function to replace each cat_id list with corresponding class names
def map_cat_ids_to_classes(cat_ids):
    """Look up every id of ``cat_ids`` in ``class_mapping``.

    Ids are converted to strings before the lookup; an id without an entry
    yields ``None`` at its position.
    """
    mapped = []
    for cat_id in cat_ids:
        mapped.append(class_mapping.get(str(cat_id)))
    return mapped

def clean_labels(label_list):
    """Return the distinct labels of ``label_list`` with ``None`` entries removed.

    The result is a list built from a set, so its order is not meaningful.
    """
    distinct = set()
    for label in label_list:
        if label is None:
            continue
        distinct.add(label)
    return list(distinct)
    
def select_highest_precedence(label_list):
    """Return the largest label in ``label_list``.

    Raises ``ValueError`` (from ``max``) when the list is empty.
    """
    highest = max(label_list)
    return highest

# For both splits: map the cat_id list to labels, drop duplicates/None,
# then keep the single highest-valued label per annotation
for obboxs in (train_obboxs, test_obboxs):
    obboxs['label'] = (
        obboxs['cat_id']
        .apply(map_cat_ids_to_classes)
        .apply(clean_labels)
        .apply(select_highest_precedence)
    )
train_obboxs.head()

Unnamed: 0,a_bbox,o_bbox,cat_id,area,img_id,comments,label
1020,"[116.0, 139.0, 2315.0, 206.0]","[2315.0, 206.0, 2315.0, 139.0, 116.0, 139.0, 1...","[135, 208]",18945,679,instance:#000010;,155
1021,"[116.0, 309.0, 2315.0, 376.0]","[2315.0, 376.0, 2315.0, 309.0, 116.0, 309.0, 1...","[135, 208]",19223,679,instance:#000021;,155
1022,"[1880.0, 561.0, 1911.0, 564.0]","[1911.0, 564.0, 1911.0, 561.0, 1880.0, 561.0, ...","[2, 138]",120,679,instance:#000022;,2
1023,"[1883.0, 578.0, 1911.0, 580.0]","[1911.0, 580.0, 1911.0, 578.0, 1883.0, 578.0, ...","[2, 138]",27,679,instance:#000023;,2
1024,"[1827.0, 561.0, 1857.0, 564.0]","[1857.0, 564.0, 1857.0, 561.0, 1827.0, 561.0, ...","[2, 138]",112,679,instance:#000024;,2


#### now we can extract the relative position and duration information. as far as I can tell, this only applies to notes

In [8]:
# Function to extract duration and relative position from comments
def extract_info(comment):
    """Parse ``duration:<digits>;`` and ``rel_position:<signed digits>;`` from ``comment``.

    Returns:
        ``[duration, rel_position]`` as ints; an entry is ``None`` when the
        corresponding field does not occur in the comment.
    """
    parsed = []
    for pattern in (r'duration:(\d+);', r'rel_position:(-?\d+);'):
        found = re.search(pattern, comment)
        if found:
            parsed.append(int(found.group(1)))
        else:
            parsed.append(None)
    return parsed
    
# Parse every comment and spread the [duration, rel_position] pairs over two new columns
for obboxs in (train_obboxs, test_obboxs):
    parsed_pairs = obboxs['comments'].apply(extract_info).tolist()
    obboxs[['duration', 'rel_position']] = parsed_pairs
train_obboxs.iloc[100:104]

Unnamed: 0,a_bbox,o_bbox,cat_id,area,img_id,comments,label,duration,rel_position
1120,"[1295.0, 134.0, 1296.0, 186.0]","[1296.0, 186.0, 1296.0, 134.0, 1295.0, 134.0, ...","[42, 161]",105,679,instance:#000089;,62,,
1121,"[1321.0, 612.0, 1341.0, 629.0]","[1343.3448486328125, 623.862060546875, 1337.48...","[27, 157]",276,679,instance:#00008a;duration:16;rel_position:-13;,45,16.0,-13.0
1122,"[1340.0, 513.0, 1341.0, 617.0]","[1341.0, 617.0, 1341.0, 513.0, 1340.0, 513.0, ...","[42, 161]",201,679,instance:#00008b;,62,,
1123,"[1358.0, 156.0, 1378.0, 172.0]","[1378.0, 156.0, 1358.0, 156.0, 1358.0, 172.0, ...","[27, 157]",279,679,instance:#00008c;duration:8;rel_position:1;,45,8.0,1.0


#### We need to get rid of the NaN values for symbols that are not notes. Torch won't allow NaN inputs, so we will make two new feature masks which mark the "true" values of position/duration, and then impute the rest with a value unrelated to the true data. Interestingly, rests do not have a duration - we should probably fill these in based on the rest type. We should also calculate the relative position for all of the other symbols.

In [9]:
# For each sparse column, first record which rows hold a real value (mask = 1),
# then overwrite the missing entries with a sentinel:
#   duration     -> -1 (we may need to reapproach this with another method)
#   rel_position -> 50 (nothing has a position this high;
#                       we may need to reapproach this with a KNN inference)
sentinel_by_column = {'duration': -1, 'rel_position': 50}
for obboxs in (train_obboxs, test_obboxs):
    for column, sentinel in sentinel_by_column.items():
        # the mask must be computed before the NaNs are replaced
        obboxs[column + '_mask'] = obboxs[column].notna().astype(int)
        obboxs[column] = obboxs[column].replace(np.nan, sentinel)
train_obboxs.iloc[100:104]

Unnamed: 0,a_bbox,o_bbox,cat_id,area,img_id,comments,label,duration,rel_position,duration_mask,rel_position_mask
1120,"[1295.0, 134.0, 1296.0, 186.0]","[1296.0, 186.0, 1296.0, 134.0, 1295.0, 134.0, ...","[42, 161]",105,679,instance:#000089;,62,-1.0,50.0,0,0
1121,"[1321.0, 612.0, 1341.0, 629.0]","[1343.3448486328125, 623.862060546875, 1337.48...","[27, 157]",276,679,instance:#00008a;duration:16;rel_position:-13;,45,16.0,-13.0,1,1
1122,"[1340.0, 513.0, 1341.0, 617.0]","[1341.0, 617.0, 1341.0, 513.0, 1340.0, 513.0, ...","[42, 161]",201,679,instance:#00008b;,62,-1.0,50.0,0,0
1123,"[1358.0, 156.0, 1378.0, 172.0]","[1378.0, 156.0, 1358.0, 156.0, 1358.0, 172.0, ...","[27, 157]",279,679,instance:#00008c;duration:8;rel_position:1;,45,8.0,1.0,1,1


#### there are some bounding boxes with 0 width and/or height. We will add 1px padding

In [10]:
def adjust_bbox(bbox):
    """Give a degenerate ``[x_min, y_min, x_max, y_max]`` box a non-zero extent.

    An axis whose min and max coincide is widened by 1 on both sides;
    axes that already have an extent are returned unchanged.
    """
    left, top, right, bottom = bbox
    zero_width = left == right
    zero_height = top == bottom
    if zero_width:
        left, right = left - 1, right + 1
    if zero_height:
        top, bottom = top - 1, bottom + 1
    return [left, top, right, bottom]

In [11]:
# Apply the function to the 'a_bbox' column of the DataFrame.
# One pass is enough: after padding, min and max of an axis always differ, so
# running adjust_bbox again over 'padded_bbox' cannot change anything.
train_obboxs['padded_bbox'] = train_obboxs['a_bbox'].apply(adjust_bbox)
test_obboxs['padded_bbox'] = test_obboxs['a_bbox'].apply(adjust_bbox)
train_obboxs.head(1)

Unnamed: 0,a_bbox,o_bbox,cat_id,area,img_id,comments,label,duration,rel_position,duration_mask,rel_position_mask,padded_bbox
1020,"[116.0, 139.0, 2315.0, 206.0]","[2315.0, 206.0, 2315.0, 139.0, 116.0, 139.0, 1...","[135, 208]",18945,679,instance:#000010;,155,-1.0,50.0,0,0,"[116.0, 139.0, 2315.0, 206.0]"


#### clean up the df

In [12]:
# clean up
def _tidy_annotations(obboxs):
    """Return a tidied copy of an annotation frame.

    The index becomes an integer 'ann_id' column, the 'cat_id' and 'comments'
    columns are dropped, and 'ann_id', 'area' and 'img_id' are cast to int.
    """
    return (
        obboxs.reset_index()
        .drop(['cat_id', 'comments'], axis=1)
        .rename(columns={'index': 'ann_id'})
        .astype({'ann_id': int, 'area': int, 'img_id': int})
    )

train_obboxs = _tidy_annotations(train_obboxs)
test_obboxs = _tidy_annotations(test_obboxs)
test_obboxs.iloc[100:104]

Unnamed: 0,ann_id,a_bbox,o_bbox,area,img_id,label,duration,rel_position,duration_mask,rel_position_mask,padded_bbox
100,101,"[1466.0, 338.0, 1467.0, 413.0]","[1467.0, 413.0, 1467.0, 338.0, 1466.0, 338.0, ...",152,1180,62,-1.0,50.0,0,0,"[1466.0, 338.0, 1467.0, 413.0]"
101,102,"[1500.0, 211.0, 1520.0, 228.0]","[1522.0, 224.00001525878906, 1517.0, 209.00001...",271,1180,45,8.0,-3.0,1,1,"[1500.0, 211.0, 1520.0, 228.0]"
102,103,"[1500.0, 318.0, 1520.0, 335.0]","[1523.0, 325.0, 1512.0, 314.0, 1497.5, 328.5, ...",275,1180,43,8.0,4.0,1,1,"[1500.0, 318.0, 1520.0, 335.0]"
103,104,"[1519.0, 136.0, 1520.0, 217.0]","[1520.0, 217.0, 1520.0, 136.0, 1519.0, 136.0, ...",164,1180,62,-1.0,50.0,0,0,"[1519.0, 136.0, 1520.0, 217.0]"


## join the tables together to get one big table with the complete info for every annotation-

In [13]:
# Inner-join every annotation with the metadata of its image, then drop the
# per-image 'ann_ids' list, which is not needed on annotation rows
train_data = train_obboxs.merge(train_images, on='img_id', how='inner').drop(columns='ann_ids')
test_data = test_obboxs.merge(test_images, on='img_id', how='inner').drop(columns='ann_ids')
train_data.head(1)

Unnamed: 0,ann_id,a_bbox,o_bbox,area,img_id,label,duration,rel_position,duration_mask,rel_position_mask,padded_bbox,filename,width,height
0,1020,"[116.0, 139.0, 2315.0, 206.0]","[2315.0, 206.0, 2315.0, 139.0, 116.0, 139.0, 1...",18945,679,155,-1.0,50.0,0,0,"[116.0, 139.0, 2315.0, 206.0]",lg-877777775968732096-aug-gonville--page-3.png,2431,3439


## concatenate the barline annotations

In [14]:
barlines_df = pd.read_csv('./data/ds2_dense/barlines.csv')

def convert_str_to_list(coord_str):
    """Parse the Python-literal text of a CSV cell (e.g. ``"[1.0, 2.0]"``) into an object."""
    parsed = ast.literal_eval(coord_str)
    return parsed

# The bbox columns are read from the CSV as strings; turn them back into lists
for bbox_column in ('a_bbox', 'o_bbox', 'padded_bbox'):
    barlines_df[bbox_column] = barlines_df[bbox_column].apply(convert_str_to_list)

# Append the barline rows whose image file occurs in the train split
in_train = barlines_df['filename'].isin(train_data['filename'])
missing_annotations = barlines_df[in_train]
train_data = pd.concat([train_data, missing_annotations], ignore_index=True)

# Same for the test split
in_test = barlines_df['filename'].isin(test_data['filename'])
missing_annotations = barlines_df[in_test]
test_data = pd.concat([test_data, missing_annotations], ignore_index=True)

### compute the yolo bounding boxes

In [None]:
def corners_to_yolo(bbox, img_width, img_height):
    """Convert a flat list of polygon corners into a normalised oriented box.

    Args:
        bbox: flat sequence ``[x1, y1, x2, y2, ...]`` of corner coordinates.
        img_width: image width; divides the centre x and the box width.
        img_height: image height; divides the centre y and the box height.

    Returns:
        ``[center_x, center_y, width, height, angle]`` where ``width`` is the
        longer side of the minimum rotated rectangle, ``height`` the shorter
        one, and ``angle`` the direction (in degrees) of the longer side.
        Returns the string ``'invalid'`` when the corners collapse to a
        single point.
    """
    polygon = Polygon([(bbox[i], bbox[i + 1]) for i in range(0, len(bbox), 2)])
    min_rect = polygon.minimum_rotated_rectangle

    # A polygon that degenerates to a point cannot be turned into a box
    if isinstance(min_rect, Point):
        return 'invalid'

    # check if symbol is a line and add 1 px padding if so (almost always stems)
    if isinstance(min_rect, LineString):
        x_coords, y_coords = zip(*min_rect.coords)
        min_x, max_x = min(x_coords), max(x_coords)
        min_y, max_y = min(y_coords), max(y_coords)
        min_rect = Polygon([(min_x-1, min_y-1), (max_x+1, min_y-1), (max_x+1, max_y+1), (min_x-1, max_y+1)])

    corners = np.array(min_rect.exterior.coords)
    first_edge = corners[1] - corners[0]
    second_edge = corners[2] - corners[1]
    first_len = np.linalg.norm(first_edge)
    second_len = np.linalg.norm(second_edge)

    # Measure the angle along the same edge that is reported as the width;
    # otherwise the angle is off by 90 degrees whenever the first edge is the
    # shorter one.
    if first_len >= second_len:
        long_edge, width, height = first_edge, first_len, second_len
    else:
        long_edge, width, height = second_edge, second_len, first_len
    angle = np.rad2deg(np.arctan2(long_edge[1], long_edge[0]))

    center_x, center_y = min_rect.centroid.coords[0]

    center_x /= img_width
    center_y /= img_height
    width /= img_width
    height /= img_height

    return [center_x, center_y, width, height, angle]
    
# Function to convert corners to YOLO format for each row in the DataFrame
def apply_corners_to_yolo(row):
    """Row adapter: feed a row's 'o_bbox', 'width' and 'height' to ``corners_to_yolo``."""
    corners = row['o_bbox']
    img_width, img_height = row['width'], row['height']
    return corners_to_yolo(corners, img_width, img_height)

In [None]:
# Add a column with bounding boxes in (center x, center y, W, H, R)*normalized format
train_data['yolo_bbox'] = train_data.apply(apply_corners_to_yolo, axis=1)
test_data['yolo_bbox'] = test_data.apply(apply_corners_to_yolo, axis=1)
# drop invalid boxes (rows for which the conversion returned 'invalid')
train_is_valid = train_data['yolo_bbox'] != 'invalid'
test_is_valid = test_data['yolo_bbox'] != 'invalid'
train_data = train_data[train_is_valid]
test_data = test_data[test_is_valid]
train_data.head(1)

## aggregate the data so there is one image per line. all other variables will be aggregated into lists

In [15]:
# Per-annotation columns are collected into one list per image; the per-image
# columns keep their first value (assuming all entries per image share the
# same img_id, width and height).
list_columns = [
    'ann_id', 'a_bbox', 'o_bbox', 'padded_bbox', 'area',
    'duration', 'duration_mask', 'rel_position', 'rel_position_mask', 'label',
]
per_image_columns = ['img_id', 'width', 'height']
agg_spec = {
    **{column: list for column in list_columns},
    **{column: 'first' for column in per_image_columns},
}

train_data_agg = train_data.groupby('filename').agg(agg_spec).reset_index()
test_data_agg = test_data.groupby('filename').agg(agg_spec).reset_index()

In [16]:
# Inspect the first aggregated row (one row per image file, annotation fields as lists)
train_data_agg.head(1)

Unnamed: 0,filename,ann_id,a_bbox,o_bbox,padded_bbox,area,duration,duration_mask,rel_position,rel_position_mask,label,img_id,width,height
0,lg-101766503886095953-aug-beethoven--page-1.png,"[632316, 632317, 632318, 632319, 632320, 63232...","[[233.0, 376.0, 1866.0, 443.0], [233.0, 833.0,...","[[1866.0, 443.0, 1866.0, 376.0, 233.0, 376.0, ...","[[233.0, 376.0, 1866.0, 443.0], [233.0, 833.0,...","[13425, 9986, 73, 75, 12613, 10958, 105, 269, ...","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 8.0...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, ...","[50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, -2....","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, ...","[155, 155, 2, 2, 155, 155, 62, 43, 112, 45, 62...",142.0,1960,2772


### The last thing is to handle the differing image sizes. The problem here is that if you add padding, or resize the image, you have to go back and adjust the coordinates of all of the annotations for that image. For now, we will just select the most common image size, which represents the majority of our images, and use that.

In [17]:
# Largest image width and height over both splits
max_width, max_height = (
    max(train_data[dimension].max(), test_data[dimension].max())
    for dimension in ('width', 'height')
)
max_width, max_height

(3842, 5434)

In [18]:
# Median (width, height) of the training images
tuple(train_data_agg[dimension].median() for dimension in ('width', 'height'))

(1960.0, 2772.0)

In [19]:
# Keep only the images with the median size found above (1960 x 2772)
train_has_median_size = (train_data_agg['width'] == 1960) & (train_data_agg['height'] == 2772)
test_has_median_size = (test_data_agg['width'] == 1960) & (test_data_agg['height'] == 2772)
train_data_agg_final = train_data_agg[train_has_median_size]
test_data_agg_final = test_data_agg[test_has_median_size]
print("Proportion of images that are the median width/height:")
for kept, everything in ((train_data_agg_final, train_data_agg), (test_data_agg_final, test_data_agg)):
    print(len(kept)/len(everything))

Proportion of images that are the median width/height:
0.7540381791483113
0.7215909090909091


## Prepare the Torch dataset object

In [21]:
class MusicScoreDataset(Dataset):
    """Dataset yielding ``(image, target)`` pairs, one per row of the annotation frame."""

    def __init__(self, dataframe, root_dir, transform=None):
        """
        Args:
            dataframe (DataFrame): Pandas DataFrame containing annotations, one row
                per image with the per-annotation fields stored as lists.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Transform applied to the loaded image.
                Defaults to ``transforms.ToTensor()``.
        """
        self.annotations = dataframe
        self.root_dir = root_dir
        # Set default transforms if none are provided
        if transform is None:
            self.transform = transforms.Compose([
                transforms.ToTensor(),  # Convert images to tensors
                # transforms.Normalize(mean=[0.485], std=[0.229])  # Adjust if your image is not RGB
            ])
        else:
            self.transform = transform

    def __len__(self):
        """Number of rows (images) in the annotation frame."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load image ``idx`` in grayscale and build its target dictionary.

        Returns:
            ``(image, target)`` where ``target`` holds the tensors ``boxes``,
            ``labels``, ``durations``, ``rel_positions``, ``duration_masks``,
            ``rel_position_masks``, ``image_id``, ``area`` and ``iscrowd``.
        """
        def field(column):
            # value of `column` for the requested row
            return self.annotations[column].iloc[idx]

        img_name = os.path.join(self.root_dir, field('filename'))
        # Close the file handle as soon as the pixel data has been converted
        with Image.open(img_name) as raw_image:
            image = raw_image.convert("L")  # use grayscale
        if self.transform:
            image = self.transform(image)

        boxes = torch.as_tensor(field('padded_bbox'), dtype=torch.float32)
        target = {
            "boxes": boxes,
            "labels": torch.as_tensor(field('label'), dtype=torch.int64),
            "durations": torch.as_tensor(field('duration'), dtype=torch.float32),
            "rel_positions": torch.as_tensor(field('rel_position'), dtype=torch.float32),
            "duration_masks": torch.as_tensor(field('duration_mask'), dtype=torch.int32),
            "rel_position_masks": torch.as_tensor(field('rel_position_mask'), dtype=torch.int32),
            "image_id": torch.tensor([field('img_id')], dtype=torch.int64),
            # 'area' is already a list with one entry per box; wrapping it in
            # another list would add a spurious leading dimension
            "area": torch.as_tensor(field('area'), dtype=torch.float32),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64),  # Assuming no crowd
        }
        return image, target

In [22]:
# Training dataset over the images that have the median size
dataset = MusicScoreDataset(dataframe=train_data_agg_final, root_dir='./data/ds2_dense/images/')

In [23]:
# Fetch the first (image, target) pair as a smoke test
dataset[0]

(tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]),
 {'boxes': tensor([[ 233.,  376., 1866.,  443.],
          [ 233.,  833., 1866.,  899.],
          [1633.,  587., 1666.,  589.],
          ...,
          [ 732., 2267.,  752., 2284.],
          [ 751., 2187.,  752., 2272.],
          [ 752., 2188.,  769., 2234.]]),
  'labels': tensor([155, 155,   2,   2, 155, 155,  62,  43, 112,  45,  62,  50,  48,  43,
           62,  62,  62,  43,  45,  45,  43,  69,  62,  62, 139,  62,  62,  43,
           45,  45,  45,  69,  62, 112,  45,  62,  45,  43,  43,  45,  62, 112,
           45,  69,  62, 139,  69,  62,  48,  45,  69,  62,  45,  43,  43, 112,
          112, 111,  69,  62,  62,  45,  43,  43,  43,  62,  45,  62,  62,  50,
          112, 111,  69,  62,  43,  43,  43,  50,  69,

## Prepare the model - we will use Faster R-CNN ResNet-50 and define a collate function to handle the changing size of the inputs (since each image will have a different number of annotations, yet Torch expects a fixed-size tensor)

In [24]:
def get_model(num_classes):
    """Build a Faster R-CNN (ResNet-50 FPN) whose box predictor outputs ``num_classes`` classes.

    The detector is created with ``pretrained=True`` and only its box
    predictor is swapped for a fresh ``FastRCNNPredictor``.
    """
    # Load a model pre-trained on COCO
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # The new head must accept the same feature size as the head it replaces
    head_in_features = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_in_features, num_classes)

    return detector

In [25]:
def collate_fn(batch):
    """Collate ``(image, target)`` samples into a stacked image tensor and a list of targets.

    Images are stacked along a new first dimension. Each target is copied
    into a fresh dictionary (values are not copied), restricted to the keys
    of the first sample's target, so targets stay separate per image.
    """
    image_list = []
    target_list = []
    for sample in batch:
        image_list.append(sample[0])
        target_list.append(sample[1])

    # Stack images using torch.stack to create a batch
    images = torch.stack(image_list, 0)

    # One dictionary per sample; nothing is mixed across samples
    shared_keys = target_list[0].keys()
    collated_targets = [
        {key: target[key] for key in shared_keys}
        for target in target_list
    ]

    return images, collated_targets

## train the model

In [26]:
def train_model(model, data_loader, optimizer, num_epochs=10, device=None):
    """Train ``model`` for ``num_epochs`` passes over ``data_loader``.

    Args:
        model: module that, in training mode, maps ``(images, targets)`` to a
            dictionary of loss tensors.
        data_loader: iterable of ``(images, targets)`` batches, where
            ``targets`` is a list of dictionaries.
        optimizer: optimizer over the model's parameters.
        num_epochs: number of passes over ``data_loader``.
        device: device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU for a parameter-less model).

    Raises:
        ValueError: if ``data_loader`` yields no batches.

    Prints the mean batch loss after every epoch.
    """
    model.train()
    if device is None:
        # Follow the model instead of relying on a notebook-level global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for images, targets in data_loader:
            images = [image.to(device) for image in images]
            # Ensure targets are dictionaries and move them to the appropriate device
            targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets]

            # Forward and backward passes
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            running_loss += losses.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("data_loader yielded no batches; nothing to train on")
        # Report the epoch mean rather than whatever the last batch happened to be
        print(f"Epoch {epoch+1} of {num_epochs}, Loss: {running_loss / num_batches}")

In [27]:
# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define the number of classes including background.
# NOTE(review): this presumes the label ids run contiguously from 1 to len(unique_labels) - confirm against new_labels.csv
num_classes = len(unique_labels) + 1
model = get_model(num_classes).to(device)
train_dataset = MusicScoreDataset(train_data_agg_final, './data/ds2_dense/images/')
data_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)
optimizer = SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)



In [None]:
# Start training (default number of epochs)
train_model(model=model, data_loader=data_loader, optimizer=optimizer)

In [33]:
# torch.save(model.state_dict(), './exported_model.pt')

### The model runs! Let's test it - you can grab the model from the Google Drive if you don't want to spend hours training it

In [28]:
# Define the backbone
backbone = resnet_fpn_backbone('resnet50', pretrained=True)
# Create the model
model = FasterRCNN(backbone, num_classes=num_classes)  # num_classes includes the background
# The model is kept on the CPU in this cell, so remap the checkpoint tensors to
# the CPU; without map_location a checkpoint saved from a GPU run cannot be
# loaded on a machine without CUDA.
state_dict = torch.load('./exported_model.pt', map_location='cpu')
model.load_state_dict(state_dict)
model.eval()  # Set the model to evaluation mode

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/daniel/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████████████████████████████████| 97.8M/97.8M [00:08<00:00, 12.7MB/s]


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

In [29]:
# Lookup table: label id -> symbol name
label_dict = {
    label: name
    for label, name in zip(unique_labels['label'], unique_labels['name'])
}

In [31]:
# Load the image in grayscale and turn it into a batch of one tensor
image_path = './data/ds2_dense/images/lg-101766503886095953-aug-beethoven--page-1.png'
grayscale_image = Image.open(image_path).convert("L")
image_tensor = F.to_tensor(grayscale_image).unsqueeze(0)  # Convert image to tensor

# Perform prediction
with torch.no_grad():
    predictions = model(image_tensor)

# Draw predictions on an RGB copy of the image
image = Image.open(image_path).convert("RGB")
draw = ImageDraw.Draw(image)
# Specify a larger font size for the annotations
font = ImageFont.load_default()  # Currently, load_default does not support size adjustment in PIL

detections = predictions[0]
for box_tensor, label_tensor, score_tensor in zip(detections['boxes'], detections['labels'], detections['scores']):
    score = score_tensor.item()
    if not score > 0.5:  # filter out low-confidence predictions
        continue

    boxes = box_tensor.cpu().numpy().astype(int)
    # Look up the label name using the label dictionary; default to 'Unknown' if not found
    label_name = label_dict.get(label_tensor.item(), 'Unknown')

    top_left, bottom_right = (boxes[0], boxes[1]), (boxes[2], boxes[3])
    draw.rectangle([top_left, bottom_right], outline='red', width=3)
    draw.text((boxes[0], boxes[1]-40), f'{label_name}:{score:.2f}', fill='blue', font=font)
# Save or show image
image.show()

## adding noise to an image

#### helper function to draw image with bbox

In [21]:
def show_image_with_boxes(image_path_or_bin, bounding_boxes):
    """
    Displays an image with bounding boxes drawn over it. Can handle both orthogonal and oriented bounding boxes.

    :param image_path_or_bin: Either the path to the image file (a string) or an
                              already opened PIL image. The image is converted to
                              RGB before drawing, so a passed-in image is not drawn on.
    :param bounding_boxes: A list of tuples representing the bounding boxes.
                           For orthogonal boxes: (x1, y1, x2, y2)
                           For oriented boxes: (x1, y1, x2, y2, x3, y3, x4, y4)
                           The length of the first box decides how all boxes are drawn.
                           An empty list shows the image without boxes.
    """
    # Load the image
    if isinstance(image_path_or_bin, str):
        image = Image.open(image_path_or_bin)
    else:
        image = image_path_or_bin

    image = image.convert("RGB")
    draw = ImageDraw.Draw(image)

    # Guard the look at the first box so an empty list does not raise
    is_oriented = len(bounding_boxes) > 0 and len(bounding_boxes[0]) == 8

    if is_oriented:
        for bbox in bounding_boxes:
            # Reorganize (x1, y1, ..., x4, y4) into [(x1, y1), (x2, y2), (x3, y3), (x4, y4)]
            points = [(bbox[i], bbox[i+1]) for i in range(0, len(bbox), 2)]
            draw.polygon(points, outline='red', width=2)
    else:
        for bbox in bounding_boxes:
            # Orthogonal bounding box (x1, y1, x2, y2)
            draw.rectangle(bbox, outline='red', width=2)

    # Show the image
    image.show()

In [22]:
# Example usage: draw the orthogonal boxes of one training image
image_path = './data/ds2_dense/images/lg-102414375-aug-beethoven--page-2.png'
image_bin = Image.open(image_path)
example_rows = train_data_agg_final[train_data_agg_final['filename']=='lg-102414375-aug-beethoven--page-2.png']
bboxes = example_rows['a_bbox'].values[0]
obboxes = example_rows['o_bbox'].values[0]
show_image_with_boxes(image_bin, bboxes)

### blur - make about 25% of the image blurry (varying intensity randomly)

In [109]:
def apply_random_blur(image_path_or_bin, blur_radius=None):
    """Blur one randomly placed region covering a quarter of the image.

    The region is half the image width by half the image height and always
    lies completely inside the image.

    Args:
        image_path_or_bin: path string (opened with ``Image.open``) or an image
            object. An image object is copied first, so the caller's image is
            left untouched.
        blur_radius: radius passed to ``ImageFilter.BoxBlur``; a random
            integer from 1 to 3 when ``None``.

    Returns:
        The image with the blurred region pasted back in place.
    """
    # Load the image
    if isinstance(image_path_or_bin, str):
        image = Image.open(image_path_or_bin)
    else:
        image = image_path_or_bin.copy()

    # set the intensity of the blur
    if blur_radius is None:
        blur_radius = random.randint(1, 3)

    # Integer region size: random.randint needs integer bounds, and the
    # vertical extent has to follow the image height, not its width
    region_width = image.width // 2
    region_height = image.height // 2

    # Define the area to blur (x1, y1, x2, y2)
    x1 = random.randint(0, image.width - region_width)
    y1 = random.randint(0, image.height - region_height)
    x2 = x1 + region_width
    y2 = y1 + region_height

    # Crop the area, apply blur, and paste back
    cropped_area = image.crop((x1, y1, x2, y2))
    blurred_area = cropped_area.filter(ImageFilter.BoxBlur(radius=blur_radius))
    image.paste(blurred_area, (x1, y1, x2, y2))

    return image

In [83]:
# Example usage: blur the page, then overlay its oriented boxes
image_path = './data/ds2_dense/images/lg-102414375-aug-beethoven--page-2.png'
example_rows = train_data_agg_final[train_data_agg_final['filename']=='lg-102414375-aug-beethoven--page-2.png']
obboxes = example_rows['o_bbox'].values[0]
blurred_image = apply_random_blur(image_path)
show_image_with_boxes(blurred_image, obboxes)

### random zoom

In [101]:
def random_zoom(image_path_or_bin, bounding_boxes, min_zoom=0.8, max_zoom=1.2):
    
    # Load the image
    if type(image_path_or_bin) is str:
        image = Image.open(image_path_or_bin)
    else:
        image = image_path_or_bin
        
    original_width, original_height = image.size
    scale_factor = np.random.uniform(min_zoom, max_zoom)

    # New dimensions
    new_width, new_height = int(original_width * scale_factor), int(original_height * scale_factor)

    # Resize the image
    resized_image = image.resize((new_width, new_height), Image.LANCZOS)

    # Calculate the new image's padding offsets
    pad_width = (original_width - new_width) // 2
    pad_height = (original_height - new_height) // 2

    # Create a new image with a white background
    result_image = Image.new('L', (original_width, original_height), 'white')
    result_image.paste(resized_image, (pad_width, pad_height))

    # Adjust bounding boxes
    adjusted_bboxes = []
    if len(bounding_boxes[0])==8:
        for coords in bounding_boxes:
            adjusted_coords = []
            for i in range(0, len(coords), 2):
                new_x = coords[i] * scale_factor + pad_width
                new_y = coords[i+1] * scale_factor + pad_height
                adjusted_coords.extend([new_x, new_y])
            adjusted_bboxes.append(tuple(adjusted_coords))
    else:
        for x1, y1, x2, y2 in bounding_boxes:
            new_x1 = x1 * scale_factor + pad_width
            new_y1 = y1 * scale_factor + pad_height
            new_x2 = x2 * scale_factor + pad_width
            new_y2 = y2 * scale_factor + pad_height
            adjusted_bboxes.append((new_x1, new_y1, new_x2, new_y2))

    return result_image, adjusted_bboxes

In [102]:
# Example usage: zoom one page and redraw its axis-aligned boxes.
image_path = './data/ds2_dense/images/lg-102414375-aug-beethoven--page-2.png'
bboxes = train_data_agg_final[train_data_agg_final['filename']=='lg-102414375-aug-beethoven--page-2.png']['a_bbox']
bboxes = bboxes.values[0]
# Pass the boxes loaded in this cell (`bboxes`), not an `obboxes` value that
# only exists if some earlier cell happened to define it.
zoomed_image, zoomed_bboxes = random_zoom(image_path, bboxes)
show_image_with_boxes(zoomed_image, zoomed_bboxes)

### image rotation

In [117]:
def rotate_image_and_boxes(image_path_or_bin, bounding_boxes, max_rotation_deg=5):
    """Rotate an image by a small random angle and rotate its boxes to match.

    The image is rotated with expansion and a white fill, then centre-cropped
    back to its original size. Box corners are rotated about the image centre.

    Args:
        image_path_or_bin: Path string to open with PIL, or an already opened
            image object.
        bounding_boxes: Sequence of boxes. If the first box has eight values,
            all boxes are treated as four (x, y) corners; otherwise as
            ``(x1, y1, x2, y2)`` rectangles.
        max_rotation_deg: The angle is drawn uniformly from
            ``[-max_rotation_deg, max_rotation_deg]`` degrees.

    Returns:
        ``(rotated_image, new_bounding_boxes)``. Eight-value boxes come back
        as lists of eight rotated coordinates; rectangles come back as
        ``(min_x, min_y, max_x, max_y)`` tuples enclosing the rotated corners.
    """
    # Accept either a path or an image object.
    image = Image.open(image_path_or_bin) if type(image_path_or_bin) is str else image_path_or_bin

    width, height = image.size

    # Draw the rotation angle in degrees.
    angle = np.random.uniform(-max_rotation_deg, max_rotation_deg)

    # Rotate onto an enlarged white canvas, then cut the centre back out at the
    # original size.
    expanded = image.rotate(angle, expand=True, fillcolor='white')
    expanded_width, expanded_height = expanded.size
    crop_left = (expanded_width - width) // 2
    crop_top = (expanded_height - height) // 2
    rotated_image = expanded.crop((crop_left, crop_top, crop_left + width, crop_top + height))

    # The sign is flipped because image y grows downwards while the image
    # rotation angle is counter-clockwise.
    theta = math.radians(-angle)
    cos_t = math.cos(theta)
    sin_t = math.sin(theta)
    centre_x = width / 2
    centre_y = height / 2

    new_bounding_boxes = []
    for box in bounding_boxes:
        # The layout of the first box decides how every box is interpreted.
        is_oriented = len(bounding_boxes[0]) == 8
        if is_oriented:
            corners = [(box[k], box[k + 1]) for k in range(0, len(box), 2)]
        else:
            corners = [(box[0], box[1]), (box[2], box[1]), (box[2], box[3]), (box[0], box[3])]

        rotated_xs = []
        rotated_ys = []
        for x, y in corners:
            dx = x - centre_x
            dy = y - centre_y
            rotated_xs.append((cos_t * dx - sin_t * dy) + centre_x)
            rotated_ys.append((sin_t * dx + cos_t * dy) + centre_y)

        if is_oriented:
            # Keep the four rotated corners, interleaved as x, y.
            flat = []
            for rx, ry in zip(rotated_xs, rotated_ys):
                flat.extend([rx, ry])
            new_bounding_boxes.append(flat)
        else:
            # Collapse to the rectangle enclosing the rotated corners.
            new_bounding_boxes.append(
                (min(rotated_xs), min(rotated_ys), max(rotated_xs), max(rotated_ys))
            )

    return rotated_image, new_bounding_boxes

In [105]:
# Example usage: rotate one page together with its oriented boxes and display the result.
image_path = './data/ds2_dense/images/lg-102414375-aug-beethoven--page-2.png'
obboxes = train_data_agg_final.loc[
    train_data_agg_final['filename'] == 'lg-102414375-aug-beethoven--page-2.png', 'o_bbox'
].values[0]
rotated_image, new_boxes = rotate_image_and_boxes(image_path, obboxes)
show_image_with_boxes(rotated_image, new_boxes)

#### blur + rotation

In [111]:
# Example usage: chain two augmentations - rotate first, then blur the rotated image.
image_path = './data/ds2_dense/images/lg-102414375-aug-beethoven--page-2.png'
obboxes = train_data_agg_final.loc[
    train_data_agg_final['filename'] == 'lg-102414375-aug-beethoven--page-2.png', 'o_bbox'
].values[0]
rotated_image, new_boxes = rotate_image_and_boxes(image_path, obboxes)
# The blur leaves the boxes untouched, so the rotated boxes are drawn as-is.
blurred_image = apply_random_blur(rotated_image)
show_image_with_boxes(blurred_image, new_boxes)

### warp - parallel

In [125]:
def apply_warp_to_image_and_boxes(image_path_or_bin, bounding_boxes, 
                                  horizontal_warp_range=0.05, vertical_warp_range=0.05):
    """Apply a random perspective warp to an image and its bounding boxes.

    The top-right and bottom-left image corners are displaced by random
    fractions of the image size; the resulting perspective transform is
    applied to the image (white border fill) and to the box corners.

    Args:
        image_path_or_bin: Path string read with ``cv2.imread``, or a PIL
            image object.
        bounding_boxes: Sequence of boxes. If the first box has eight values,
            boxes are treated as four (x, y) corners and returned as eight
            warped values. Otherwise boxes are ``(x1, y1, x2, y2)``
            rectangles and the rectangle enclosing all four warped corners is
            returned.
        horizontal_warp_range: Bound of the uniformly drawn horizontal
            displacement, as a fraction of the width.
        vertical_warp_range: Bound of the uniformly drawn vertical
            displacement, as a fraction of the height.

    Returns:
        ``(warped_image, new_bounding_boxes)`` where ``warped_image`` is a
        grayscale (``'L'``) PIL image and each new box is a list.

    Raises:
        FileNotFoundError: If a path is given and ``cv2.imread`` cannot read it.
    """
    # Load the image as a BGR array
    if isinstance(image_path_or_bin, str):
        image = cv2.imread(image_path_or_bin)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"cv2.imread could not read image: {image_path_or_bin}")
    else: # convert PIL to CV2
        rgb_array = np.array(image_path_or_bin.convert('RGB'))
        image = np.ascontiguousarray(rgb_array[:, :, ::-1])

    height, width = image.shape[:2]

    # Randomly select warp ratios within the specified ranges
    horizontal_warp = np.random.uniform(-horizontal_warp_range, horizontal_warp_range)
    vertical_warp = np.random.uniform(-vertical_warp_range, vertical_warp_range)

    # Define source points (original corners of the image)
    src_points = np.float32([[0, 0], [width, 0], [0, height], [width, height]])

    # Define destination points for warping
    dst_points = np.float32([
        [0, 0], 
        [width + horizontal_warp * width, vertical_warp * height], 
        [0, height - vertical_warp * height], 
        [width, height]
    ])

    # Compute the perspective transform matrix
    matrix = cv2.getPerspectiveTransform(src_points, dst_points)

    # Warp the image using the transformation matrix
    warped_bgr = cv2.warpPerspective(image, matrix, (width, height),
                                     borderMode=cv2.BORDER_CONSTANT, borderValue=(255, 255, 255))
    # Reverse the channel order back to RGB before handing the array to PIL,
    # so the grayscale conversion weights the channels correctly.
    warped_image = Image.fromarray(np.ascontiguousarray(warped_bgr[:, :, ::-1])).convert('L')

    # The layout of the first box decides how every box is interpreted.
    oriented = len(bounding_boxes) > 0 and len(bounding_boxes[0]) == 8

    # Adjust bounding boxes
    new_bounding_boxes = []
    for box in bounding_boxes:
        if not oriented and len(box) == 4:
            # A rectangle is not preserved by a perspective warp: all four
            # corners are needed to find the enclosing rectangle afterwards.
            x1, y1, x2, y2 = box
            corners = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
        else:
            corners = [(box[i], box[i + 1]) for i in range(0, len(box), 2)]

        # Transform all corners of the box in one call
        points = np.array(corners, dtype='float32').reshape(-1, 1, 2)
        warped_points = cv2.perspectiveTransform(points, matrix).reshape(-1, 2)

        if oriented:
            new_box = list(warped_points.ravel())
        else:
            # For non-oriented boxes, recalculate to orthogonal bounds
            xs, ys = warped_points[:, 0], warped_points[:, 1]
            new_box = [xs.min(), ys.min(), xs.max(), ys.max()]
        new_bounding_boxes.append(new_box)

    return warped_image, new_bounding_boxes # this is a PIL image

In [127]:
# Example usage: warp one page together with its axis-aligned boxes and display the result.
image_path = './data/ds2_dense/images/lg-102414375-aug-beethoven--page-2.png'
bboxes = train_data_agg_final.loc[
    train_data_agg_final['filename'] == 'lg-102414375-aug-beethoven--page-2.png', 'a_bbox'
].values[0]
warped_image, new_bounding_boxes = apply_warp_to_image_and_boxes(image_path, bboxes)
show_image_with_boxes(warped_image, new_bounding_boxes)

### warp - trapezoidal

In [171]:
def apply_keystone_warp_to_image_and_boxes(image_path_or_bin, bounding_boxes, max_skew_factor=0.05):
    """Apply a random four-corner (keystone) perspective warp to an image and boxes.

    Each image corner is displaced independently in x and y by a random
    fraction of the image size. The resulting perspective transform is applied
    to the image (white border fill) and to the box corners.

    Args:
        image_path_or_bin: Path string read with ``cv2.imread``, or a PIL
            image object.
        bounding_boxes: Sequence of boxes, either ``(x1, y1, x2, y2)``
            rectangles or flat sequences of (x, y) corner values.
        max_skew_factor: Bound of each uniformly drawn corner displacement,
            as a fraction of the width (x) or height (y).

    Returns:
        ``(warped_image_pil, new_bounding_boxes)`` where the image is a
        grayscale (``'L'``) PIL image and every new box is the axis-aligned
        ``[min_x, min_y, max_x, max_y]`` enclosing the warped corners,
        regardless of the input box layout.

    Raises:
        FileNotFoundError: If a path is given and ``cv2.imread`` cannot read it.
    """
    # Load the image as a BGR array
    if isinstance(image_path_or_bin, str):
        image = cv2.imread(image_path_or_bin)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"cv2.imread could not read image: {image_path_or_bin}")
    else: # convert PIL to CV2
        rgb_array = np.array(image_path_or_bin.convert('RGB'))
        image = np.ascontiguousarray(rgb_array[:, :, ::-1])

    height, width = image.shape[:2]

    # Define source points (original corners of the image)
    src_points = np.float32([[0, 0], [width, 0], [0, height], [width, height]])

    # Randomly apply skew factors to each corner
    dst_points = np.float32([
        [np.random.uniform(-max_skew_factor, max_skew_factor) * width, 
         np.random.uniform(-max_skew_factor, max_skew_factor) * height],
        [width + np.random.uniform(-max_skew_factor, max_skew_factor) * width, 
         np.random.uniform(-max_skew_factor, max_skew_factor) * height],
        [np.random.uniform(-max_skew_factor, max_skew_factor) * width, 
         height + np.random.uniform(-max_skew_factor, max_skew_factor) * height],
        [width + np.random.uniform(-max_skew_factor, max_skew_factor) * width, 
         height + np.random.uniform(-max_skew_factor, max_skew_factor) * height]
    ])

    # Compute the perspective transform matrix
    matrix = cv2.getPerspectiveTransform(src_points, dst_points)

    # Warp the image using the transformation matrix
    warped_image = cv2.warpPerspective(image, matrix, (width, height), 
                                       borderMode=cv2.BORDER_CONSTANT, 
                                       borderValue=(255, 255, 255))

    # Convert warped image back to PIL format (BGR -> RGB) and convert to grayscale
    warped_image_pil = Image.fromarray(np.ascontiguousarray(warped_image[:, :, ::-1])).convert('L')

    # Adjust bounding boxes
    new_bounding_boxes = []
    for box in bounding_boxes:
        if len(box) == 4:
            # A rectangle is not preserved by a perspective warp: all four
            # corners are needed to find the enclosing rectangle afterwards.
            x1, y1, x2, y2 = box
            corners = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
        else:
            corners = [(box[i], box[i + 1]) for i in range(0, len(box), 2)]

        # Transform all corners of the box in one call
        points = np.array(corners, dtype='float32').reshape(-1, 1, 2)
        warped_points = cv2.perspectiveTransform(points, matrix).reshape(-1, 2)

        # Recalculate the bounding box to contain all points
        xs, ys = warped_points[:, 0], warped_points[:, 1]
        new_bounding_boxes.append([xs.min(), ys.min(), xs.max(), ys.max()])

    return warped_image_pil, new_bounding_boxes

In [177]:
# Example usage: keystone-warp one page with its oriented boxes and display the result.
image_path = './data/ds2_dense/images/lg-102414375-aug-beethoven--page-2.png'
obboxes = train_data_agg_final.loc[
    train_data_agg_final['filename'] == 'lg-102414375-aug-beethoven--page-2.png', 'o_bbox'
].values[0]
twarped_image, new_bounding_boxes = apply_keystone_warp_to_image_and_boxes(image_path, obboxes)
show_image_with_boxes(twarped_image, new_bounding_boxes)