# Merging dataset annotations

We first want to take from the newspaper dataset only the images that are also useful for training the model with our goal in mind. That means we only keep newspapers that have at least one of the following labels: Photograph (0) or Comics/cartoon (3). We then rename all these annotations as just 'Illustration', to stay in line with our dataset.

In [134]:
import json
import random

We start by loading the training data from the newspaper navigator project.

In [5]:
# Load newspaper annotations.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('data/trainval.json') as f:
    data = json.load(f)
# Notebook display: top-level sections of the loaded file
data.keys()

dict_keys(['info', 'licenses', 'categories', 'images', 'annotations'])

We first need to adapt their data to fit our needs: only keep the appropriate images and change their labels to 0.

In [27]:
# To change the label to 0
def replace_label(x):
    """Set ``x['category_id']`` to 0 in place and return ``x``.

    Args:
        x: Annotation mapping. It is modified in place.

    Returns:
        The same object ``x`` after the update, or an empty dict when ``x``
        does not support item assignment (for example ``None``).
    """
    try:
        x['category_id'] = 0
    except TypeError:
        # Best-effort fallback: an unusable entry becomes an empty dict.
        # Only the failure that item assignment raises is caught, so that
        # KeyboardInterrupt and unrelated errors still surface.
        return dict()
    return x

In [44]:
# Only keep annotations with labels 0 or 3, relabelling each kept one to 0
new_annotations = [replace_label(x) for x in data['annotations'] if x['category_id'] in (0, 3)]
# Only keep images which have these annotations.
# Membership is tested against a set so that filtering the images stays
# linear instead of scanning a list of ids once per image.
kept_image_ids = {x['image_id'] for x in new_annotations}
images_to_keep = list(kept_image_ids)
new_images = [im for im in data['images'] if im['id'] in kept_image_ids]
# One big new category : Illustration (instead of 7)
new_categories = [{'id': 0, 'name': 'Illustration', 'supercategory': 'Content'}]

In [46]:
# Build the filtered dataset: a shallow copy of the loaded data in which the
# categories, annotations and images sections are swapped for the new ones
new_data = {
    **data,
    'categories': new_categories,
    'annotations': new_annotations,
    'images': new_images,
}

Now we want to merge the small batches we annotated.

In [114]:
# Load our 4 batches of annotations.
# Each file is opened in a context manager so its handle is closed right
# after parsing instead of staying open for the rest of the session.
with open('data/batch1.json') as f:
    data1 = json.load(f)

with open('data/batch2.json') as f:
    data2 = json.load(f)

with open('data/batch3.json') as f:
    data3 = json.load(f)

with open('data/batch4.json') as f:
    data4 = json.load(f)

In [115]:
# Function to replace the id of the image
def replace_id_images(x, im_offset):
    """Shift the ``'id'`` of an image entry by ``im_offset``, in place.

    Args:
        x: Image mapping holding an ``'id'``. It is modified in place.
        im_offset: Value added to ``x['id']``.

    Returns:
        ``x`` itself after the update, or an empty dict when ``x`` has no
        ``'id'`` key or the lookup/addition fails with a type error.
    """
    try:
        x['id'] = x['id'] + im_offset
    except (KeyError, TypeError):
        # Best-effort fallback: an unusable entry becomes an empty dict.
        # Only the expected lookup/type failures are swallowed, so that
        # KeyboardInterrupt and unrelated errors still surface.
        return dict()
    return x

In [116]:
# Function to replace the ids in the annotation part of the json
def replace_ids_annotations(x, im_offset, an_offset):
    """Shift an annotation's ``'id'`` and ``'image_id'``, in place.

    Args:
        x: Annotation mapping holding ``'id'`` and ``'image_id'``. It is
            modified in place.
        im_offset: Value added to ``x['image_id']``.
        an_offset: Value added to ``x['id']``.

    Returns:
        ``x`` itself after the update, or an empty dict when either key is
        missing or a lookup/addition fails with a type error. In the failure
        case ``x`` is left unmodified.
    """
    try:
        # Compute both new values before writing either one, so that a
        # failure on the second key cannot leave x half-updated.
        shifted_id = x['id'] + an_offset
        shifted_image_id = x['image_id'] + im_offset
        x['id'] = shifted_id
        x['image_id'] = shifted_image_id
    except (KeyError, TypeError):
        # Best-effort fallback: an unusable entry becomes an empty dict.
        # Only the expected lookup/type failures are swallowed.
        return dict()
    return x

In [117]:
# Function to merge two COCO files, changing overlapping indices
def merge_two_files(f1, f2):
    """Merge two annotation dicts into a new one, re-indexing the second.

    The images and annotations of ``f2`` are appended after those of ``f1``.
    Their ids are shifted by (largest id in ``f1``) + 1 so they do not
    collide with ``f1``'s ids, assuming ``f2``'s ids are non-negative.
    Every other top-level key of the result comes from ``f1``.

    Args:
        f1: Dict with ``'images'`` and ``'annotations'`` lists. Its entries
            are reused unchanged.
        f2: Dict with the same two lists. Its entries are expected to be
            mappings; they are copied before re-indexing, so ``f2`` itself
            is left untouched.

    Returns:
        A shallow copy of ``f1`` whose ``'images'`` and ``'annotations'``
        hold the merged lists.
    """
    # First free ids after those of the first file. default=-1 gives an
    # offset of 0 when f1 has no images or no annotations, where max() on an
    # empty sequence would otherwise raise ValueError.
    im_offset = max((im['id'] for im in f1['images']), default=-1) + 1
    an_offset = max((an['id'] for an in f1['annotations']), default=-1) + 1

    # Re-index shallow copies: the helpers edit their argument in place, and
    # working on f2's own dicts would shift its ids again on every call.
    shifted_imgs = [replace_id_images(dict(im), im_offset) for im in f2['images']]
    shifted_annos = [
        replace_ids_annotations(dict(an), im_offset, an_offset)
        for an in f2['annotations']
    ]

    # New merged data file
    merged = f1.copy()
    merged['annotations'] = [*f1['annotations'], *shifted_annos]
    merged['images'] = [*f1['images'], *shifted_imgs]

    return merged

In [118]:
# Fold the four batches together one at a time, keeping every intermediate
# result: r1 = batches 1-2, r2 = batches 1-3, r3 = batches 1-4
partial_merges = []
merged_so_far = data1
for next_batch in (data2, data3, data4):
    merged_so_far = merge_two_files(merged_so_far, next_batch)
    partial_merges.append(merged_so_far)
r1, r2, r3 = partial_merges

In [121]:
# Append our merged batches (r3) after the filtered newspaper data; the
# remaining top-level sections of the result come from new_data
data_final = merge_two_files(f1=new_data, f2=r3)

Now that we have all the annotations, we slightly modify the information about the dataset, and again make sure that all the annotation labels are 0.

In [123]:
# Change all labels to 0
data_final['annotations'] = [replace_label(x) for x in data_final['annotations']]
# Change info.
# A new dict is assigned instead of editing the existing one in place: the
# dataset dicts are shallow copies of each other, so data_final['info'] is
# still the very object held by the loaded newspaper data, and an in-place
# edit would silently rewrite that one as well.
data_final['info'] = {
    **data_final['info'],
    'description': 'Modified Beyond Words Dataset (verified) + DFKV',
    'URL': 'https://github.com/dfk-paris/DFKV-illustrations/tree/main/3_illustration_detection',
    'year': 2022,
    'contributor': 'LC Labs + DFK Paris',
    'date_created': '15-03-2022',
}

We save the file

In [131]:
# Serialise the merged dataset first, then write it out: the target file is
# only opened (and truncated) once json.dumps has succeeded.
json_string = json.dumps(data_final)
with open('data/anno_complete.json', 'w') as outfile:
    outfile.write(json_string)

And split it into an 80%-20% cut for the training and test sets.

In [146]:
# Split into train and test sets: 80% of the image ids are drawn at random
# for training, the rest go to the test set.
# NOTE(review): no seed is set in the visible code, so the split presumably
# differs from run to run -- confirm whether that is intended.
imgs_ids = [im['id'] for im in data_final['images']]
train_ids = random.sample(imgs_ids, k=round(len(imgs_ids) * 0.8))
# Set view of the sampled ids: constant-time membership tests below instead
# of scanning the list once per image / annotation
train_id_set = set(train_ids)
train_imgs = [im for im in data_final['images'] if im['id'] in train_id_set]
test_imgs = [im for im in data_final['images'] if im['id'] not in train_id_set]
train_annos = [an for an in data_final['annotations'] if an['image_id'] in train_id_set]
test_annos = [an for an in data_final['annotations'] if an['image_id'] not in train_id_set]

In [147]:
# Each split is a shallow copy of the full dataset with only its images and
# annotations sections replaced by the matching subset
train_data = {**data_final, 'images': train_imgs, 'annotations': train_annos}
test_data = {**data_final, 'images': test_imgs, 'annotations': test_annos}

We finally save the files

In [148]:
# Write the train split, then the test split; each dataset is serialised
# before its target file is opened
for split_data, split_path in (
    (train_data, 'data/train_annos.json'),
    (test_data, 'data/test_annos.json'),
):
    json_string = json.dumps(split_data)
    with open(split_path, 'w') as outfile:
        outfile.write(json_string)