# Convert Data into COCO format

In [1]:
import os
import re
import cv2
import json

import numpy as np
import pandas as pd

from joblib import Parallel, delayed

from tqdm import tqdm
from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()

## Read and Split Data

In [2]:
## get Splits (train - small or large - vs test)
test_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/test.csv")
train_small_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/train_small.csv")
train_large_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/train_large.csv")

In [3]:
## read labelled data
data = pd.read_parquet("/scratch/students/danae/data/modelling_data/data.parquet.gzip")

In [4]:
# get pages of splits
pages_test = test_sample["page_id"].drop_duplicates()
print(len(pages_test))

pages_train_small = train_small_sample["page_id"].drop_duplicates()
print(len(pages_train_small))

pages_train_large = train_large_sample["page_id"].drop_duplicates()
print(len(pages_train_large))

5000
50000
150000


In [5]:
# split the dataset containing bbox labels
data_test = data[data["page_id"].isin(pages_test)]
pages_test = data_test["page_id"].drop_duplicates()
print(len(pages_test))

data_train_small = data[data["page_id"].isin(pages_train_small)]
pages_train_small = data_train_small["page_id"].drop_duplicates()
print(len(pages_train_small))

data_train_large = data[data["page_id"].isin(pages_train_large)]
pages_train_large = data_train_large["page_id"].drop_duplicates()
print(len(pages_train_large))

4109
41302
123408


In [6]:
# create train-validation splits
validation_prop = 0.2

# small
pages_validation_small = pages_train_small.sample(frac=validation_prop, replace=False).to_list()
pages_train_small = [p for p in pages_train_small if p not in pages_validation_small]
print(len(pages_validation_small)), print(len(pages_train_small)), print(len(pages_validation_small) + len(pages_train_small))

# large
pages_validation_large = pages_train_large.sample(frac=validation_prop, replace=False).to_list()
pages_train_large = [p for p in pages_train_large if p not in pages_validation_large]
print(len(pages_validation_large)), print(len(pages_train_large)), print(len(pages_validation_large) + len(pages_train_large))

8260
33042
41302
24682
98726
123408


(None, None, None)

## Convert to COCO

In [None]:
# pages_test
# pages_train_small, pages_validation_small
# pages_train_large, pages_validation_large

In [51]:
coco_fn = "coco_annotations_test"

coco_df = data[data["page_id"].isin(pages_test)][["page_id", "journal", "iiif_img_base_uri", "bbx_paragraph", "lines_tokens", "ci_id", "ci_tp"]].reset_index(drop=True)
len(coco_df)

84375

In [52]:
coco_df["supercategory"] = coco_df["ci_tp"]
coco_df = coco_df.rename(
    columns={"ci_tp": "category", "bbx_paragraph": "bbox"}
)

In [53]:
# set page ids
page_ids_df = coco_df[["page_id"]].drop_duplicates().reset_index(drop=True)
page_ids_df["id"] = page_ids_df.index
page_ids_df.head(5)

Unnamed: 0,page_id,id
0,actionfem-1930-01-15-a-p0009,0
1,actionfem-1930-10-15-a-p0012,1
2,actionfem-1931-08-15-a-p0017,2
3,actionfem-1932-12-15-a-p0014,3
4,actionfem-1933-09-15-a-p0010,4


In [54]:
# set category ids
category_ids_df = coco_df[["category"]].drop_duplicates().reset_index(drop=True)
category_ids_df["id"] = category_ids_df.index
category_ids_df

Unnamed: 0,category,id
0,article_title,0
1,death_notice,1
2,table,2
3,image,3
4,image_caption,4
5,ad,5
6,weather,6
7,article,7
8,earrings,8


In [55]:

dims_df = pd.read_csv("/scratch/students/danae/data/page_dimensions/page_dims_test.csv") ####### Carefull if not test !!!
dims_df

Unnamed: 0,file_name,img_height,image_width
0,jdpl-1815-07-05-a-p0001.jpg,3936,2473
1,jdpl-1820-07-08-a-p0002.jpg,3993,2572
2,jdpl-1823-02-12-a-p0003.jpg,3998,2581
3,jdpl-1823-02-08-a-p0002.jpg,3995,2587
4,jdpl-1821-10-28-a-p0002.jpg,3994,2535
...,...,...,...
4975,indeplux-1875-10-22-a-p0002.jpg,5711,3944
4976,lepetitparisien-1922-04-17-a-p0002.jpg,6855,4957
4977,indeplux-1924-08-28-a-p0002.jpg,6767,4711
4978,lepetitparisien-1906-08-26-a-p0004.jpg,6827,4899


In [56]:
# get dimensions of images
images_df = page_ids_df.merge(data[["page_id", "iiif_img_base_uri"]], on="page_id")
images_df = images_df.rename(columns={"page_id": "file_name"})
images_df["iiif_img_base_uri"] = images_df["iiif_img_base_uri"].apply(lambda x: x+"/full/full/0/default.jpg")
images_df["file_name"] = images_df["file_name"].apply(lambda x: x+".jpg")

dims_test_df = pd.read_csv("/scratch/students/danae/data/page_dimensions/page_dims_test.csv")
dims_train_small_df = pd.read_csv("/scratch/students/danae/data/page_dimensions/page_dims_train_small.csv")
dims_train_large_df = pd.read_csv("/scratch/students/danae/data/page_dimensions/page_dims_train_large.csv")

dims_df = pd.concat([dims_test_df, dims_train_small_df, dims_train_large_df]).drop_duplicates()
print(len(dims_df))

images_df = pd.merge(dims_df, images_df, on="file_name")
images_df = images_df.drop(columns=["iiif_img_base_uri"])
# images_df = images_df.drop(columns=["file_name"])
# images_df = images_df.rename(columns={"iiif_img_base_uri": "file_name"})
images_df = images_df.drop_duplicates()
images_df

154551


Unnamed: 0,file_name,img_height,image_width,id
0,jdpl-1815-07-05-a-p0001.jpg,3936,2473,494
8,jdpl-1820-07-08-a-p0002.jpg,3993,2572,496
15,jdpl-1823-02-12-a-p0003.jpg,3998,2581,499
20,jdpl-1823-02-08-a-p0002.jpg,3995,2587,498
25,jdpl-1821-10-28-a-p0002.jpg,3994,2535,3962
...,...,...,...,...
84010,courriergdl-1857-02-13-a-p0003.jpg,4795,3260,188
84021,indeplux-1875-10-22-a-p0002.jpg,5711,3944,348
84036,lepetitparisien-1922-04-17-a-p0002.jpg,6855,4957,1084
84089,lepetitparisien-1906-08-26-a-p0004.jpg,6827,4899,1061


In [57]:
# process unique categories
categories_df = coco_df[["supercategory", "category"]]
categories_df = categories_df.merge(category_ids_df, on="category").drop_duplicates()
categories_df = categories_df.rename(
    columns={"category": "name"}
)

categories_df

Unnamed: 0,supercategory,name,id
0,article_title,article_title,0
622,death_notice,death_notice,1
19580,table,table,2
20456,image,image,3
20457,image_caption,image_caption,4
21166,ad,ad,5
28104,weather,weather,6
28109,article,article,7
81471,earrings,earrings,8


In [58]:
# create a mapping from page_id to id
page_to_id = dict(zip(page_ids_df["page_id"], page_ids_df["id"]))

# create a mapping from category to id
category_to_id = dict(zip(category_ids_df["category"], category_ids_df["id"]))

# generate annotations in parallel
annotations_df = coco_df[["page_id", "category", "bbox"]]

# map "page_id" and "category" to ids using the mapping
annotations_df["image_id"] = annotations_df["page_id"].map(page_to_id)
annotations_df["category_id"] = annotations_df["category"].map(category_to_id)
annotations_df["area"] = annotations_df["bbox"].apply(lambda x: x[2] * x[3])  # width * height
annotations_df["iscrowd"] = 0
annotations_df["id"] = range(1, len(annotations_df) + 1) # to check
annotations_df["bbox"] = annotations_df["bbox"].map(list) #.astype(str) # array to list
annotations_df["segmentation"] = annotations_df["bbox"].apply(lambda x: [[x[0], x[1], x[0]+x[2], x[1], x[0]+x[2], x[1]+x[3], x[0], x[1]+x[3]]]).map(list) #.astype(str) # array to list

annotations_df = annotations_df.drop(columns=["page_id", "category"])
annotations_df.sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_df["image_id"] = annotations_df["page_id"].map(page_to_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_df["category_id"] = annotations_df["category"].map(category_to_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_df["area"] = annotations_df["bbox"].apply(l

Unnamed: 0,bbox,image_id,category_id,area,iscrowd,id,segmentation
83491,"[89, 94, 3292, 78]",2322,8,256776,0,83492,"[[89, 94, 3381, 94, 3381, 172, 89, 172]]"
4717,"[3131, 1863, 1158, 163]",743,0,188754,0,4718,"[[3131, 1863, 4289, 1863, 4289, 2026, 3131, 20..."
25654,"[2972, 1422, 429, 327]",1887,5,140283,0,25655,"[[2972, 1422, 3401, 1422, 3401, 1749, 2972, 17..."


In [59]:
coco_format = {
    "images": images_df.to_dict(orient="records"),
    "annotations": annotations_df.to_dict(orient="records"),
    "categories": categories_df.to_dict(orient="records"),
}
coco_format

{'images': [{'file_name': 'jdpl-1815-07-05-a-p0001.jpg',
   'img_height': 3936,
   'image_width': 2473,
   'id': 494},
  {'file_name': 'jdpl-1820-07-08-a-p0002.jpg',
   'img_height': 3993,
   'image_width': 2572,
   'id': 496},
  {'file_name': 'jdpl-1823-02-12-a-p0003.jpg',
   'img_height': 3998,
   'image_width': 2581,
   'id': 499},
  {'file_name': 'jdpl-1823-02-08-a-p0002.jpg',
   'img_height': 3995,
   'image_width': 2587,
   'id': 498},
  {'file_name': 'jdpl-1821-10-28-a-p0002.jpg',
   'img_height': 3994,
   'image_width': 2535,
   'id': 3962},
  {'file_name': 'jdpl-1814-05-21-a-p0002.jpg',
   'img_height': 3960,
   'image_width': 2454,
   'id': 493},
  {'file_name': 'jdpl-1819-01-08-a-p0002.jpg',
   'img_height': 4000,
   'image_width': 2428,
   'id': 3961},
  {'file_name': 'jdpl-1823-02-12-a-p0004.jpg',
   'img_height': 3995,
   'image_width': 2586,
   'id': 500},
  {'file_name': 'jdpl-1819-01-08-a-p0001.jpg',
   'img_height': 3999,
   'image_width': 2437,
   'id': 495},
  {'fil

In [60]:
coco_format.keys()

dict_keys(['images', 'annotations', 'categories'])

In [61]:
def convert_numpy(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()  # Convert arrays to lists
    raise TypeError(f"Type {type(obj)} not serializable")

# save to JSON
with open(f"/scratch/students/danae/data/model_data_format/yolo/{coco_fn}.json", "w") as f:
    json.dump(coco_format, f, indent=1, default=convert_numpy)