## Simply Convert Data to COCO Format

- Split the training data into train and validation sets
- Convert both to COCO-formatted JSON

In [None]:
# Mount Google Drive into the Colab runtime; the data paths used below live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Some basic setup:
# import some common libraries
import numpy as np
import pandas as pd
import os
import json
import random
import cv2
import matplotlib.pyplot as plt
import ast
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Dataset locations on the mounted Drive. Every path keeps its trailing slash
# so file names can be appended directly.
TOP_INPUT_DIR = "/content/drive/My Drive/GWD/global-wheat-detection/"
DATA_TRAIN_DIR = TOP_INPUT_DIR + "train/"
DATA_TEST_DIR = TOP_INPUT_DIR + "test/"

### Create COCO Base

In [None]:
# Empty skeleton with the five top-level COCO sections; the cells below fill them in.
coco_base = dict(
    info={},
    licenses=[],
    images=[],
    annotations=[],
    categories=[],
)

### Write dataset info to COCO Format 

In [None]:
# Dataset-level metadata stored under the "info" key.
coco_base["info"] = dict(
    description="Global Wheat Detection Dataset, Kaggle 2020",
    url="https://www.kaggle.com/c/global-wheat-detection/data",
    version="1.0",
    year=2020,
    contributor="http://www.global-wheat.com/contributors/",
    date_created="2020/05/29",
)

### Write licenses to COCO Format

In [None]:
# Assign the list instead of appending to it, so re-running this cell can never
# register the same license twice. Image entries refer to this license by id 1.
coco_base["licenses"] = [
    {
        "url": "https://opensource.org/licenses/MIT",
        "id": 1,
        "name": "MIT License"
    }
]

### Write category to COCO Format

In [None]:
# Single detection class. Assigned (not appended) so re-running the cell cannot
# create a duplicate category; annotations refer to it by category id 1.
coco_base["categories"] = [{"supercategory": "grain", "id": 1, "name": "wheat"}]

In [None]:
# Load the annotation table (one row per bounding box) and preview it.
train_df = pd.read_csv(TOP_INPUT_DIR + 'train.csv')
train_df.head()

Unnamed: 0,image_id,width,height,bbox,source
0,b6ab77fd7,1024,1024,"[834.0, 222.0, 56.0, 36.0]",usask_1
1,b6ab77fd7,1024,1024,"[226.0, 548.0, 130.0, 58.0]",usask_1
2,b6ab77fd7,1024,1024,"[377.0, 504.0, 74.0, 160.0]",usask_1
3,b6ab77fd7,1024,1024,"[834.0, 95.0, 109.0, 107.0]",usask_1
4,b6ab77fd7,1024,1024,"[26.0, 144.0, 124.0, 117.0]",usask_1


In [None]:
# Distinct image ids present in the annotation table, followed by their count.
uniq_images = train_df["image_id"].unique()
len(uniq_images)

3373

#### There are 3373 unique images in the training set

In [None]:
# Printing only the first unique value would hide any image with a different
# size, so first assert that exactly one width and one height occur. The COCO
# image entries written later record a single fixed size for every image.
assert train_df.width.nunique() == 1, "Images have more than one width"
assert train_df.height.nunique() == 1, "Images have more than one height"
print(train_df.width.unique()[0])
print(train_df.height.unique()[0])

1024
1024


#### All images are of size 1024x1024

### Group bboxes by image

Create one row per image - combining all bboxes for that image into one column. This helps significantly reduce the number of rows to traverse.

In [None]:
def get_bboxes_per_image(df):
    """Collapse a one-row-per-box table into one row per image.

    author: @impiyush

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``image_id`` column and a ``bbox`` column. Each ``bbox`` value
        is either the string form of a list (as read from the CSV) or an
        already-parsed list.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_id`` and ``bboxes``; ``bboxes`` holds the list of all
        boxes for that image. Rows are ordered by ``image_id`` (groupby default).

    Notes
    -----
    The input frame is left untouched, and the grouping runs on the ``df``
    argument rather than on a notebook-level global, so the function can be
    re-run and reused on any frame.
    """
    # Parse string boxes; values that are already lists pass through unchanged,
    # which keeps a second call on parsed data from raising.
    parsed_boxes = df['bbox'].apply(
        lambda box: ast.literal_eval(box) if isinstance(box, str) else box
    )
    # assign() returns a new frame, so the caller's `bbox` column is not modified.
    return (
        df.assign(bbox=parsed_boxes)
        .groupby('image_id')['bbox']
        .apply(list)
        .reset_index(name='bboxes')
    )

In [None]:
# Group the box-level table into one row per image and preview the result.
train_df_bboxes_grped = get_bboxes_per_image(train_df)
train_df_bboxes_grped.head()

Unnamed: 0,image_id,bboxes
0,00333207f,"[[0, 654, 37, 111], [0, 817, 135, 98], [0, 192..."
1,005b0d8bb,"[[765.0, 879.0, 116.0, 79.0], [84.0, 539.0, 15..."
2,006a994f7,"[[437.0, 988.0, 98.0, 36.0], [309.0, 527.0, 11..."
3,00764ad5d,"[[89.0, 256.0, 113.0, 107.0], [216.0, 282.0, 1..."
4,00b5fefed,"[[709.0, 97.0, 204.0, 105.0], [775.0, 250.0, 1..."


In [None]:
# Grouping must yield exactly one row per distinct image id.
assert len(train_df_bboxes_grped) == len(uniq_images), "Number of images differ when grouped"

### Split to train and validation

In [None]:
# Hold out 5% of the images for validation. The fixed random_state makes the
# shuffle repeatable, and each part gets a fresh 0..n-1 index.
df_train, df_valid = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train_df_bboxes_grped,
        test_size=0.05,
        random_state=32,
        shuffle=True,
    )
)

In [None]:
# Number of images in the train and validation splits.
print(len(df_train), len(df_valid))

3204 169


### Write images to COCO Format

In [None]:
# One top-level dict per split. These are shallow copies: the nested values
# (e.g. the "licenses" and "categories" lists) are the same objects as in
# coco_base, so replace per-split sections by assignment rather than mutating
# them in place.
coco_base_train = dict(coco_base)
coco_base_valid = dict(coco_base)

In [None]:
def set_coco_images(df, height=1024, width=1024, license_id=1):
    """Build the COCO ``images`` entries for the images listed in ``df``.

    author: @impiyush

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``image_id`` column. Repeated ids produce a single entry.
    height, width : int, optional
        Pixel size written into every entry. Both default to 1024, the value
        this function previously hard-coded.
    license_id : int, optional
        Value written to each entry's ``license`` field. Defaults to 1.

    Returns
    -------
    list of dict
        One dict per distinct image id with keys ``license``, ``height``,
        ``width``, ``id`` and ``file_name`` (``"<image_id>.jpg"``).
    """
    return [
        {
            "license": license_id,
            "height": height,
            "width": width,
            "id": image_id,
            "file_name": f"{image_id}.jpg",
        }
        for image_id in tqdm(df.image_id.unique())
    ]

In [None]:
# Fill the "images" section of each split from that split's own dataframe.
coco_base_train["images"] = set_coco_images(df_train)
coco_base_valid["images"] = set_coco_images(df_valid)

100%|██████████| 3204/3204 [00:00<00:00, 186602.47it/s]
100%|██████████| 169/169 [00:00<00:00, 314061.75it/s]


Let's check the first three images in the list

In [None]:
# First three image entries of the training split.
coco_base_train["images"][:3]

[{'file_name': '75791012a.jpg',
  'height': 1024,
  'id': '75791012a',
  'license': 1,
  'width': 1024},
 {'file_name': 'a586f39dd.jpg',
  'height': 1024,
  'id': 'a586f39dd',
  'license': 1,
  'width': 1024},
 {'file_name': '2c534b9b6.jpg',
  'height': 1024,
  'id': '2c534b9b6',
  'license': 1,
  'width': 1024}]

In [None]:
# First three image entries of the validation split.
coco_base_valid["images"][:3]

[{'file_name': 'ab8fa9772.jpg',
  'height': 1024,
  'id': 'ab8fa9772',
  'license': 1,
  'width': 1024},
 {'file_name': '3f8f6b1a1.jpg',
  'height': 1024,
  'id': '3f8f6b1a1',
  'license': 1,
  'width': 1024},
 {'file_name': 'f1ecbf9d4.jpg',
  'height': 1024,
  'id': 'f1ecbf9d4',
  'license': 1,
  'width': 1024}]

Let's also verify that the number of images matches, just for sanity

In [None]:
# Each split dataframe has one row per image, so its length must equal the number of image entries.
assert len(coco_base_train["images"])==len(df_train), "Number of images differ from df_train"
assert len(coco_base_valid["images"])==len(df_valid), "Number of images differ from df_valid"

### Write annotations to COCO Format

The most important piece of this puzzle

In [None]:
def set_coco_annotations(df):
    """Flatten the per-image box lists into COCO ``annotations`` entries.

    author: @impiyush

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``image_id`` column and a ``bboxes`` column, where each
        ``bboxes`` value is a list of ``[x, y, w, h]`` boxes.

    Returns
    -------
    list of dict
        One dict per box with keys ``segmentation`` (empty list), ``iscrowd``
        (0), ``image_id``, ``category_id`` (1), ``bbox``, ``area`` (w * h) and
        ``id``. Ids count up from 1 across the whole frame and are stored as
        zero-padded strings of at least five digits.
    """
    annotations = []

    for _, record in tqdm(df.iterrows(), total=len(df)):
        owner_id = record['image_id']
        for box in record['bboxes']:  # box is x,y,w,h
            # A fresh dict per box, so entries never alias one another.
            annotations.append({
                'segmentation': [],
                'iscrowd': 0,
                'image_id': owner_id,
                'category_id': 1,
                'bbox': box,
                'area': box[2] * box[3],  # w*h
                # NOTE(review): ids are strings here; confirm the downstream
                # COCO consumer accepts non-integer annotation ids.
                'id': f"{len(annotations) + 1:05}",
            })

    return annotations

In [None]:
# Fill the "annotations" section of each split; every call numbers its annotation ids from 1.
coco_base_train['annotations'] = set_coco_annotations(df_train)
coco_base_valid['annotations'] = set_coco_annotations(df_valid)

100%|██████████| 3204/3204 [00:00<00:00, 4443.25it/s]
100%|██████████| 169/169 [00:00<00:00, 4264.86it/s]


### Dump COCO formatted JSON for train and validation sets

In [None]:
# Write the training split to Drive as one JSON document.
with open('/content/drive/My Drive/GWD/coco_train.json', 'w') as out_file:
    json.dump(coco_base_train, out_file)

In [None]:
# Write the validation split to Drive as one JSON document.
with open('/content/drive/My Drive/GWD/coco_valid.json', 'w') as out_file:
    json.dump(coco_base_valid, out_file)

In [None]:
# Dumping None would leave coco_test.json containing only `null`, which no COCO
# reader can use. Build an images-only COCO dict from the files in the test
# folder instead (there are no labels for the test set, so "annotations" stays empty).
# NOTE(review): assumes test images are .jpg files of the same size that
# set_coco_images records by default -- confirm against the test folder.
test_image_ids = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(DATA_TEST_DIR)
    if file_name.lower().endswith(".jpg")
)
coco_base_test = coco_base.copy()
coco_base_test["images"] = set_coco_images(pd.DataFrame({"image_id": test_image_ids}))
coco_base_test["annotations"] = []

with open('/content/drive/My Drive/GWD/coco_test.json', 'w') as test_coco:
    json.dump(coco_base_test, test_coco)

### There you go, now you have your data in COCO Format. Hope you liked this quick Kernel!