# Filter Data
In this section we will fix the dataset (correcting the size mismatch) and prepare it to be fed into our model for training.

In [1]:
import warnings
warnings.filterwarnings("ignore")

# import some common libraries
import numpy as np
import pandas as pd
import json
from tqdm.notebook import tqdm

# utilities
from pprint import pprint # For beautiful print!
from collections import OrderedDict
import os
import json
import shutil

# For data visualisation
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import cv2

# For reading annotations file
from pycocotools.coco import COCO

## Load data

In [2]:
# Locations of the annotation files and image folders for each split
TRAIN_ANNOTATIONS_PATH = "../bigdata/train/new_ann.json"
TRAIN_IMAGE_DIRECTIORY = "../bigdata/train/images/"

VAL_ANNOTATIONS_PATH = "../bigdata/val/new_ann.json"
VAL_IMAGE_DIRECTIORY = "../bigdata/val/images/"

# Parse both annotation files into plain Python dictionaries
with open(TRAIN_ANNOTATIONS_PATH) as train_ann_file:
    train_annotations_data = json.load(train_ann_file)

with open(VAL_ANNOTATIONS_PATH) as val_ann_file:
    val_annotations_data = json.load(val_ann_file)

# Build a pycocotools handle on the training annotations as well
train_coco = COCO(TRAIN_ANNOTATIONS_PATH)

loading annotations into memory...
Done (t=1.39s)
creating index...
index created!


Remove the space at the end of the category names if they have one.

In [3]:
# Trim leading/trailing whitespace from every readable category name in the
# two raw annotation dictionaries and in the COCO handle's dataset.
for category_list in (
    train_annotations_data['categories'],
    val_annotations_data['categories'],
    train_coco.dataset['categories'],
):
    for category in category_list:
        category['name_readable'] = category['name_readable'].strip()

## Filter data

Create a dictionary (key: name_readable, value: id) of all categories, then collect the ids of the food categories that belong in a kid's diet.

In [4]:
# Load every category record known to the training annotations
category_ids = train_coco.loadCats(train_coco.getCatIds())
category_names = [record["name_readable"] for record in category_ids]

# Map each (stripped) readable category name to its category id
all_category_dict = {record["name_readable"].strip(): record["id"] for record in category_ids}

print(all_category_dict)

# Read the kid-friendly category names, one per line, from foods-for-kids.txt
with open('files/foods-for-kids.txt', "r") as kids_file:
    kids_category_list = [row.strip() for row in kids_file]

# Keep only the ids whose readable name appears in the kids list
kids_category_ids = {
    category_id
    for readable_name, category_id in all_category_dict.items()
    if readable_name in kids_category_list
}

print(len(kids_category_ids))
print(kids_category_ids)

{'Bread, wholemeal': 1565, 'Jam': 2099, 'Water': 2578, 'Bread, sourdough': 1556, 'Banana': 1154, 'Soft cheese': 1352, 'Ham, raw': 1893, 'Hard cheese': 1310, 'Cottage cheese': 1264, 'Bread, half white': 1536, 'Coffee, with caffeine': 2512, 'Fruit salad': 1166, 'Pancakes': 2949, 'Tea': 2498, 'Salmon, smoked': 2973, 'Avocado': 1056, 'Spring onion / scallion': 1111, 'Ristretto, with caffeine': 2524, 'Ham': 1886, 'Egg': 2022, 'Bacon, frying': 1915, 'Chips, french fries': 1013, 'Juice, apple': 2446, 'Chicken': 1788, 'Tomato, raw': 1069, 'Broccoli': 1085, 'Shrimp, boiled': 259, 'Beetroot, steamed, without addition of salt': 50, 'Carrot, raw': 1078, 'Chickpeas': 1143, 'French salad dressing': 2743, 'Pasta, Hörnli': 1487, 'Sauce, cream': 2730, 'Meat balls': 8025, 'Pasta': 1483, 'Tomato sauce': 2738, 'Cheese': 1311, 'Pear': 1157, 'Cashew nut': 1213, 'Almonds': 1210, 'Lentils': 1144, 'Mixed vegetables': 1022, 'Peanut butter': 1203, 'Apple': 1151, 'Blueberries': 1169, 'Cucumber': 1061, 'Cocoa powd

Save dictionary to a text file.

In [5]:
# Serialise the full name -> id mapping as JSON into a text file
with open('files/foods-for-kids-dict.txt', 'w') as dict_file:
    json.dump(all_category_dict, dict_file)

#### Filter data functions

In [6]:
def get_filtered_category_ids(data, kids_category_ids):
    """
    Collect the ids of the categories in data['categories'] that are also in kids_category_ids.

    Input:
    data: annotations dictionary; data['categories'] is a list of dictionaries with an 'id' key
    kids_category_ids: collection of category ids to keep

    Returns:
    filtered_category_ids: set of category ids present in both data['categories'] and kids_category_ids
    """

    return {
        entry['id']
        for entry in data['categories']
        if entry['id'] in kids_category_ids
    }

In [7]:
def get_filtered_annotation_ids(data, kids_category_ids):
    """
    Collect the ids of the annotations in data['annotations'] whose category is in kids_category_ids.

    Input:
    data: annotations dictionary; data['annotations'] is a list of dictionaries with 'id' and 'category_id' keys
    kids_category_ids: collection of category ids to keep

    Returns:
    filtered_annotation_ids: set of annotation ids whose 'category_id' is in kids_category_ids
    """

    return {
        entry['id']
        for entry in data['annotations']
        if entry['category_id'] in kids_category_ids
    }

In [8]:
def get_filtered_image_ids(data, filtered_annotation_ids):
    """
    Gets the ids of the images in data['images'] that have at least one annotation in filtered_annotation_ids.

    Input:
    data: annotations dictionary; data['images'] is a list of dictionaries with an 'id' key and
          data['annotations'] is a list of dictionaries with 'id' and 'image_id' keys
    filtered_annotation_ids: collection of annotation ids to keep

    Returns:
    filtered_image_ids: set of image ids that appear in data['images'] and are referenced by a kept annotation
    """

    # Build the set of referenced image ids once, instead of rebuilding a
    # list of them for every image (which was quadratic in dataset size).
    annotated_image_ids = {
        annotation['image_id']
        for annotation in data['annotations']
        if annotation['id'] in filtered_annotation_ids
    }

    # Only ids that actually exist in data['images'] are returned.
    return {image['id'] for image in data['images'] if image['id'] in annotated_image_ids}

In [9]:
def move_images(data, input_images_path, output_images_path, filtered_image_ids):
    """
    Copy images from input_images_path to output_images_path if the image id is in filtered_image_ids.

    Despite the name, files are copied, not moved: the source images are left in place.

    Input:
    data: Dictionary containing 'images', a list of dictionaries with 'id' and 'file_name' keys
    input_images_path: Directory with the source images
    output_images_path: Directory to copy the images to (created if it does not exist)
    filtered_image_ids: Collection of image ids for each image that should be copied

    Returns:
    None

    Raises:
    IndexError: if an id in filtered_image_ids has no entry in data['images']
    """

    # Build the id -> file name lookup once rather than scanning data['images']
    # for every id. setdefault keeps the first entry when an id is repeated.
    file_name_by_id = {}
    for image in data['images']:
        file_name_by_id.setdefault(image['id'], image['file_name'])

    for image_id in filtered_image_ids:
        if image_id not in file_name_by_id:
            # Same exception type the per-id list lookup used to raise.
            raise IndexError(f"image id {image_id!r} not found in data['images']")
        image_name = file_name_by_id[image_id]

        # Construct source and destination file paths
        source_path = os.path.join(input_images_path, image_name)
        destination_path = os.path.join(output_images_path, image_name)

        # Ensure the destination directory exists
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)

        # Copy the file from source to destination
        shutil.copyfile(source_path, destination_path)

In [10]:
def construct_filtered_dataset(data, filtered_category_ids, filtered_annotation_ids, filtered_image_ids):
    """
    Build a reduced copy of the dataset that keeps only the selected categories, annotations and images.

    Input:
    data: annotations dictionary with keys 'categories', 'annotations' and 'images'; each maps to a list of dictionaries with an 'id' key
    filtered_category_ids: collection of category ids to keep
    filtered_annotation_ids: collection of annotation ids to keep
    filtered_image_ids: collection of image ids to keep

    Returns:
    filtered_data: dictionary with keys 'categories', 'annotations' and 'images', each holding only the entries whose 'id' was selected
    """

    # Pair every section of the dataset with the ids that should survive in it
    selections = (
        ('categories', filtered_category_ids),
        ('annotations', filtered_annotation_ids),
        ('images', filtered_image_ids),
    )

    return {
        section: [entry for entry in data[section] if entry['id'] in wanted_ids]
        for section, wanted_ids in selections
    }

In [11]:
def filter_data_kids(input_file_path, kids_category_ids, output_file_path):
    """
    Filters the COCO format annotations in the given directory based on the specified category ids.
    Creates a new image directory that includes the images in the original image directory that are also in the filtered annotations.

    The annotations are read from '<input_file_path>/new_ann.json' on disk, so any clean-up
    applied only to in-memory copies of the annotations is not reflected in the output.

    Input:
    - input_file_path (str): The path to the directory with the original annotations ('new_ann.json') and images ('images/').
    - kids_category_ids (set): The category ids to keep.
    - output_file_path (str): The path to the directory where the filtered JSON file ('final_ann.json') and images will be saved.
      Any existing directory at this path is deleted and recreated.

    Returns:
    None: The function writes the filtered data to a file and does not return anything.

    Raises:
    AssertionError: if a selected image's file is not present in the input images directory.
    """

    # Define paths
    input_ann_path = os.path.join(input_file_path, 'new_ann.json')
    input_images_path = os.path.join(input_file_path, 'images')
    output_ann_path = os.path.join(output_file_path, 'final_ann.json')
    output_images_path = os.path.join(output_file_path, 'images')

    # Load the JSON file
    with open(input_ann_path, 'r') as file:
        data = json.load(file)

    # get the filtered category ids
    filtered_category_ids = get_filtered_category_ids(data, kids_category_ids)

    # get the filtered annotation ids
    filtered_annotation_ids = get_filtered_annotation_ids(data, kids_category_ids)

    # get the filtered image ids
    filtered_image_ids = get_filtered_image_ids(data, filtered_annotation_ids)

    # Check that the file of every selected image exists in the images directory.
    # The check uses the recorded 'file_name' (the same value used when copying)
    # rather than parsing directory entries as integers, which crashed on any
    # non-numeric entry such as '.DS_Store'.
    file_name_by_id = {}
    for image in data['images']:
        file_name_by_id.setdefault(image['id'], image['file_name'])
    missing_files = [
        file_name_by_id[image_id]
        for image_id in filtered_image_ids
        if not os.path.isfile(os.path.join(input_images_path, file_name_by_id[image_id]))
    ]
    assert not missing_files, (
        "Not all filtered image ids are in the images directory "
        f"({len(missing_files)} missing, e.g. {missing_files[:5]})"
    )

    # construct the filtered dataset
    filtered_data = construct_filtered_dataset(data, filtered_category_ids, filtered_annotation_ids, filtered_image_ids)

    # Only wipe the previous output once the inputs have been validated, so a
    # failed run does not destroy an earlier good result.
    if os.path.exists(output_file_path):
        shutil.rmtree(output_file_path)
    os.makedirs(output_file_path)  # Recreate the output directory

    # write the filtered dataset to a new JSON file
    with open(output_ann_path, 'w') as file:
        json.dump(filtered_data, file)

    # copy the selected images (the originals stay in place)
    move_images(data, input_images_path, output_images_path, filtered_image_ids)

    # print the number of categories, annotations and images in the filtered dataset
    print(f"Number of categories: {len(filtered_data['categories'])}")
    print(f"Number of annotations: {len(filtered_data['annotations'])}")
    print(f"Number of images: {len(filtered_data['images'])}")

In [12]:
# Source directories (each holds new_ann.json and an images/ folder)
train_input_path, val_input_path = '../bigdata/train', '../bigdata/val'

# Destination directories for the filtered annotations and images
train_output_path, val_output_path = '../filtered_data/train', '../filtered_data/val'

#### Filter kids data - validation

In [13]:
# Filter the validation split: deletes and recreates val_output_path, then
# writes final_ann.json and copies the matching images into it.
filter_data_kids(val_input_path, kids_category_ids, val_output_path)

Number of categories: 102
Number of annotations: 782
Number of images: 551


#### Filter kids data - training

In [14]:
# Filter the training split: deletes and recreates train_output_path, then
# writes final_ann.json and copies the matching images into it.
filter_data_kids(train_input_path, kids_category_ids, train_output_path)

Number of categories: 102
Number of annotations: 36501
Number of images: 23568


## Tests

Check that all images in annotations are in the image folder.

In [15]:
# print current directory (the paths below are relative to it)
print(os.getcwd())

missing_images_ann = set()     # image ids referenced by an annotation but absent from 'images'
non_missing_images = set()     # image ids found both in 'images' and in the image folder
missing_images_folder = set()  # file names listed in 'images' but absent from the image folder

with open('../filtered_data/train/final_ann.json') as f:
  train_ann_filtered = json.load(f)

# Build the lookups once instead of rescanning the lists (and re-listing the
# image folder) for every single annotation.
known_category_ids = {cat['id'] for cat in train_ann_filtered['categories']}
file_name_by_image_id = {}
for img in train_ann_filtered['images']:
    # setdefault keeps the first entry if an image id is repeated
    file_name_by_image_id.setdefault(img['id'], img['file_name'])
files_in_folder = set(os.listdir('../filtered_data/train/images'))

for ann in train_ann_filtered['annotations']:
    # the annotation's category must be one of the filtered categories
    ann_category_id = ann['category_id']
    if ann_category_id not in known_category_ids:
        print('Error:', ann_category_id, 'not in train_ann_filtered')

    # the annotation's image must be listed in the filtered images;
    # record it and move on instead of failing on the file-name lookup
    ann_image_id = ann['image_id']
    if ann_image_id not in file_name_by_image_id:
        missing_images_ann.add(ann_image_id)
        continue

    # the image file must exist in the filtered image folder
    ann_image_file_name = file_name_by_image_id[ann_image_id]
    if ann_image_file_name not in files_in_folder:
        missing_images_folder.add(ann_image_file_name)
        continue

    # only images that passed both checks count as non-missing
    non_missing_images.add(ann_image_id)

print('Number of missing images in the annotations:', len(missing_images_ann))
print('Number of non-missing images:', len(non_missing_images))
print('Number of missing images in the folder:', len(missing_images_folder))

/Users/danielmedina/Documents/Duke/classes/spring-2024/aipi540/AIPI540-project1/notebooks
Number of missing images in the annotations: 0
Number of non-missing images: 23568
Number of missing images in the folder: 0
