# Download and Filter The COCO Dataset

## Prepare Workspace

### Define Google Colab Flag

In [1]:
# Set to True when running on Google Colab. The Drive mount, the file copies,
# the path definitions and the sys.path setup below are all guarded by this flag.
GOOGLE_COLAB = True

### Mount Google Drive

In [2]:
if GOOGLE_COLAB:
    import os
    from google.colab import drive

    # Probe the same "MyDrive" folder that the rest of this notebook reads from
    # and writes to, so the "already mounted" check agrees with the paths that
    # are actually used later on.
    if not os.path.isdir('/content/drive/MyDrive'):
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
    else:
        print("Google Drive is already mounted.")

Mounting Google Drive...
Mounted at /content/drive


### Set Up Directories & Install Libraries
Create the directories needed and copy uploaded files into them

In [30]:
if GOOGLE_COLAB:
    # Create the local working directory (-p: no error if it already exists)
    !mkdir -p /content/data

    # Copy the utils folder from Google Drive into /content
    !cp -r /content/drive/MyDrive/eml_challenge/utils /content
    # Copy the train and val annotation JSON files from Google Drive into the working directory
    !cp /content/drive/MyDrive/eml_challenge/data/COCO/coco_train2017_person.json /content/data/
    !cp /content/drive/MyDrive/eml_challenge/data/COCO/coco_val2017_person.json /content/data/

### Define Path to Dataset

In [20]:
if GOOGLE_COLAB:
    # Dataset folder on the mounted Google Drive
    DATA_PATH      = "/content/drive/MyDrive/eml_challenge/data/COCO/"
    # Local working directory used for the downloaded archives and annotation files
    TEMP_DATA_PATH = "/content/data/"

### Append Directory Paths to System Path

In [21]:
import sys
if GOOGLE_COLAB:
    # Register the workspace root, the copied utils folder and the local data
    # directory on the module search path, in that order.
    sys.path.extend(['/content', '/content/utils', TEMP_DATA_PATH])

### Import Libraries

In [22]:
# Libraries
from tqdm import tqdm
import urllib.request
from zipfile import ZipFile
import os
import json

## Define Dataset Filtering Function

### Define delete_non_person_images Function

In [28]:
def delete_non_person_images(annotations_file, images_dir):
    """
    Deletes every entry in a directory whose name is not listed in the annotations file.

    The names to keep are the ``file_name`` values of the entries under the
    top-level ``images`` key of the JSON file. Deletion is best-effort: a
    failure on a single entry is reported and the scan continues.

    Args:
        annotations_file (str): Path to the filtered JSON annotations file.
        images_dir (str): Directory containing the COCO images.

    Raises:
        OSError: If the annotations file cannot be opened or ``images_dir``
            cannot be listed.
        KeyError: If the JSON has no ``images`` key or an image entry has no
            ``file_name`` key.
    """
    # Load the annotations file
    with open(annotations_file, 'r') as f:
        annotations = json.load(f)

    # Only the file names are needed to decide what to keep; image ids are not
    # consulted, so entries without an 'id' field are accepted.
    filenames_to_keep = {image['file_name'] for image in annotations['images']}

    print("Scanning images in the directory...")
    for image_file in tqdm(os.listdir(images_dir)):
        if image_file in filenames_to_keep:
            continue

        file_path = os.path.join(images_dir, image_file)
        try:
            os.remove(file_path)
        except OSError as e:
            # e.g. the entry is a sub-directory or is not writable: report and move on
            tqdm.write(f"Error deleting {file_path}: {e}")

    print("Completed: All non-person images have been deleted.")

## Execute Workspace

### Download & Unpack Training Dataset

In [31]:
train_url = "http://images.cocodataset.org/zips/train2017.zip"
train_set_zip_filepath = TEMP_DATA_PATH + 'train2017.zip'

# Download the archive into the local working directory
urllib.request.urlretrieve(train_url, train_set_zip_filepath)

# Extract into TEMP_DATA_PATH explicitly. Without a target, extractall() unpacks
# into the current working directory, but the cells below look for the images
# under TEMP_DATA_PATH + 'train2017'.
with ZipFile(train_set_zip_filepath, 'r') as train_archive:
    train_archive.extractall(TEMP_DATA_PATH)

### Download & Unpack Validation Dataset

In [32]:
val_url = "http://images.cocodataset.org/zips/val2017.zip"
val_set_zip_filepath = TEMP_DATA_PATH + 'val2017.zip'

# Download the archive into the local working directory
urllib.request.urlretrieve(val_url, val_set_zip_filepath)

# Extract into TEMP_DATA_PATH explicitly. Without a target, extractall() unpacks
# into the current working directory, but the cells below look for the images
# under TEMP_DATA_PATH + 'val2017'.
with ZipFile(val_set_zip_filepath, 'r') as val_archive:
    val_archive.extractall(TEMP_DATA_PATH)

### Test Downloading and Unpacking
This section verifies that the files were downloaded and unpacked correctly by printing the first 5 file names of each dataset.

In [34]:
def _preview_directory(dir_path, limit=5):
    """List ``dir_path``, print its first ``limit`` entries and return the full listing."""
    entries = os.listdir(dir_path)
    print("Files and directories in '", dir_path, "' :")
    print(entries[0:limit])
    return entries


# Training split: list the extracted folder and show a small sample
train_set_filepath = TEMP_DATA_PATH + 'train2017'
train_dir_list     = _preview_directory(train_set_filepath)

# Validation split: same check
val_set_filepath = TEMP_DATA_PATH + 'val2017'
val_dir_list     = _preview_directory(val_set_filepath)

Files and directories in ' /content/data/train2017 ' :
['000000113599.jpg', '000000447074.jpg', '000000103806.jpg', '000000193465.jpg', '000000185697.jpg']
Files and directories in ' /content/data/val2017 ' :
['000000476704.jpg', '000000201646.jpg', '000000167572.jpg', '000000322944.jpg', '000000574520.jpg']


### Filter Datasets

In [35]:
# Example usage
train_annotations_file = TEMP_DATA_PATH + "coco_train2017_person.json"  # JSON file with only person annotations
val_annotations_file   = TEMP_DATA_PATH + "coco_val2017_person.json"    # JSON file with only person annotations

delete_non_person_images(train_annotations_file, train_set_filepath)
delete_non_person_images(val_annotations_file, val_set_filepath)

print(len(os.listdir(train_set_filepath)))# before: 118287
print(len(os.listdir(val_set_filepath)))  # before: 5000

Scanning images in the directory...


100%|██████████| 118287/118287 [00:04<00:00, 27480.50it/s]


Completed: All non-person images have been deleted.
Scanning images in the directory...


100%|██████████| 5000/5000 [00:00<00:00, 33822.90it/s]

Completed: All non-person images have been deleted.
64115
2693





### Upload Files to Google Drive

In [39]:
# Copy the filtered image folders from the local working directory to the
# COCO dataset folder on Google Drive (recursive copy of each folder).
!cp -r /content/data/train2017/ /content/drive/MyDrive/eml_challenge/data/COCO/
!cp -r /content/data/val2017/ /content/drive/MyDrive/eml_challenge/data/COCO/

In [37]:
# Print the total on-disk size of each image folder (-s: summary only, -h: human-readable units)

!du -sh /content/data/train2017
!du -sh /content/data/val2017

9.9G	/content/data/train2017
422M	/content/data/val2017
