<a href="https://colab.research.google.com/github/copyrightFreeGenAI/copyrightFreeImagesGenAI/blob/main/1.%20MS%20COCO%20Dataset%20Creation/Dataset_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Create MS COCO Dataset**

In [None]:
# Set Paths for Fine-tuning Dataset
# NOTE: the "Train" annotation paths below point at the COCO 2014 *validation* json files.
annTrainInstanceFiles = "/content/gdrive/MyDrive/annotations2014/instances_val2014.json" # Path to validation instance json, download from MS COCO
annTrainCaptionFiles = "/content/gdrive/MyDrive/annotations2014/captions_val2014.json" # Path to validation caption json, download from MS COCO
train_ranking_directory = "/content/gdrive/MyDrive/DATA/COCO/Fine-Tuning/Rankings" # Path to directory where rankings will be saved
train_images_directory = "/content/gdrive/MyDrive/DATA/COCO/Fine-Tuning/Images" # Path to directory where images will be saved

# Set Paths for Benchmark Dataset
# NOTE: the "Val" annotation paths below point at the COCO 2014 *train* json files.
annValInstanceFiles = "/content/gdrive/MyDrive/annotations2014/instances_train2014.json" # Path to train instance json, download from MS COCO
annValCaptionFiles = "/content/gdrive/MyDrive/annotations2014/captions_train2014.json" # Path to train caption json, download from MS COCO
val_ranking_directory = "/content/gdrive/MyDrive/DATA/COCO/Validation/Rankings"  # Path to directory where rankings will be saved
val_images_directory = "/content/gdrive/MyDrive/DATA/COCO/Validation/Images" # Path to directory where images will be saved

In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the dataset paths
# configured in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Download the spaCy "en_core_web_md" model, which this notebook loads with spacy.load
!python -m spacy download en_core_web_md

In [None]:
from pycocotools.coco import COCO
import skimage.io as io
import os
import re
import json
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16  import preprocess_input, decode_predictions
import numpy as np
from tensorflow.image import resize
import spacy

In [None]:
# VGG16 classifier with ImageNet weights; used to predict labels for each image
model = VGG16(weights='imagenet')
# spaCy pipeline; used to compare predicted labels with COCO category names.
# Later cells call `nlp(...)`, so this name must not be reassigned.
nlp = spacy.load("en_core_web_md")

## Fine-tuning Dataset

In [None]:
# Load the instance and caption annotations used for the fine-tuning dataset
coco=COCO(annTrainInstanceFiles)
coco_caps=COCO(annTrainCaptionFiles)
# Full list of category records; each one is read via cat['name'] below
cats = coco.loadCats(coco.getCatIds())

### Generate Fine-tuning Rankings

In [None]:
# For every category, score each candidate image on two criteria and store the
# scores as JSON (keyed by the image's index in `imgIds`):
#   * area_<category>.json - area of the first annotated object / image area
#   * nlp_<category>.json  - similarity between VGG16's top-5 labels and the
#                            category name, weighted by prediction confidence
for cat in cats:
    # Skip categories that don't have enough good quality images
    if cat['name'] in ['handbag', 'sports ball', 'toaster', 'hair drier']:
        continue

    print(cat['name'])  # Log current category being processed

    area_dic = {}  # image index -> object area relative to image size
    nlp_dic = {}   # image index -> semantic relevance score

    category = nlp(cat['name'])  # Parse category name with NLP model

    # Get image IDs that contain this category
    catIds = coco.getCatIds(catNms=[cat['name']])
    imgIds = coco.getImgIds(catIds=catIds)

    # Create a folder to store results for this category
    folder_path = os.path.join(train_ranking_directory, cat['name'])
    os.makedirs(folder_path, exist_ok=True)

    # Limit to 10,000 images max. The index `i` (position in imgIds) is the key
    # written to the JSON files and is later used to look the image up again.
    for i, img_id in enumerate(imgIds[:10000]):
        img = coco.loadImgs(img_id)[0]

        # Load captions for the image
        caption_anns = coco_caps.loadAnns(coco_caps.getAnnIds(imgIds=img['id']))
        prompt = ' '.join(item['caption'].lower() for item in caption_anns)

        # Skip if the category name is not mentioned in the captions
        if cat['name'] not in prompt:
            continue

        try:
            # The download is inside the try so that a single failed request
            # skips this image instead of aborting the whole ranking run.
            I = io.imread(img['coco_url'])
            # Resize image and calculate object area ratio
            resized_image = resize(I, (224, 224)).numpy()
            annIds = coco.getAnnIds(imgIds=img['id'], catIds=catIds, iscrowd=None)
            anns = coco.loadAnns(annIds)
            area = anns[0]['area'] / (img['height'] * img['width'])
        except Exception as exc:
            # Best effort: skip the image if download, resizing or annotation
            # lookup fails. `Exception` (not a bare except) keeps Ctrl-C working.
            print(f"Skipping image {img['id']}: {exc}")
            continue

        # Preprocess image for model prediction
        preprocessed_image = preprocess_input(np.expand_dims(resized_image, axis=0))
        predictions = model.predict(preprocessed_image)
        decoded_predictions = decode_predictions(predictions, top=5)[0]

        # Compute NLP similarity between predicted labels and category,
        # weighted by the classifier's confidence in each label
        quality = 0
        for _, label, score in decoded_predictions:
            word = nlp(label.replace("_", " "))
            quality += category.similarity(word) * score

        # Record both scores together so the two files always share the same keys.
        area_dic[i] = area
        # Cast to a built-in float: `quality` may be a NumPy scalar, which
        # json.dump cannot always serialize.
        nlp_dic[i] = float(quality)

    # Save results to JSON files
    with open(os.path.join(folder_path, f"area_{cat['name']}.json"), 'w') as json_file:
        json.dump(area_dic, json_file)

    with open(os.path.join(folder_path, f"nlp_{cat['name']}.json"), 'w') as json_file:
        json.dump(nlp_dic, json_file)

### Retrieve Top 20 Images per Category based on Ranking

In [None]:
# For every category, combine the stored area and NLP scores (each min-max
# normalised, weighted 50/50), pick the 20 best images and save them as
# <train_images_directory>/<category>/<image index>.jpg
for cat in cats:
    if cat['name'] in ['handbag', 'sports ball', 'toaster', 'hair drier']:
        continue
    print(cat['name'])
    area_file = os.path.join(train_ranking_directory, cat['name'], "area_" + cat['name'] + ".json")
    nlp_file = os.path.join(train_ranking_directory, cat['name'], "nlp_" + cat['name'] + ".json")

    # Prepare to save images
    save_dir = os.path.join(train_images_directory, cat['name'])
    os.makedirs(save_dir, exist_ok=True)

    # Load area and NLP data
    with open(area_file, 'r') as file:
        area_scores = json.load(file)
    with open(nlp_file, 'r') as file:
        # Deliberately not named `nlp`: that would overwrite the spaCy pipeline
        # that the ranking cells call as `nlp(...)`.
        nlp_scores = json.load(file)

    # Only images present in both files can be scored
    common_keys = [key for key in area_scores if key in nlp_scores]
    if not common_keys:
        print(f"Warning: no ranked images for category {cat['name']}, skipping.")
        continue

    # Normalize area (a zero span means all values are equal -> use 0.0
    # instead of dividing by zero)
    min_area = min(area_scores.values())
    area_span = max(area_scores.values()) - min_area
    normalized_area = {
        key: (value - min_area) / area_span if area_span else 0.0
        for key, value in area_scores.items()
    }

    # Normalize nlp (same zero-span guard)
    min_nlp = min(nlp_scores.values())
    nlp_span = max(nlp_scores.values()) - min_nlp
    normalized_nlp = {
        key: (value - min_nlp) / nlp_span if nlp_span else 0.0
        for key, value in nlp_scores.items()
    }

    # Calculate scores using normalized values
    scores = {key: 0.5 * normalized_area[key] + 0.5 * normalized_nlp[key] for key in common_keys}

    # Get the top 20 scores
    top_20 = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:20]

    # Get category and image IDs
    catIds = coco.getCatIds(catNms=[cat['name']])
    imgIds = coco.getImgIds(catIds=catIds)

    # Save the top 20 images
    for key, _ in top_20:
        # JSON keys are strings; they hold the image's index in imgIds
        img_index = int(key)
        if img_index < len(imgIds):
            img = coco.loadImgs(imgIds[img_index])[0]
            img_url = img['coco_url']
            I = io.imread(img_url)
            file_path = os.path.join(save_dir, f"{key}.jpg")
            io.imsave(file_path, I)
        else:
            print(f"Warning: Key {key} does not correspond to a valid imgId.")

## Benchmark Dataset

In [None]:
# Load the instance and caption annotations for the benchmark dataset.
# These must be the benchmark (annVal*) files; loading the annTrain* files
# here would draw the benchmark from the same annotations as the fine-tuning set.
coco=COCO(annValInstanceFiles)
coco_caps=COCO(annValCaptionFiles)
# Full list of category records; each one is read via cat['name'] below
cats = coco.loadCats(coco.getCatIds())

### Generate Benchmark Rankings

In [None]:
# For every category, score each candidate image on two criteria and store the
# scores as JSON (keyed by the image's index in `imgIds`):
#   * area_<category>.json - area of the first annotated object / image area
#   * nlp_<category>.json  - similarity between VGG16's top-5 labels and the
#                            category name, weighted by prediction confidence
for cat in cats:
    # Skip categories lacking sufficient good quality images
    if cat['name'] in ['handbag', 'sports ball', 'toaster', 'hair drier']:
        continue

    print(cat['name'])  # Log current category being processed

    area_dic = {}  # image index -> object area relative to image size
    nlp_dic = {}   # image index -> semantic similarity quality score

    category = nlp(cat['name'])  # Process category name with NLP model

    # Get image IDs associated with this category
    catIds = coco.getCatIds(catNms=[cat['name']])
    imgIds = coco.getImgIds(catIds=catIds)

    # Create output directory for results
    folder_path = os.path.join(val_ranking_directory, cat['name'])
    os.makedirs(folder_path, exist_ok=True)

    # Limit to 10,000 images. The index `i` (position in imgIds) is the key
    # written to the JSON files and is later used to look the image up again.
    for i, img_id in enumerate(imgIds[:10000]):
        img = coco.loadImgs(img_id)[0]

        # Get all captions for the image
        caption_anns = coco_caps.loadAnns(coco_caps.getAnnIds(imgIds=img['id']))
        prompt = ' '.join(item['caption'].lower() for item in caption_anns)

        # Skip images where captions don't mention the category
        if cat['name'] not in prompt:
            continue

        try:
            # The download is inside the try so that a single failed request
            # skips this image instead of aborting the whole ranking run.
            I = io.imread(img['coco_url'])
            # Resize image and compute area of annotated object
            resized_image = resize(I, (224, 224)).numpy()
            annIds = coco.getAnnIds(imgIds=img['id'], catIds=catIds, iscrowd=None)
            anns = coco.loadAnns(annIds)
            area = anns[0]['area'] / (img['height'] * img['width'])
        except Exception as exc:
            # Best effort: skip the image if download, resizing or annotation
            # lookup fails. `Exception` (not a bare except) keeps Ctrl-C working.
            print(f"Skipping image {img['id']}: {exc}")
            continue

        # Run image through prediction model
        preprocessed_image = preprocess_input(np.expand_dims(resized_image, axis=0))
        predictions = model.predict(preprocessed_image)
        decoded_predictions = decode_predictions(predictions, top=5)[0]

        # Score image by NLP similarity between predicted labels and category,
        # weighted by the classifier's confidence in each label
        quality = 0
        for _, label, score in decoded_predictions:
            word = nlp(label.replace("_", " "))
            quality += category.similarity(word) * score

        # Record both scores together so the two files always share the same keys.
        area_dic[i] = area
        # Cast to a built-in float: `quality` may be a NumPy scalar, which
        # json.dump cannot always serialize.
        nlp_dic[i] = float(quality)

    # Save results for this category
    with open(os.path.join(folder_path, f"area_{cat['name']}.json"), 'w') as json_file:
        json.dump(area_dic, json_file)

    with open(os.path.join(folder_path, f"nlp_{cat['name']}.json"), 'w') as json_file:
        json.dump(nlp_dic, json_file)

### Retrieve Top 100 Images per Category based on Ranking

In [None]:
# For every category, combine the stored area and NLP scores (each min-max
# normalised, weighted 50/50), pick the 100 best images and save them as
# <val_images_directory>/<category>/<image index>.jpg
for cat in cats:
    # Skip categories lacking sufficient good quality images
    if cat['name'] in ['handbag', 'sports ball', 'toaster', 'hair drier']:
        continue
    print(cat['name'])
    area_file = os.path.join(val_ranking_directory, cat['name'], "area_" + cat['name'] + ".json")
    nlp_file = os.path.join(val_ranking_directory, cat['name'], "nlp_" + cat['name'] + ".json")

    # Load area and NLP data
    with open(area_file, 'r') as file:
        area_scores = json.load(file)
    with open(nlp_file, 'r') as file:
        # Deliberately not named `nlp`: that would overwrite the spaCy pipeline
        # that the ranking cells call as `nlp(...)`.
        nlp_scores = json.load(file)

    # Only images present in both files can be scored
    common_keys = [key for key in area_scores if key in nlp_scores]
    if not common_keys:
        print(f"Warning: no ranked images for category {cat['name']}, skipping.")
        continue

    # Normalize area (a zero span means all values are equal -> use 0.0
    # instead of dividing by zero)
    min_area = min(area_scores.values())
    area_span = max(area_scores.values()) - min_area
    normalized_area = {
        key: (value - min_area) / area_span if area_span else 0.0
        for key, value in area_scores.items()
    }

    # Normalize nlp (same zero-span guard)
    min_nlp = min(nlp_scores.values())
    nlp_span = max(nlp_scores.values()) - min_nlp
    normalized_nlp = {
        key: (value - min_nlp) / nlp_span if nlp_span else 0.0
        for key, value in nlp_scores.items()
    }

    # Calculate scores using normalized values
    scores = {key: 0.5 * normalized_area[key] + 0.5 * normalized_nlp[key] for key in common_keys}

    # Get the top 100 scores
    top_100 = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:100]

    # Prepare to save images
    save_dir = os.path.join(val_images_directory, cat['name'])
    os.makedirs(save_dir, exist_ok=True)

    # Get category and image IDs
    catIds = coco.getCatIds(catNms=[cat['name']])
    imgIds = coco.getImgIds(catIds=catIds)

    # Save the top 100 images
    for key, _ in top_100:
        # JSON keys are strings; they hold the image's index in imgIds
        img_index = int(key)
        if img_index < len(imgIds):
            img = coco.loadImgs(imgIds[img_index])[0]
            img_url = img['coco_url']
            I = io.imread(img_url)
            file_path = os.path.join(save_dir, f"{key}.jpg")
            io.imsave(file_path, I)
        else:
            print(f"Warning: Key {key} does not correspond to a valid imgId.")