# Download a Specific Part of the COCO Dataset (CocoDataset PyPI Package)

Source: 

https://colab.research.google.com/drive/1QuLLsvX-DnOcOVWxcKWglIzDnxV_OHxE?usp=sharing

**Purpose:** Download filtered COCO dataset


## Step 1: Install the CocoDataset package

In [None]:
!pip install CocoDataset==0.1.2

Collecting CocoDataset==0.1.2
  Downloading CocoDataset-0.1.2-py3-none-any.whl (4.2 kB)
Installing collected packages: CocoDataset
Successfully installed CocoDataset-0.1.2


## Step 2: Download annotations for all COCO dataset classes

In [None]:
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip /content/annotations_trainval2017.zip

--2022-04-10 03:02:24--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 54.231.128.113
Connecting to images.cocodataset.org (images.cocodataset.org)|54.231.128.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2022-04-10 03:02:27 (96.3 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  /content/annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  


## Step 3: Download photos with specified class names from the COCO dataset

In [None]:
from pycocotools.coco import COCO
from tqdm import tqdm
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import os


# Input: 
#     class_names: class names to be filtered
#     images_count: how many images to be downloaded
#     annotations_path: path to COCO annotation
# Output: N/A
# Purpose: To download filtered COCO dataset in a folder COCO

def coco_dataset_download(class_names,images_count,annotations_path):
    """Download COCO images that contain any of ``class_names`` into ./COCO.

    Args:
        class_names: list of COCO category names to keep,
            e.g. ``['backpack', 'handbag']``.
        images_count: maximum number of images to download. ``None`` downloads
            every matching image.
        annotations_path: path to a COCO annotation JSON file.

    Returns:
        None. Each image is written to the ``COCO`` folder under the current
        working directory, using the ``file_name`` recorded in the annotations.

    Raises:
        requests.RequestException: a download still fails after the retries
            (connection error, timeout, or an HTTP error status).
    """
    # instantiate COCO specifying the annotations json path
    coco = COCO(annotations_path)
    # Specify a list of category names of interest
    catIds = coco.getCatIds(catNms=class_names)

    # Query one category at a time and take the union, so an image is kept
    # when it contains ANY of the requested classes.
    imgIds_all = set()
    for catId in catIds:
        imgIds_all.update(coco.getImgIds(catIds=[catId]))
    # Sort so that the subset selected by images_count is reproducible.
    imgIds_all = sorted(imgIds_all)
    if images_count is not None:
        imgIds_all = imgIds_all[:max(images_count, 0)]
    images = coco.loadImgs(imgIds_all)

    os.makedirs('COCO', exist_ok=True)

    # One session for all requests: connections are reused and transient
    # failures (connection errors, 5xx responses) are retried with back-off.
    retry_policy = Retry(total=3, backoff_factor=0.5,
                         status_forcelist=[500, 502, 503, 504])
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=retry_policy)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        # Save the images into a local folder
        for im in tqdm(images, position=0, leave=True):
            response = session.get(im['coco_url'], timeout=30)
            # Fail loudly instead of saving an HTML error page as a .jpg file.
            response.raise_for_status()
            with open(os.path.join('COCO', im['file_name']), 'wb') as handler:
                handler.write(response.content)

    print('finished images download')

In [None]:
# NOTE(review): `cocod` is imported here but never used below; the call at the
# end of this cell goes to the `coco_dataset_download` defined in this notebook.
# The `coco_dataset` module is presumably provided by the CocoDataset package
# installed in Step 1 - confirm.
from coco_dataset import coco_dataset_download as cocod
class_names = ['backpack', 'handbag', 'suitcase', 'cell phone']  # category names to download (example)
images_count=50       # requested number of images
annotations_path='/content/annotations/instances_train2017.json' # COCO annotation file extracted in Step 2
# call the download function defined in the previous cell
coco_dataset_download(class_names,images_count,annotations_path)

loading annotations into memory...
Done (t=17.10s)
creating index...
index created!


100%|██████████| 15587/15587 [46:55<00:00,  5.54it/s]

finished images download





## Zip and Download the filtered COCO dataset

In [None]:
!zip -r /content/COCO.zip /content/COCO
from google.colab import files
files.download("/content/COCO.zip")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/COCO/000000326832.jpg (deflated 0%)
  adding: content/COCO/000000460234.jpg (deflated 1%)
  adding: content/COCO/000000167955.jpg (deflated 1%)
  adding: content/COCO/000000393207.jpg (deflated 5%)
  adding: content/COCO/000000243927.jpg (deflated 0%)
  adding: content/COCO/000000169510.jpg (deflated 0%)
  adding: content/COCO/000000537962.jpg (deflated 0%)
  adding: content/COCO/000000168287.jpg (deflated 0%)
  adding: content/COCO/000000529586.jpg (deflated 0%)
  adding: content/COCO/000000535682.jpg (deflated 1%)
  adding: content/COCO/000000245298.jpg (deflated 0%)
  adding: content/COCO/000000361359.jpg (deflated 0%)
  adding: content/COCO/000000416186.jpg (deflated 1%)
  adding: content/COCO/000000554729.jpg (deflated 0%)
  adding: content/COCO/000000012269.jpg (deflated 0%)
  adding: content/COCO/000000152328.jpg (deflated 0%)
  adding: content/COCO/000000379261.jpg (deflated 0%)
  adding: content

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Step 4: (Failed Attempt) Download annotations with specified class names from the COCO dataset

In [None]:
from pycocotools.coco import COCO
from tqdm import tqdm
import os

# Input: 
#     class_names: class names to be filtered
#     images_count: how many images to be downloaded
#     annotations_path: path to COCO annotation
# Output: N/A
# Purpose: To output filtered COCO annotation

def coco_annotation_filter(class_names, images_count, annotations_path, output_path=None):
    """Select the COCO images and annotations that belong to ``class_names``.

    Args:
        class_names: list of COCO category names to keep.
        images_count: maximum number of images to keep. ``None`` keeps every
            matching image.
        annotations_path: path to a COCO annotation JSON file.
        output_path: optional path of a JSON file to write. When given, the
            filtered subset is saved there as a dict with the keys
            ``'images'``, ``'annotations'`` and ``'categories'``. When omitted
            nothing is written.

    Returns:
        None. The number of selected annotations and one sample image record
        and annotation record (when available) are printed.
    """
    # instantiate COCO specifying the annotations json path
    coco = COCO(annotations_path)
    # Specify a list of category names of interest
    catIds = coco.getCatIds(catNms=class_names)

    # Query one category at a time and take the union, so an image is kept
    # when it contains ANY of the requested classes.
    imgIds_all = set()
    for catId in catIds:
        imgIds_all.update(coco.getImgIds(catIds=[catId]))
    # Sort so that the subset selected by images_count is reproducible.
    imgIds_all = sorted(imgIds_all)
    if images_count is not None:
        imgIds_all = imgIds_all[:max(images_count, 0)]
    images = coco.loadImgs(imgIds_all)

    # Get the corresponding annotation ids and annotations using loadAnns.
    # Skip the query when no image was selected: pycocotools appears to treat
    # an empty imgIds list as "no image filter" - TODO confirm.
    if imgIds_all:
        annIds = coco.getAnnIds(imgIds=imgIds_all, catIds=catIds, iscrowd=None)
        annotations = coco.loadAnns(annIds)
    else:
        annotations = []

    # Short summary; the samples are guarded so an empty selection does not
    # raise IndexError.
    print(len(annotations))
    if images:
        print(images[0])
    if annotations:
        print(annotations[0])

    if output_path is not None:
        import json  # local import: only needed when a file is requested

        filtered = {
            'images': images,
            'annotations': annotations,
            'categories': coco.loadCats(catIds),
        }
        with open(output_path, 'w') as handler:
            json.dump(filtered, handler)

    print('finished annotation filtering')

In [None]:
class_names = ['backpack', 'handbag', 'suitcase', 'cell phone']  # category names to keep (example)
images_count=2      # image limit, kept small for this trial run
annotations_path='/content/annotations/instances_train2017.json' # COCO annotation file extracted in Step 2
# call the annotation filter (this cell does not download any image)
coco_annotation_filter(class_names,images_count,annotations_path)

loading annotations into memory...
Done (t=30.01s)
creating index...
index created!
33700
{'license': 1, 'file_name': '000000163840.jpg', 'coco_url': 'http://images.cocodataset.org/train2017/000000163840.jpg', 'height': 640, 'width': 359, 'date_captured': '2013-11-18 08:04:45', 'flickr_url': 'http://farm6.staticflickr.com/5099/5418233449_b0b5e02520_z.jpg', 'id': 163840}
{'segmentation': [[191.98, 414.88, 175.27, 420.64, 172.39, 420.64, 167.79, 414.88, 163.75, 410.27, 162.6, 403.36, 167.21, 394.14, 170.67, 390.11, 181.61, 383.2, 192.56, 380.32, 199.47, 383.77, 208.11, 388.96, 221.93, 388.96, 244.98, 386.65, 247.86, 391.26, 212.72, 401.05, 197.74, 408.54]], 'area': 1586.7993500000011, 'iscrowd': 0, 'image_id': 163840, 'bbox': [162.6, 380.32, 85.26, 40.32], 'category_id': 31, 'id': 1433275}


KeyboardInterrupt: ignored

# YOLO-Coco-Dataset-Custom-Classes-Extractor

Source: 

https://github.com/KaranJagtiani/YOLO-Coco-Dataset-Custom-Classes-Extractor

*   Download specific classes from the COCO dataset for custom object detection needs.
*   Download multiple classes at the same time (Multi-threaded).
*   Pickup where you left off if your connection is interrupted.

In [None]:
# Install Packages Required: pycocotools
!pip install pycocotools

# Clone this repository
!git clone https://github.com/KaranJagtiani/YOLO-Coco-Dataset-Custom-Classes-Extractor.git
%cd YOLO-Coco-Dataset-Custom-Classes-Extractor

Cloning into 'YOLO-Coco-Dataset-Custom-Classes-Extractor'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 22 (delta 4), reused 8 (delta 1), pack-reused 0[K
Unpacking objects: 100% (22/22), done.
/content/YOLO-Coco-Dataset-Custom-Classes-Extractor


In [None]:
# Download multiple classes in one run: every quoted argument is one class name
# (the quotes keep 'cell phone' together as a single argument).
!python coco-extractor.py 'backpack' 'cell phone' 'handbag' 'suitcase'


Classes to download:  ['backpack', 'cell phone', 'handbag', 'suitcase']


Made downloaded_images Directory.

loading annotations into memory...
Done (t=0.71s)
creating index...
index created!

Made downloaded_images/backpack Directory.

Total Images: 228 for class 'backpack'

Made downloaded_images/suitcase Directory.


Made downloaded_images/handbag Directory.

Total Images: 292 for class 'handbag'

Made downloaded_images/cell phone Directory.

Total Images: 105 for class 'suitcase'
Total Images: 214 for class 'cell phone'
handbag. Downloading - 000000349184.jpg
backpack. Downloading - 000000184324.jpg
cell phone. Downloading - 000000376322.jpg
suitcase. Downloading - 000000125952.jpg
handbag. Downloading - 000000520707.jpg
cell phone. Downloading - 000000300039.jpg
suitcase. Downloading - 000000520707.jpg
backpack. Downloading - 000000277005.jpg
handbag. Downloading - 000000184324.jpg
suitcase. Downloading - 000000158744.jpg
cell phone. Downloading - 000000459272.jpg
backpack. Downl

# Annotation: instances_train2017.json

Download filtered training set

In [None]:
import os

# Count what the extractor wrote: one line per class folder, then the total
# number of entries and the number of distinct file names across all classes
# (the same name can appear under several classes).
# The list is deliberately not called `files`, so it does not shadow the
# `google.colab.files` module that the zip/download cells import.
download_root = 'downloaded_images'
downloaded_names = []
for class_name in sorted(os.listdir(download_root)):
  # List each class folder once and reuse the result for both the count and
  # the running total.
  class_entries = os.listdir(os.path.join(download_root, class_name))
  print(len(class_entries))
  downloaded_names += class_entries

print(len(downloaded_names))
print(len(set(downloaded_names)))

11056
9606
13682
4804
39148
31174


# Annotation: instances_val2017.json

Download filtered validation set

In [None]:
import os

# Count what the extractor wrote: one line per class folder, then the total
# number of entries and the number of distinct file names across all classes
# (the same name can appear under several classes).
# The list is deliberately not called `files`, so it does not shadow the
# `google.colab.files` module that the zip/download cells import.
download_root = 'downloaded_images'
downloaded_names = []
for class_name in sorted(os.listdir(download_root)):
  # List each class folder once and reuse the result for both the count and
  # the running total.
  class_entries = os.listdir(os.path.join(download_root, class_name))
  print(len(class_entries))
  downloaded_names += class_entries

print(len(downloaded_names))
print(len(set(downloaded_names)))

456
428
584
210
1678
1316


## Zip and Download the filtered COCO dataset with annotations

In [None]:
!zip -r downloaded_images.zip downloaded_images
from google.colab import files
files.download("downloaded_images.zip")

  adding: downloaded_images/ (stored 0%)
  adding: downloaded_images/backpack/ (stored 0%)
  adding: downloaded_images/backpack/000000001268.jpg (deflated 0%)
  adding: downloaded_images/backpack/000000353180.txt (deflated 43%)
  adding: downloaded_images/backpack/000000025181.jpg (deflated 0%)
  adding: downloaded_images/backpack/000000224807.txt (deflated 28%)
  adding: downloaded_images/backpack/000000065350.jpg (deflated 0%)
  adding: downloaded_images/backpack/000000274460.jpg (deflated 0%)
  adding: downloaded_images/backpack/000000516677.txt (deflated 58%)
  adding: downloaded_images/backpack/000000308793.txt (deflated 31%)
  adding: downloaded_images/backpack/000000151480.txt (deflated 53%)
  adding: downloaded_images/backpack/000000003255.txt (deflated 44%)
  adding: downloaded_images/backpack/000000012670.jpg (deflated 0%)
  adding: downloaded_images/backpack/000000031093.jpg (deflated 5%)
  adding: downloaded_images/backpack/000000507667.txt (deflated 25%)
  adding: download

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>