# Dataset 준비

## 1. Downloading Custom Labeled Data from Roboflow

In [1]:
!pip install roboflow

Collecting roboflow
  Downloading roboflow-1.1.9-py3-none-any.whl.metadata (9.1 kB)
Collecting chardet==4.0.0 (from roboflow)
  Downloading chardet-4.0.0-py2.py3-none-any.whl (178 kB)
     ---------------------------------------- 0.0/178.7 kB ? eta -:--:--
     -- ------------------------------------- 10.2/178.7 kB ? eta -:--:--
     -------------------------------------- 178.7/178.7 kB 3.6 MB/s eta 0:00:00
Collecting cycler==0.10.0 (from roboflow)
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting idna==2.10 (from roboflow)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
     ---------------------------------------- 0.0/58.8 kB ? eta -:--:--
     ---------------------------------------- 58.8/58.8 kB ? eta 0:00:00
Collecting opencv-python-headless==4.8.0.74 (from roboflow)
  Downloading opencv_python_headless-4.8.0.74-cp37-abi3-win_amd64.whl.metadata (19 kB)
Collecting pyparsing==2.4.7 (from roboflow)
  Downloading pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
   

In [2]:
# Coupang Dataset for YOLOv8
import os
from getpass import getpass

from roboflow import Roboflow

# The API key is a credential and must not be stored in the notebook: take it
# from the ROBOFLOW_API_KEY_COUPANG environment variable, or prompt for it
# (without echoing) when the variable is unset or empty.
# NOTE(review): the key that was previously hardcoded in this cell should be
# considered leaked and be revoked/rotated in Roboflow.
coupang_api_key = os.environ.get("ROBOFLOW_API_KEY_COUPANG") or getpass(
    "Roboflow API key for workspace 'inisw91': "
)

rf = Roboflow(api_key=coupang_api_key)
project = rf.workspace("inisw91").project("coupang_sweatshirt_2000")
# Download version 5 of the project, requested in the "yolov8" format
dataset = project.version(5).download("yolov8")

loading Roboflow workspace...
loading Roboflow project...
Dependency ultralytics==8.0.196 is required but found version=8.0.215, to fix: `pip install ultralytics==8.0.196`


Downloading Dataset Version Zip in Coupang_Sweatshirt_2000-5 to yolov8:: 100%|██████████| 83206/83206 [00:04<00:00, 18279.59it/s]




Extracting Dataset Version Zip to Coupang_Sweatshirt_2000-5 in yolov8:: 100%|██████████| 5626/5626 [00:02<00:00, 1963.55it/s]


In [4]:
# Youtube Dataset for YOLOv8
import os
from getpass import getpass

from roboflow import Roboflow

# The API key is a credential and must not be stored in the notebook: take it
# from the ROBOFLOW_API_KEY_YOUTUBE environment variable, or prompt for it
# (without echoing) when the variable is unset or empty.
# NOTE(review): the key that was previously hardcoded in this cell should be
# considered leaked and be revoked/rotated in Roboflow.
youtube_api_key = os.environ.get("ROBOFLOW_API_KEY_YOUTUBE") or getpass(
    "Roboflow API key for workspace 'project-9yric': "
)

rf = Roboflow(api_key=youtube_api_key)
project = rf.workspace("project-9yric").project("sweatshirts_youtube")
# Download version 2 of the project, requested in the "yolov8" format
dataset = project.version(2).download("yolov8")

loading Roboflow workspace...
loading Roboflow project...
Dependency ultralytics==8.0.196 is required but found version=8.0.215, to fix: `pip install ultralytics==8.0.196`


Downloading Dataset Version Zip in Sweatshirts_youtube-2 to yolov8:: 100%|██████████| 56495/56495 [00:02<00:00, 25018.02it/s]




Extracting Dataset Version Zip to Sweatshirts_youtube-2 in yolov8:: 100%|██████████| 3206/3206 [00:01<00:00, 1955.13it/s]


# 2. Merging Datasets with FiftyOne

In [None]:
!pip install fiftyone

## 2-1. Coupang + Youtube

In [22]:
import fiftyone as fo


# Initialize the main dataset that the sweatshirt exports are loaded into.
# overwrite=True replaces a dataset of the same name left over from an earlier
# run, so this cell can be re-executed without raising "already exists" and
# without having to rename the dataset on every attempt.
name = "Coupang+Youtube_4thtry"
dataset = fo.Dataset(name, overwrite=True)

# Import every split of the Coupang export. Each sample is tagged with the
# split it was loaded from so the splits can be selected again by tag later.
dataset_dir = "D:/Data/Coupang_Sweatshirt_2000-5"
splits = ["train", "test", "val"]
for split in splits:
    dataset.add_dir(
        dataset_dir=dataset_dir,
        dataset_type=fo.types.YOLOv5Dataset,
        split=split,
        tags=[split],
    )
    

 100% |███████████████| 1914/1914 [1.9s elapsed, 0s remaining, 1.0K samples/s]       
 100% |█████████████████| 295/295 [285.4ms elapsed, 0s remaining, 1.0K samples/s]      
 100% |█████████████████| 598/598 [547.5ms elapsed, 0s remaining, 1.1K samples/s]      


In [25]:
# Merge the YouTube export into the same `dataset` object, one split at a
# time, tagging every sample with the split it was loaded from.
dataset_dir2 = "D:/Data/Sweatshirts_youtube-2"
for split_name in splits:
    merge_options = dict(
        dataset_dir=dataset_dir2,
        dataset_type=fo.types.YOLOv5Dataset,
        split=split_name,
        tags=[split_name],
    )
    dataset.merge_dir(**merge_options)

 100% |███████████████| 1109/1109 [1.3s elapsed, 0s remaining, 879.7 samples/s]         
 100% |█████████████████| 164/164 [187.0ms elapsed, 0s remaining, 877.2 samples/s]     
 100% |█████████████████| 324/324 [352.2ms elapsed, 0s remaining, 920.0 samples/s]      


In [26]:
# Write the combined dataset back to disk, one YOLOv5 split per pass
export_dir = "D:/Data/sweatshirts_final"
for split_name in splits:
    # Samples were tagged with their split on import; select them by that tag
    has_split_tag = fo.ViewField("tags").contains(split_name)
    samples_in_split = dataset.match(has_split_tag)
    samples_in_split.export(
        dataset_type=fo.types.YOLOv5Dataset,
        export_dir=export_dir,
        split=split_name,
    )
    

Directory 'D:/Data/sweatshirts_final' already exists; export will be merged with existing files
 100% |███████████████| 3023/3023 [36.7s elapsed, 0s remaining, 146.8 samples/s]      
Directory 'D:/Data/sweatshirts_final' already exists; export will be merged with existing files
 100% |█████████████████| 459/459 [7.6s elapsed, 0s remaining, 89.1 samples/s]       
Directory 'D:/Data/sweatshirts_final' already exists; export will be merged with existing files
 100% |█████████████████| 922/922 [10.8s elapsed, 0s remaining, 148.0 samples/s]      


## 2-2. Load and Export Open Images V7

### 1) Train

In [1]:
import fiftyone as fo
import fiftyone.zoo as foz
import os


# Open Images classes to fetch; each class is requested separately below
classes = [
    "Bowl", "Backpack", "Laptop", "Oven",
    "Toaster", "Mouse", "Bottle", "Tennis racket",
    "Clock", "Tie", "Wine glass", "Spoon",
    "Scissors", "Toothbrush", "Refrigerator", "Couch",
    "Chair", "Umbrella", "Knife", "Suitcase",
    "Sunglasses",
]

# Custom download directory
dataset_dir = "D://Data"  # Replace with your desired path

# Point the FiftyOne dataset zoo at that directory
fo.config.dataset_zoo_dir = dataset_dir

# Unnamed dataset that collects the samples loaded for every class
accumulated_dataset = fo.Dataset()

for cls in classes:
    print(f"Loading class: {cls}")

    # Per-class dataset name: lower-cased class name with spaces turned into dashes
    class_slug = cls.lower().replace(" ", "-")
    dataset_name = f"open-images-v7-{class_slug}-train"

    # Request at most 5000 shuffled (seed 51) "train" samples for this class
    class_dataset = foz.load_zoo_dataset(
        "open-images-v7",
        split="train",
        label_types=["detections"],
        classes=[cls],
        max_samples=5000,
        shuffle=True,
        seed=51,
        dataset_name=dataset_name,
    )

    # NOTE(review): an image that contains several of the requested classes is
    # presumably added once per class here - confirm whether duplicates matter.
    accumulated_dataset.add_samples(class_dataset)

# # Save the accumulated dataset
# accumulated_dataset.persistent = True
# accumulated_dataset_name = "accumulated_openimages_dataset"
# accumulated_dataset.save(accumulated_dataset_name)
# print(f"Dataset '{accumulated_dataset_name}' saved.")

# At this point `accumulated_dataset` holds the samples of all classes

Loading class: Bowl
Downloading split 'train' to 'D://Data\open-images-v7\train' if necessary
Only found 1464 (<5000) samples matching your requirements
Necessary images already downloaded
Existing download of split 'train' is sufficient
Loading existing dataset 'open-images-v7-bowl-train'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use
 100% |███████████████| 1464/1464 [7.8s elapsed, 0s remaining, 194.6 samples/s]       
Loading class: Backpack
Downloading split 'train' to 'D://Data\open-images-v7\train' if necessary
Only found 715 (<5000) samples matching your requirements
Necessary images already downloaded
Existing download of split 'train' is sufficient
Loading existing dataset 'open-images-v7-backpack-train'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use
 100% |█████████████████| 715/715 [3.2s elapsed, 0s remaining, 231.6 samples/s]      
Loading class: Laptop
Downloading split 

In [2]:
# Restrict the `ground_truth` labels to those whose `label` is in `classes`
label_in_classes = fo.ViewField("label").is_in(classes)
view = accumulated_dataset.filter_labels("ground_truth", label_in_classes)

In [3]:
# Write the filtered view to disk as the "train" split in YOLOv5 format,
# passing the class list explicitly
train_export_options = dict(
    export_dir="D://Data//open-images-v7",  # Replace with your desired path
    dataset_type=fo.types.YOLOv5Dataset,
    split="train",
    classes=classes,
)
view.export(**train_export_options)

Directory 'D://Data//open-images-v7' already exists; export will be merged with existing files
 100% |█████████████| 44757/44757 [23.4m elapsed, 0s remaining, 28.5 samples/s]      


### 2) Validation

In [4]:
import fiftyone as fo
import fiftyone.zoo as foz
import os


# Open Images classes to fetch; each class is requested separately below
classes = [
    "Bowl", "Backpack", "Laptop", "Oven",
    "Toaster", "Mouse", "Bottle", "Tennis racket",
    "Clock", "Tie", "Wine glass", "Spoon",
    "Scissors", "Toothbrush", "Refrigerator", "Couch",
    "Chair", "Umbrella", "Knife", "Suitcase",
    "Sunglasses",
]

# Custom download directory
dataset_dir = "D://Data"  # Replace with your desired path

# Point the FiftyOne dataset zoo at that directory
fo.config.dataset_zoo_dir = dataset_dir

# Unnamed dataset that collects the samples loaded for every class
accumulated_dataset = fo.Dataset()

for cls in classes:
    print(f"Loading class: {cls}")

    # Per-class dataset name: lower-cased class name with spaces turned into dashes
    class_slug = cls.lower().replace(" ", "-")
    dataset_name = f"open-images-v7-{class_slug}-validation"

    # Request at most 922 shuffled (seed 51) "validation" samples for this class.
    # NOTE(review): 922 equals the size of the sweatshirt "val" split exported
    # earlier in this notebook - presumably chosen on purpose; confirm.
    class_dataset = foz.load_zoo_dataset(
        "open-images-v7",
        split="validation",
        label_types=["detections"],
        classes=[cls],
        max_samples=922,
        shuffle=True,
        seed=51,
        dataset_name=dataset_name,
    )

    # Append this class's samples to the combined dataset
    accumulated_dataset.add_samples(class_dataset)

Loading class: Bowl
Downloading split 'validation' to 'D://Data\open-images-v7\validation' if necessary
Downloading 'https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv' to 'D://Data\open-images-v7\validation\metadata\image_ids.csv'
Downloading 'https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv' to 'D://Data\open-images-v7\validation\metadata\classes.csv'
Downloading 'https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy.json' to 'C:\Users\jhk16\AppData\Local\Temp\tmpya_fd5lx\metadata\hierarchy.json'
Downloading 'https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv' to 'D://Data\open-images-v7\validation\labels\detections.csv'
Only found 49 (<922) samples matching your requirements
Downloading 49 images
 100% |█████████████████████| 49/49 [7.1s elapsed, 0s remaining, 6.8 files/s]       
Dataset info written to 'D://Data\open-images-v7\info.json'
Loading existing dataset 

In [5]:
# Store the class list on the dataset, then restrict the `ground_truth`
# labels to those whose `label` is in `classes`
accumulated_dataset.default_classes = classes
label_in_classes = fo.ViewField("label").is_in(classes)
view = accumulated_dataset.filter_labels("ground_truth", label_in_classes)

In [6]:
# Write the filtered view to disk as the "val" split in YOLOv5 format,
# passing the class list explicitly
val_export_options = dict(
    export_dir="D://Data//open-images-v7",  # Replace with your desired path
    dataset_type=fo.types.YOLOv5Dataset,
    split="val",
    classes=classes,
)
view.export(**val_export_options)

Directory 'D://Data//open-images-v7' already exists; export will be merged with existing files
 100% |███████████████| 1336/1336 [12.8s elapsed, 0s remaining, 153.7 samples/s]      


### 3) Test

In [7]:
import fiftyone as fo
import fiftyone.zoo as foz
import os


# Open Images classes to fetch; each class is requested separately below
classes = [
    "Bowl", "Backpack", "Laptop", "Oven",
    "Toaster", "Mouse", "Bottle", "Tennis racket",
    "Clock", "Tie", "Wine glass", "Spoon",
    "Scissors", "Toothbrush", "Refrigerator", "Couch",
    "Chair", "Umbrella", "Knife", "Suitcase",
    "Sunglasses",
]

# Custom download directory
dataset_dir = "D://Data"  # Replace with your desired path

# Point the FiftyOne dataset zoo at that directory
fo.config.dataset_zoo_dir = dataset_dir

# Unnamed dataset that collects the samples loaded for every class
accumulated_dataset = fo.Dataset()

for cls in classes:
    print(f"Loading class: {cls}")

    # Per-class dataset name: lower-cased class name with spaces turned into dashes
    class_slug = cls.lower().replace(" ", "-")
    dataset_name = f"open-images-v7-{class_slug}-test"

    # Request at most 459 shuffled (seed 51) "test" samples for this class.
    # NOTE(review): 459 equals the size of the sweatshirt "test" split exported
    # earlier in this notebook - presumably chosen on purpose; confirm.
    class_dataset = foz.load_zoo_dataset(
        "open-images-v7",
        split="test",
        label_types=["detections"],
        classes=[cls],
        max_samples=459,
        shuffle=True,
        seed=51,
        dataset_name=dataset_name,
    )

    # Append this class's samples to the combined dataset
    accumulated_dataset.add_samples(class_dataset)

Loading class: Bowl
Downloading split 'test' to 'D://Data\open-images-v7\test' if necessary
Downloading 'https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv' to 'D://Data\open-images-v7\test\metadata\image_ids.csv'
Downloading 'https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv' to 'D://Data\open-images-v7\test\metadata\classes.csv'
Downloading 'https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy.json' to 'C:\Users\jhk16\AppData\Local\Temp\tmpkwtp17m7\metadata\hierarchy.json'
Downloading 'https://storage.googleapis.com/openimages/v5/test-annotations-bbox.csv' to 'D://Data\open-images-v7\test\labels\detections.csv'
Only found 147 (<459) samples matching your requirements
Downloading 147 images
 100% |███████████████████| 147/147 [6.7s elapsed, 0s remaining, 44.0 files/s]      
Dataset info written to 'D://Data\open-images-v7\info.json'
Loading 'open-images-v7' split 'test'
 100% |█████████████████| 147/147

In [8]:
# Store the class list on the dataset, then restrict the `ground_truth`
# labels to those whose `label` is in `classes`
accumulated_dataset.default_classes = classes
label_in_classes = fo.ViewField("label").is_in(classes)
view = accumulated_dataset.filter_labels("ground_truth", label_in_classes)

In [9]:
# Write the filtered view to disk as the "test" split in YOLOv5 format,
# passing the class list explicitly
test_export_options = dict(
    export_dir="D://Data//open-images-v7",  # Replace with your desired path
    dataset_type=fo.types.YOLOv5Dataset,
    split="test",
    classes=classes,
)
view.export(**test_export_options)

Directory 'D://Data//open-images-v7' already exists; export will be merged with existing files
 100% |███████████████| 3538/3538 [26.2s elapsed, 0s remaining, 59.6 samples/s]       


## 2-3. Open Images V7 + Custom Dataset

In [10]:
import fiftyone as fo


# Initialize the main dataset that Open Images and the custom data are loaded into.
# overwrite=True replaces a dataset of the same name left over from an earlier
# run, so this cell can be re-executed without raising "already exists" and
# without having to rename the dataset on every attempt.
name = "OpenImages+Custom-1sttry"
dataset = fo.Dataset(name, overwrite=True)

# Import every split of the Open Images export. Each sample is tagged with the
# split it was loaded from so the splits can be selected again by tag later.
# Raw string: "\D" and "\o" are invalid escape sequences in a normal string
# literal (a warning today, an error in future Python versions). The resulting
# path value is unchanged.
dataset_dir = r"D:\Data\open-images-v7"
splits = ["train", "test", "val"]
for split in splits:
    dataset.add_dir(
        dataset_dir=dataset_dir,
        dataset_type=fo.types.YOLOv5Dataset,
        split=split,
        tags=[split],
    )

 100% |█████████████| 42679/42679 [2.7m elapsed, 0s remaining, 253.9 samples/s]      
 100% |███████████████| 3380/3380 [11.0s elapsed, 0s remaining, 309.5 samples/s]      
 100% |███████████████| 1279/1279 [4.2s elapsed, 0s remaining, 303.8 samples/s]      


In [11]:
# Merge the sweatshirt export into the same `dataset` object, one split at a
# time, tagging every sample with the split it was loaded from.
dataset_dir2 = "D://Data//sweatshirts_final"
for split_name in splits:
    merge_options = dict(
        dataset_dir=dataset_dir2,
        dataset_type=fo.types.YOLOv5Dataset,
        split=split_name,
        tags=[split_name],
    )
    dataset.merge_dir(**merge_options)

 100% |███████████████| 3023/3023 [11.3s elapsed, 0s remaining, 279.2 samples/s]      
 100% |█████████████████| 459/459 [1.7s elapsed, 0s remaining, 265.9 samples/s]         
 100% |█████████████████| 922/922 [3.4s elapsed, 0s remaining, 283.8 samples/s]      


In [14]:
# Class list for the merged export: the 21 Open Images classes followed by
# the custom "sweatshirts" class
new_classes = [
    "Bowl", "Backpack", "Laptop", "Oven",
    "Toaster", "Mouse", "Bottle", "Tennis racket",
    "Clock", "Tie", "Wine glass", "Spoon",
    "Scissors", "Toothbrush", "Refrigerator", "Couch",
    "Chair", "Umbrella", "Knife", "Suitcase",
    "Sunglasses", "sweatshirts",
]

# Write the merged dataset to disk, one YOLOv5 split per pass
export_dir = "D://Data//merged_final_new"
for split_name in splits:
    # Samples were tagged with their split on import; select them by that tag
    has_split_tag = fo.ViewField("tags").contains(split_name)
    samples_in_split = dataset.match(has_split_tag)
    samples_in_split.export(
        dataset_type=fo.types.YOLOv5Dataset,
        export_dir=export_dir,
        split=split_name,
        classes=new_classes,
    )
    

Directory 'D://Data//merged_final_new' already exists; export will be merged with existing files
 100% |█████████████| 45702/45702 [38.0m elapsed, 0s remaining, 46.0 samples/s]      
Directory 'D://Data//merged_final_new' already exists; export will be merged with existing files
 100% |███████████████| 3839/3839 [2.5m elapsed, 0s remaining, 63.6 samples/s]      
Directory 'D://Data//merged_final_new' already exists; export will be merged with existing files
 100% |███████████████| 2201/2201 [1.1m elapsed, 0s remaining, 72.9 samples/s]      


In [1]:
# Prints a plain "end" marker; presumably used to confirm the run reached this cell
print("end")

end
