<a href="https://colab.research.google.com/github/dgizdevans/master/blob/main/ai_project/data_sorter_for_unlabeled_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
from google.colab import auth
from google.cloud import storage

In [None]:
# Settings that identify the Google Cloud project and the bucket to work on
project_id = "ai-group-project"  # Replace with your Google Cloud project ID
bucket_name = "ai-group-project-data"  # Replace with your bucket name

# Sign the Colab user in before any client is created, then open the bucket
auth.authenticate_user()
client = storage.Client(project=project_id)
bucket = client.get_bucket(bucket_name)

In [None]:
# Bucket prefixes used by the rest of the notebook.

# Prefix in the bucket under which the raw images are stored
source_path = "data/raw/images/"

# Prefix in the bucket that receives the distributed test sets
target_path = "datasets/unlabeled_data/"

In [None]:
# How many test sets the images are divided into
num_test_sets = 5

In [None]:
# Get the list of images
# File extensions (lower case) that mark an object as an image
image_extensions = ('.jpg', '.jpeg', '.png')
blobs_images = list(bucket.list_blobs(prefix=source_path))
# Compare against the lower-cased name so that objects such as "photo.JPG"
# or "scan.PNG" are not silently left out of the distribution.
images = [
    blob.name
    for blob in blobs_images
    if blob.name.lower().endswith(image_extensions)
]

In [None]:
# Shuffle images
# Fixed seed so that the assignment of images to test sets is reproducible.
shuffle_seed = 42
# Sort first: the shuffle below works in place, so without resetting the order
# a second run of this cell would shuffle an already shuffled list and produce
# a different split.
images.sort()
# A dedicated Random instance avoids touching the global random state.
random.Random(shuffle_seed).shuffle(images)

In [None]:
# Cut the image list into num_test_sets consecutive chunks of equal length
split_size, leftover = divmod(len(images), num_test_sets)
test_sets = []
for set_idx in range(num_test_sets):
    start = set_idx * split_size
    test_sets.append(images[start:start + split_size])
# Images that do not fill a whole chunk are appended to the last set
if leftover:
    test_sets[-1].extend(images[num_test_sets * split_size:])

In [None]:
# Helper function to copy files in GCS
def copy_blob(bucket, source_blob_name, destination_blob_name):
    """Copy one object to a new name inside the same bucket.

    Args:
        bucket: Bucket object providing ``blob(name)``; both the source and
            the destination blob handles are created from it.
        source_blob_name: Name of the existing object to copy.
        destination_blob_name: Name of the object to create or overwrite.

    Returns:
        None.
    """
    source_blob = bucket.blob(source_blob_name)
    destination_blob = bucket.blob(destination_blob_name)
    # rewrite() returns (token, bytes_rewritten, total_bytes). A non-None
    # token means the copy is not finished yet and the call has to be
    # repeated with that token; a single call is only enough when the first
    # response already reports completion.
    token = None
    while True:
        token, _bytes_rewritten, _total_bytes = destination_blob.rewrite(
            source_blob, token=token
        )
        if token is None:
            break

In [None]:
# Distribute the files
# stats maps each test set name to the number of images assigned to it
stats = {}
for i, test_set in enumerate(test_sets, start=1):
    test_set_folder = f"{target_path}test_set_{i}/"
    stats[f"test_set_{i}"] = len(test_set)
    for image_path in test_set:
        # Define target path for the image.
        # GCS object names always use "/" as separator, so the name is split
        # and joined explicitly instead of through os.path, whose rules
        # depend on the operating system running the notebook.
        # NOTE: only the file name is kept, so two images with the same file
        # name in different source sub-folders map to the same target object.
        file_name = image_path.rsplit("/", 1)[-1]
        target_image_path = f"{test_set_folder}{file_name}"
        copy_blob(bucket, image_path, target_image_path)

In [None]:
# Display statistics
print("\nData distribution statistics:")
for set_name, count in stats.items():
    print(f"{set_name}: {count} images")

# Report the number of sets that were actually filled instead of a hard-coded
# "5", so the message stays correct when the number of test sets is changed.
print(f"\nUnlabeled data has been successfully distributed into {len(stats)} test sets.")


Data distribution statistics:
test_set_1: 840 images
test_set_2: 840 images
test_set_3: 840 images
test_set_4: 840 images
test_set_5: 843 images

Unlabeled data has been successfully distributed into 5 test sets.
