## Connect to Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Dataset directory on the mounted drive (the image/label archives are read from here)
gdrive_dir = "/content/drive/MyDrive/Project_CCTV_CNN/DATASET"

Mounted at /content/drive


## Import and extract the files

In [2]:
import requests
import zipfile
from pathlib import Path
import shutil
import os

# Source archives on the mounted Google Drive
image_drive_path = gdrive_dir + "/image.zip"
label_drive_path = gdrive_dir + "/label.zip"

# Extraction targets on the Colab disk (both archives unpack into the same folder)
image_dir = "/content/files"
label_dir = "/content/files"

# Local copies of the archives on the Colab disk
colab_zip_image_path = "/content/image.zip"
colab_zip_label_path = "/content/label.zip"

# Prepare the data folders
os.makedirs(image_dir, exist_ok=True)
os.makedirs(label_dir, exist_ok=True)


def _fetch_and_extract(drive_zip_path, colab_zip_path, extract_dir, name):
  """Copy one archive from Drive to the Colab disk and unpack it.

  The local archive doubles as the "already done" marker: when it exists the
  step is skipped. To keep that marker trustworthy, the local archive is
  removed again if the copy or the extraction does not finish, so re-running
  the cell retries instead of silently skipping a half-finished extraction.

  Args:
  - drive_zip_path (str): Archive location on the mounted Google Drive.
  - colab_zip_path (str): Where to place the local copy of the archive.
  - extract_dir (str): Directory the archive is extracted into.
  - name (str): Short label used in the progress messages.
  """
  if os.path.exists(colab_zip_path):
    print(f"File {name} already exist")
    return

  print(f"Downloading the {name} file")
  finished = False
  try:
    shutil.copy(drive_zip_path, colab_zip_path)
    with zipfile.ZipFile(colab_zip_path, "r") as zip_ref:
      print(f"Unzipping {name}")
      zip_ref.extractall(extract_dir)
    finished = True
  finally:
    # Runs on errors and on a manual cell interrupt alike
    if not finished and os.path.exists(colab_zip_path):
      os.remove(colab_zip_path)


# Images first, then labels
_fetch_and_extract(image_drive_path, colab_zip_image_path, image_dir, "image")
_fetch_and_extract(label_drive_path, colab_zip_label_path, label_dir, "label")

Downloading the image file
Unzipping image
Downloading the label file
Unzipping label


### Delete labels that are not in YOLO format and images with no label

In [22]:
# Point at the image and label folders, then report how many entries
# each one holds (images first, labels second).
image_dir = '/content/files/image'
label_dir = '/content/files/label'
for counted_dir in (image_dir, label_dir):
    print(len(os.listdir(counted_dir)))

407
407


In [23]:
import os
import numpy as np

def check_yolo_format(file_path):
    """
    Check if a YOLO format label file is correct.

    Every row must hold exactly five numbers,
    ``class_id x_center y_center width height``, where ``class_id`` is a
    non-negative whole number and the four box values lie in the interval
    (0, 1]. Fields may be separated by any run of whitespace. A file that
    cannot be parsed as a numeric table is reported as invalid.

    Args:
    - file_path (str): Path to the label file.

    Returns:
    - bool: True if all labels in the file are in YOLO format, False otherwise.
    """
    try:
        # Default delimiter is "any whitespace", so tabs and repeated spaces
        # are accepted the same way as single spaces.
        data = np.loadtxt(file_path, ndmin=2)

        # Check if each label in the file follows the YOLO format
        for line in data:
            if len(line) != 5:
                print(f"Error: Line {line} in {file_path} does not have 5 elements.")
                return False

            class_id, x_center, y_center, width, height = line

            # loadtxt parses every field as a float, so a class id such as
            # 1.5 or -1 has to be rejected explicitly.
            if class_id < 0 or not float(class_id).is_integer():
                print(f"Error: Line {line} in {file_path} has an invalid class id {class_id}.")
                return False

            # NaN fails these comparisons as well, so it is rejected here too.
            if not all(0 < value <= 1 for value in (x_center, y_center, width, height)):
                print(f"Error: Line {line} in {file_path} has values out of range (0, 1].")
                return False

        return True

    except Exception as e:
        # Covers unreadable files, non-numeric fields and rows of unequal length
        print(f"Error processing file {file_path}: {e}")
        return False

def check_all_labels(img_dir, label_dir):
    """
    Check all label files in the label directory.

    For every ``.jpg`` file in ``img_dir`` the label with the same base name
    and a ``.txt`` extension is looked up in ``label_dir``. An image without a
    label file is deleted; when the label fails ``check_yolo_format`` both the
    label and the image are deleted. Files are removed with ``os.remove``.

    Args:
    - img_dir (str): Directory containing image files.
    - label_dir (str): Directory containing label files.
    """
    image_files = [f for f in os.listdir(img_dir) if f.endswith('.jpg')]
    for img_file in image_files:
        # Swap only the extension; str.replace would also rewrite a '.jpg'
        # that appears earlier in the file name.
        label_file = os.path.splitext(img_file)[0] + '.txt'
        img_path = os.path.join(img_dir, img_file)
        label_path = os.path.join(label_dir, label_file)

        if not os.path.exists(label_path):
            print(f"Label file {label_file} does not exist.")
            os.remove(img_path)
            print(f"Image file {img_file} deleted.")
            continue

        if check_yolo_format(label_path):
            continue

        print(f"{label_file} is NOT in YOLO format. Deleting the file...")
        os.remove(label_path)  # Remove the invalid label file
        os.remove(img_path)    # Remove the matching image file
        print(f"Image file {img_file} deleted.")

# Run the cleanup on the image and label folders.
# Note: check_all_labels removes files from these directories with os.remove.
image_dir = '/content/files/image'
label_dir = '/content/files/label'
check_all_labels(image_dir, label_dir)


In [28]:
# Count the entries again: image folder first, label folder second
for counted_dir in (image_dir, label_dir):
    print(len(os.listdir(counted_dir)))

407
407


## Split the data into train and test sets on Google Drive

In [29]:
from sklearn.model_selection import train_test_split
import os
import shutil

# Source directories
image_dir = "/content/files/image"
label_dir = "/content/files/label"

# Destination directories on Google Drive
# NOTE: gdrive_dir is re-pointed to the split folder from here on.
gdrive_dir = "/content/drive/MyDrive/dataset5/dataset_split/"
train_image_dir = os.path.join(gdrive_dir, "train/images")
train_label_dir = os.path.join(gdrive_dir, "train/labels")
test_image_dir = os.path.join(gdrive_dir, "test/images")
test_label_dir = os.path.join(gdrive_dir, "test/labels")

# Create the destination directories if they do not exist yet.
# Files already present there from an earlier run are left in place.
for target_dir in (train_image_dir, train_label_dir, test_image_dir, test_label_dir):
    os.makedirs(target_dir, exist_ok=True)

# Collect the image and label file names. os.listdir returns them in
# arbitrary order, so sort to make the seeded split below repeatable.
image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))
label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.txt'))


def _label_name(image_name):
    """Return the label file name that belongs to an image file name."""
    # Swap only the extension; str.replace would also rewrite a '.jpg'
    # that appears earlier in the file name.
    return os.path.splitext(image_name)[0] + '.txt'


# Fail before anything is copied if an image has no label, instead of
# stopping half-way through the copy with a partially written split.
available_labels = set(label_files)
unlabeled_images = [name for name in image_files if _label_name(name) not in available_labels]
if unlabeled_images:
    raise FileNotFoundError(
        f"{len(unlabeled_images)} image(s) have no label file in {label_dir}, "
        f"e.g. {unlabeled_images[0]}"
    )

# Split into train and test (80% train, 20% test)
train_images, test_images = train_test_split(image_files, test_size=0.2, random_state=42)


def _copy_pairs(image_names, dest_image_dir, dest_label_dir):
    """Copy each image and its label file into the given destination directories."""
    for image_name in image_names:
        label_name = _label_name(image_name)
        shutil.copy(os.path.join(image_dir, image_name), os.path.join(dest_image_dir, image_name))
        shutil.copy(os.path.join(label_dir, label_name), os.path.join(dest_label_dir, label_name))


# Copy the files into the train and test directories on Google Drive
_copy_pairs(train_images, train_image_dir, train_label_dir)
_copy_pairs(test_images, test_image_dir, test_label_dir)

print(f"Dataset split complete. Train: {len(train_images)}, Test: {len(test_images)}")

Dataset split complete. Train: 325, Test: 82


In [30]:
# Report the entry count of each split folder, in this order:
# train images, train labels, test images, test labels.
for counted_dir in (train_image_dir, train_label_dir, test_image_dir, test_label_dir):
    print(len(os.listdir(counted_dir)))

325
325
82
82


### Searching for the right anchor box

#### Extract bounding box sizes

In [26]:
import numpy as np
import glob

def extract_box_sizes_from_directory(label_directory):
    """Collect the (width, height) of every box in a directory of label files.

    Reads each ``*.txt`` file directly inside ``label_directory`` and takes
    the 4th and 5th whitespace-separated fields of every non-blank line as
    the box width and height. Blank lines are skipped; a non-blank line with
    fewer than five fields or non-numeric values still raises.

    Args:
    - label_directory (str): Directory containing the label files.

    Returns:
    - np.ndarray: Array of shape (n_boxes, 2) with one ``[width, height]``
      row per box; shape (0, 2) when no boxes are found.
    """
    sizes = []
    # Sorted so the row order does not depend on the filesystem listing order
    label_files = sorted(glob.glob(f"{label_directory}/*.txt"))

    for file in label_files:
        with open(file, 'r') as f:
            for line in f:
                parts = line.split()
                if not parts:
                    # Blank line (e.g. an extra newline at the end of the file)
                    continue
                sizes.append((float(parts[3]), float(parts[4])))

    # reshape keeps the result two-dimensional even when no boxes were read
    return np.array(sizes, dtype=float).reshape(-1, 2)

# Replace with the path to your label directory
label_directory = '/content/files/label'
sizes = extract_box_sizes_from_directory(label_directory)

# Find anchor boxes with k-means clustering

#### K-means clustering

In [5]:
from sklearn.cluster import KMeans

def compute_kmeans_anchors(sizes, k):
    """Cluster box sizes with k-means and return the cluster centres as anchors.

    Args:
    - sizes (array-like): Two-dimensional array of box sizes, one row per box.
    - k (int): Number of anchor boxes (clusters) to compute.

    Returns:
    - np.ndarray: The fitted cluster centres, one row per cluster.

    Raises:
    - ValueError: If ``sizes`` is not two-dimensional or has fewer rows than ``k``.
    """
    sizes = np.asarray(sizes, dtype=float)
    if sizes.ndim != 2 or len(sizes) < k:
        raise ValueError(
            f"Need at least {k} box sizes as a 2-D array to compute {k} anchors, "
            f"got an array of shape {sizes.shape}."
        )

    # n_init is pinned because scikit-learn's default changed between releases,
    # which would otherwise make the anchors depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(sizes)
    return kmeans.cluster_centers_

# Number of anchor boxes to compute
k = 9
anchors = compute_kmeans_anchors(sizes, k)

# Show the result, one anchor per line
print("Anchor boxes:")
for anchor in anchors:
    anchor_w, anchor_h = anchor[0], anchor[1]
    print(f"Width: {anchor_w:.4f}, Height: {anchor_h:.4f}")

NameError: name 'sizes' is not defined

In [None]:
# Rank the anchors by area (width * height), largest first
anchors_sorted = list(anchors)
anchors_sorted.sort(key=lambda wh: wh[0] * wh[1], reverse=True)
# Render each anchor as "width,height" with four decimals for YOLO
formatted_anchors = [f"{w:.4f},{h:.4f}" for (w, h) in anchors_sorted]
for anchor in formatted_anchors:
    print(anchor)