In [None]:
import midv500
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os
import cv2

In [None]:
# Locations and names for the MIDV-500 data.
# NOTE(review): none of these three names is referenced by the cells visible in
# this notebook — presumably they are meant for the `midv500` package helpers;
# confirm they are still needed.
dataset_dir = './midv500_data/'  # root data folder; later cells read its 'midv500' sub-folder
export_dir = "midv500"
filename = 'midv500'

### Exploratory Data Analysis (EDA)

In [None]:
import os

def count_images_per_type(dataset_path):
    """Count the files in every document-type folder of the dataset.

    Expected layout: ``<dataset_path>/<label_folder>/images/<doc_type>/<file>``.

    Parameters:
    - dataset_path (str): Directory that contains the labeled folders.

    Returns:
    - dict: ``{label_folder: {doc_type: number_of_files}}``. A labeled folder
      only appears when its ``images`` entry holds at least one sub-directory.
      Every regular file is counted, whatever its extension.
    """
    counts_by_label = {}

    for label_name in os.listdir(dataset_path):
        label_dir = os.path.join(dataset_path, label_name)
        images_root = os.path.join(label_dir, 'images')
        # Skip stray files and labeled folders that have no 'images' entry.
        if not (os.path.isdir(label_dir) and os.path.exists(images_root)):
            continue

        for type_name in os.listdir(images_root):
            type_dir = os.path.join(images_root, type_name)
            if os.path.isdir(type_dir):
                # Sub-directories inside a document-type folder are not counted.
                n_files = sum(
                    1
                    for entry in os.listdir(type_dir)
                    if os.path.isfile(os.path.join(type_dir, entry))
                )
                counts_by_label.setdefault(label_name, {})[type_name] = n_files

    return counts_by_label

# Example usage
# Built with os.path.join: the hand-written Windows literal relied on invalid
# '\m' escape sequences and only resolved on Windows.
dataset_path = os.path.join('.', 'midv500_data', 'midv500')
document_counts = count_images_per_type(dataset_path)

# Summarise the counts (nothing is printed here): each labeled folder keeps its
# per-document-type counts plus its own 'total_count', and the top level gets
# the grand 'total_count'.
image_count_dict = {}
total_count = 0
for label_folder, doc_counts in document_counts.items():
    label_folder_count = sum(doc_counts.values())
    image_count_dict[label_folder] = {**doc_counts, 'total_count': label_folder_count}
    total_count += label_folder_count
image_count_dict['total_count'] = total_count

In [None]:
# `json` is imported here because no earlier cell imports it; without this the
# cell raises NameError on a fresh kernel (Restart Kernel -> Run All).
import json

# Persist the count summary as pretty-printed JSON in the working directory.
image_count_dict_json_structure = json.dumps(image_count_dict, indent=4)
with open(os.path.join('.', 'image_count.json'), 'w') as f:
    f.write(image_count_dict_json_structure)

In [None]:
import numpy as np

def image_resolution_stats(dataset_path, extensions=None):
    """Compute the mean, minimum and maximum image resolution of a dataset.

    Walks ``<dataset_path>/<label_folder>/images/<doc_type>/`` and reads each
    file with ``plt.imread`` to obtain the first two entries of its shape
    (rows, columns — i.e. height, width).

    Parameters:
    - dataset_path (str): Directory that contains the labeled folders.
    - extensions (str | iterable of str | None): Optional file-name suffix(es),
      e.g. ``'.tif'``. When given, only files whose lower-cased name ends with
      one of them are read, which allows skipping label/text files stored next
      to the images. ``None`` (default) reads every regular file.

    Returns:
    - tuple: ``(mean_resolution, min_resolution, max_resolution)``, each a
      length-2 NumPy array computed column-wise over all images.

    Raises:
    - ValueError: if no image is found, instead of NumPy's warning followed by
      an opaque "zero-size array" error.
    """
    if extensions is None:
        suffixes = None
    elif isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    resolutions = []

    # Traverse through each labeled folder (e.g., '01_alb_id', '02_bra_passport', ...)
    for label_folder in os.listdir(dataset_path):
        label_folder_path = os.path.join(dataset_path, label_folder)
        if not os.path.isdir(label_folder_path):
            continue

        # Traverse through each document type folder (e.g., 'images/CA', 'images/CS', ...)
        images_folder = os.path.join(label_folder_path, 'images')
        if not os.path.exists(images_folder):
            continue

        for doc_type_folder in os.listdir(images_folder):
            doc_type_folder_path = os.path.join(images_folder, doc_type_folder)
            if not os.path.isdir(doc_type_folder_path):
                continue

            for image_file in os.listdir(doc_type_folder_path):
                if suffixes is not None and not image_file.lower().endswith(suffixes):
                    continue
                image_path = os.path.join(doc_type_folder_path, image_file)
                if os.path.isfile(image_path):
                    # The whole image is decoded only to read its dimensions.
                    image = plt.imread(image_path)
                    resolutions.append(image.shape[:2])

    if not resolutions:
        raise ValueError(f"No images found under '{dataset_path}'")

    resolutions = np.array(resolutions)
    mean_resolution = np.mean(resolutions, axis=0)
    min_resolution = np.min(resolutions, axis=0)
    max_resolution = np.max(resolutions, axis=0)

    return mean_resolution, min_resolution, max_resolution

# Gather the resolution summary for the whole dataset
mean_res, min_res, max_res = image_resolution_stats(dataset_path)

# Report the header and the three statistics, one per line
print(
    "Image Resolution Statistics:",
    f"  Mean Image Resolution: {mean_res}",
    f"  Minimum Image Resolution: {min_res}",
    f"  Maximum Image Resolution: {max_res}",
    sep="\n",
)

In [None]:
import os
import json

def convert_annotation(json_file, txt_file, image_width, image_height):
    """Convert one quadrilateral annotation into a single YOLO label line.

    Reads the first four ``[x, y]`` vertices stored under the ``'quad'`` key of
    ``json_file``, takes their axis-aligned bounding box and writes
    ``"<class_id> <x_center> <y_center> <width> <height>"`` to ``txt_file``,
    with x values divided by ``image_width`` and y values by ``image_height``.

    The center is the middle of the bounding box (not the mean of the four
    vertices), so that ``center +/- size / 2`` always encloses the quad.
    Vertices are not clipped to the image: a quad that extends past the frame
    yields values outside ``[0, 1]``.

    Parameters:
    - json_file (str): Path of the annotation JSON to read.
    - txt_file (str): Path of the label file to write (overwritten).
    - image_width (int | float): Image width in pixels.
    - image_height (int | float): Image height in pixels.

    Raises:
    - KeyError: if the JSON has no ``'quad'`` entry.
    - IndexError / ValueError: if ``'quad'`` has fewer than four vertices or a
      vertex is not a pair.
    """
    with open(json_file, 'r') as f:
        data = json.load(f)

    # Parse and compute everything before opening the output, so a malformed
    # annotation does not leave an empty label file behind.
    quad = data['quad']
    corners = [quad[i] for i in range(4)]
    xs = [x for x, _ in corners]
    ys = [y for _, y in corners]

    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)

    x_center = (x_min + x_max) / 2.0 / image_width
    y_center = (y_min + y_max) / 2.0 / image_height
    width = (x_max - x_min) / image_width
    height = (y_max - y_min) / image_height

    class_id = 0  # Assuming single class, update if you have multiple classes
    with open(txt_file, 'w') as f:
        f.write(f"{class_id} {x_center} {y_center} {width} {height}\n")

def process_dataset(dataset_path):
    """Write a YOLO ``.txt`` label next to every ``.tif`` image of the dataset.

    For each ``<dataset_path>/<label>/images/<doc_type>/<name>.tif`` the
    annotation ``<label>/ground_truth/<doc_type>/<name>.json`` is converted
    with ``convert_annotation`` and written to
    ``<label>/images/<doc_type>/<name>.txt``.

    Labeled folders lacking either ``images`` or ``ground_truth``, and document
    types lacking a matching ground-truth folder, are skipped.

    Parameters:
    - dataset_path (str): Directory that contains the labeled folders.
    """
    for label_folder in os.listdir(dataset_path):
        label_folder_path = os.path.join(dataset_path, label_folder)
        if not os.path.isdir(label_folder_path):
            continue

        images_folder = os.path.join(label_folder_path, 'images')
        groundtruth_folder = os.path.join(label_folder_path, 'ground_truth')
        if not os.path.exists(images_folder) or not os.path.exists(groundtruth_folder):
            continue

        for doc_type_folder in os.listdir(images_folder):
            doc_type_folder_path = os.path.join(images_folder, doc_type_folder)
            gt_doc_type_folder_path = os.path.join(groundtruth_folder, doc_type_folder)
            if not os.path.isdir(doc_type_folder_path) or not os.path.isdir(gt_doc_type_folder_path):
                continue

            for image_file in os.listdir(doc_type_folder_path):
                if not image_file.endswith('.tif'):
                    continue

                # Swap only the extension; str.replace would also rewrite a
                # '.tif' occurring elsewhere in the name.
                stem = os.path.splitext(image_file)[0]
                image_path = os.path.join(doc_type_folder_path, image_file)
                json_file = os.path.join(gt_doc_type_folder_path, stem + '.json')
                # The label goes beside the image. It is built from the images
                # folder directly rather than by substituting 'ground_truth' in
                # the path string, which would also rewrite any other
                # occurrence of that text (e.g. in dataset_path).
                txt_file = os.path.join(doc_type_folder_path, stem + '.txt')

                # Load image to get dimensions
                image = plt.imread(image_path)
                image_height, image_width = image.shape[:2]

                # Convert annotation
                convert_annotation(json_file, txt_file, image_width, image_height)

# os.path.join avoids the invalid '\m' escape sequences of the hand-written
# Windows literal and resolves to the same folder on any platform.
dataset_path = os.path.join('.', 'midv500_data', 'midv500')
process_dataset(dataset_path)

In [None]:
import os
import shutil

# Define paths
current_dir = './'  # Update with your current directory path
data_dir = os.path.join(current_dir, 'data')
images_dir = os.path.join(data_dir, 'images')
labels_dir = os.path.join(data_dir, 'labels')
# Raw string: same value as before, but the backslashes are no longer parsed as
# (invalid) escape sequences, which newer Python versions warn about.
midv500_dir = r'D:\playground\mdfv500\midv500_data\midv500'  # Update with your actual path to midv500

# Ensure data directory exists, create images and labels directories
os.makedirs(images_dir, exist_ok=True)
os.makedirs(labels_dir, exist_ok=True)

# Function to copy files from source to destination
def copy_files(source_dir, dest_dir):
    """Copy every file found under ``source_dir`` directly into ``dest_dir``.

    The source tree is walked recursively and flattened: sub-folder structure
    is not reproduced, so files sharing a name overwrite one another (the last
    one walked wins). A ``source_dir`` that does not exist yields no files and
    is skipped silently. One line is printed per copied file.

    Parameters:
    - source_dir (str): Directory tree to copy from.
    - dest_dir (str): Directory to copy into; created when the first file is
      copied.
    """
    for root, _dirs, files in os.walk(source_dir):
        for file in files:
            src_file = os.path.join(root, file)
            # os.path.join instead of a hard-coded '\\' so the destination is a
            # real path inside dest_dir on every platform.
            dest_file = os.path.join(dest_dir, file)
            dest_parent = os.path.dirname(dest_file)
            if dest_parent:
                os.makedirs(dest_parent, exist_ok=True)
            shutil.copyfile(src_file, dest_file)
            print(f'Copied {src_file} to {dest_file}')

# # Copy images from midv500/images to data/images
# copy_files(os.path.join(midv500_dir, '01_alb_id/images'), images_dir)
# copy_files(os.path.join(midv500_dir, '02_bra_passport/images'), images_dir)
# # Add more lines for other folders as needed

# # Copy labels from midv500/ground_truth to data/labels
# copy_files(os.path.join(midv500_dir, '01_alb_id/labels'), labels_dir)
# copy_files(os.path.join(midv500_dir, '02_bra_passport/labels'), labels_dir)
# Add more lines for other folders as needed


# Copy the 'images' and 'labels' sub-folders of every first-level document
# folder into the flat data/images and data/labels directories.
# next(os.walk(...)) reads only the first level instead of walking the whole
# tree, and the default keeps this a no-op when midv500_dir does not exist.
# NOTE(review): earlier cells read annotations from 'ground_truth' and write the
# .txt labels into the 'images' folders; a missing 'labels' folder is skipped
# silently here — confirm the labels really end up in labels_dir.
document_dirs = next(os.walk(midv500_dir), (midv500_dir, [], []))[1]
for document_dir in document_dirs:
    copy_files(os.path.join(midv500_dir, document_dir, 'images'), images_dir)
    copy_files(os.path.join(midv500_dir, document_dir, 'labels'), labels_dir)

In [None]:
import os

def check_dataset(train_file, val_file):
    """Report which images listed in the train/val files lack a label file.

    Each non-blank line of a list file is an image path. Its label path is
    derived by replacing every ``images`` path component with ``ground_truth``
    (with either slash style as separator) and the file extension with
    ``.txt``. Both lists are read before anything is printed.

    Parameters:
    - train_file (str): Text file with one training image path per line.
    - val_file (str): Text file with one validation image path per line.
    """
    import re

    # An 'images' component delimited by forward or back slashes; list files
    # written with os.path.join on Windows use the latter.
    images_component = re.compile(r'([\\/])images([\\/])')

    def check_file_list(file_list):
        """Return the derived label paths that do not exist on disk."""
        with open(file_list, 'r') as f:
            lines = f.readlines()
        missing_labels = []
        for line in lines:
            image_path = line.strip()
            if not image_path:
                # A blank (e.g. trailing) line is not an image entry.
                continue
            # Swap only the real extension, not a '.tif' elsewhere in the path.
            stem = os.path.splitext(image_path)[0]
            label_path = images_component.sub(r'\g<1>ground_truth\g<2>', stem) + '.txt'
            if not os.path.exists(label_path):
                missing_labels.append(label_path)
        return missing_labels

    train_missing = check_file_list(train_file)
    val_missing = check_file_list(val_file)

    if train_missing:
        print("Missing labels in training set:")
        for missing in train_missing:
            print(missing)
    else:
        print("All training labels are present.")

    if val_missing:
        print("Missing labels in validation set:")
        for missing in val_missing:
            print(missing)
    else:
        print("All validation labels are present.")

# Update these paths to the locations of your train.txt and val.txt
# NOTE(review): these are absolute, machine-specific paths. The only code in this
# notebook that writes train.txt / val.txt is create_dataset_files, which is
# defined and called in a later cell — on a fresh kernel this call presumably
# fails unless the files already exist from an earlier run; confirm the
# intended cell order.
train_file = 'D:/playground/mdfv500/train.txt'
val_file = 'D:/playground/mdfv500/val.txt'

check_dataset(train_file, val_file)

In [None]:
import os
import shutil
import random

def _write_image_list(list_path, image_names, images_dir, labels_dir):
    """Write one image path per line to ``list_path``, warning about missing labels."""
    with open(list_path, 'w') as f:
        for name in image_names:
            image_path = os.path.join(images_dir, name)
            label_path = os.path.join(labels_dir, os.path.splitext(name)[0] + '.txt')
            f.write(f"{image_path}\n")
            # The image is still listed; the missing label is only reported.
            if not os.path.exists(label_path):
                print(f"Warning: Label file '{label_path}' not found for '{image_path}'")


def create_dataset_files(dataset_path, output_path, split_ratio=0.8, seed=None):
    """
    Create train.txt and val.txt files for a dataset located at dataset_path,
    with images in 'images' subfolder and labels in 'labels' subfolder,
    and save these lists to output_path.

    Only files ending in '.tif' are listed. A warning is printed for every
    image whose '<name>.txt' label is absent from the 'labels' subfolder.

    Parameters:
    - dataset_path (str): Path to the dataset directory.
    - output_path (str): Existing directory to save train.txt and val.txt in.
    - split_ratio (float): Ratio of training images to total images (default is 0.8).
    - seed (int | None): Seed for the shuffle. With a seed the same dataset
      always produces the same split; None (default) uses the global
      `random` state, as before.
    """
    images_dir = os.path.join(dataset_path, 'images')
    labels_dir = os.path.join(dataset_path, 'labels')

    # Sorted first: os.listdir order is arbitrary, so a seeded shuffle is only
    # reproducible when it starts from a fixed order.
    image_files = sorted(f for f in os.listdir(images_dir) if f.endswith('.tif'))
    num_train = int(len(image_files) * split_ratio)

    # Randomize the order of images
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(image_files)

    # Split into training and validation sets and write both lists
    _write_image_list(os.path.join(output_path, 'train.txt'), image_files[:num_train], images_dir, labels_dir)
    _write_image_list(os.path.join(output_path, 'val.txt'), image_files[num_train:], images_dir, labels_dir)

# Example usage:
# Separator-neutral paths: the '.\\'-style literals only resolve on Windows.
dataset_path = os.path.join('.', 'data')
output_path = '.'  # Update with your desired output path
create_dataset_files(dataset_path, output_path)


In [None]:
import os
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image

# Function to load YOLO label
def load_yolo_label(label_path):
    """Parse a YOLO label file into a list of rows of floats.

    Each non-blank line is split on whitespace and every field converted to
    ``float``. Blank lines (e.g. a trailing empty line) are skipped so they do
    not produce empty rows that callers cannot unpack.

    Parameters:
    - label_path (str): Path of the label file.

    Returns:
    - list[list[float]]: One list per label line.

    Raises:
    - ValueError: if a field is not a valid float.
    """
    with open(label_path, 'r') as file:
        return [
            [float(value) for value in line.split()]
            for line in file
            if line.strip()
        ]

# Function to display image with labels
def display_image_with_label(image_path, label_path):
    """Show an image with its YOLO boxes drawn as red rectangles.

    Every label row is expected to hold five values
    ``class_id x_center y_center width height`` with the last four normalised
    by the image size; they are scaled back to pixels before drawing. The
    class id is written at the top-left corner of each box.

    Parameters:
    - image_path (str): Path of the image to display.
    - label_path (str): Path of the matching YOLO label file.
    """
    # Load labels first so a bad label file fails before a figure is created.
    labels = load_yolo_label(label_path)

    _, ax = plt.subplots(figsize=(10, 10))

    # The context manager closes the underlying file once the pixels have been
    # handed to matplotlib.
    with Image.open(image_path) as image:
        img_width, img_height = image.size
        ax.imshow(image)

    for label in labels:
        class_id, x_center, y_center, width, height = label
        x_center *= img_width
        y_center *= img_height
        width *= img_width
        height *= img_height
        # Rectangle is anchored at its top-left corner, not its center.
        x_min = x_center - width / 2
        y_min = y_center - height / 2

        rect = patches.Rectangle((x_min, y_min), width, height, linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
        ax.text(x_min, y_min, str(int(class_id)), color='red', fontsize=12)

    ax.axis('off')
    plt.show()

# Main code
# Separator-neutral paths: the '.\\'-style literals only resolve on Windows.
image_dir = os.path.join('.', 'data', 'images')
label_dir = os.path.join('.', 'data', 'labels')

# Only .tif files are candidates: the folder may also hold non-image files
# (e.g. .txt labels), which cannot be opened as images. Sorted so that a seeded
# `random` picks the same file on every run.
image_files = sorted(name for name in os.listdir(image_dir) if name.endswith('.tif'))
if image_files:
    random_image_file = random.choice(image_files)
    image_path = os.path.join(image_dir, random_image_file)

    # Labels share the image's base name, with a .txt extension.
    label_file = os.path.splitext(random_image_file)[0] + '.txt'
    label_path = os.path.join(label_dir, label_file)

    if os.path.exists(label_path):
        display_image_with_label(image_path, label_path)
    else:
        print(f"Label file for {random_image_file} not found.")
else:
    print("No images found in the directory.")