In [13]:
%pwd

'/home/gourav/ML/PCB Fault Detection/research'

In [14]:
import os

# Step up from research/ to the project root so that the `src...` imports and
# the relative config paths used by later cells resolve.
# Guarded on the current directory name: an unconditional chdir("../") climbs
# one level further every time this cell is re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [15]:
%pwd

'/home/gourav/ML/PCB Fault Detection'

In [23]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataProcessingConfig:
    """Immutable settings for the data-processing stage.

    Instances are built by ``ConfigurationManager.get_data_process_config`` and
    consumed by ``DataProcessing``. The annotations are not enforced at
    runtime; they document what the consumer expects.
    """
    # Directory whose sub-folders (one per group) hold the ingested dataset.
    source_dir: Path
    # Destination of the files gathered from every group under source_dir.
    raw_data_dir: Path
    # Destination of the copied images and the converted label files.
    processed_data_dir: Path
    # Parent directory of the generated train_data/ and val_data/ folders.
    split_data_dir: Path

    # "width,height" text: DataProcessing parses it with .split(","), so this
    # is a string and not a list.
    image_size: str
    # Fraction of the files that goes into the training split.
    train_data_size: float
    # Fraction of the files reported as the validation split.
    val_data_size: float

In [24]:
# importing constants and utils
from src.YOLO_V8.constants import *
from src.YOLO_V8.utils.common import read_yaml, create_directories

In [25]:
class ConfigurationManager:
    """Reads the YAML config and params files and builds stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files.

        Args:
            config_filepath: Path of the YAML file holding the stage paths.
            params_filepath: Path of the YAML file holding the parameters.
        """
        # The config file is read before the params file.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_data_process_config(self) -> DataProcessingConfig:
        """Build the settings object for the data-processing stage.

        The three output directories are handed to ``create_directories``
        before the config object is assembled.

        Returns:
            A ``DataProcessingConfig`` with paths taken from the
            ``data_processing`` section of the config file and sizes taken
            from the params file.
        """
        stage = self.config.data_processing
        hyper_params = self.params

        output_dirs = [
            stage.raw_data_dir,
            stage.processed_data_dir,
            stage.split_data_dir,
        ]
        create_directories(output_dirs)

        return DataProcessingConfig(
            source_dir=stage.source_dir,
            raw_data_dir=stage.raw_data_dir,
            processed_data_dir=stage.processed_data_dir,
            split_data_dir=stage.split_data_dir,
            image_size=hyper_params.image_size,
            train_data_size=hyper_params.train_data_size,
            val_data_size=hyper_params.val_data_size,
        )

In [26]:
import os
import shutil
import random
import pybboxes as pbx
from src.YOLO_V8 import logger
from src.YOLO_V8.utils.common import get_size, join_path, copy_files

In [27]:
class DataProcessing:
    """Gather the raw dataset, convert its labels to YOLO format and split it.

    The public stages are meant to be run in this order:
    ``get_raw_data`` -> ``processed_data`` -> ``split_data``.
    """

    # Sub-folder name -> extension handed to copy_files when splitting.
    _SPLIT_EXTENSIONS = {"images": ".jpg", "labels": ".txt"}

    def __init__(self, config: DataProcessingConfig):
        """Keep the config and parse the image size once.

        Args:
            config: Stage settings. ``image_size`` is normally a
                ``"width,height"`` string; a sequence whose first two items
                are the width and height is accepted as well.
        """
        self.config = config
        self.image_width, self.image_height = self._parse_image_size(config.image_size)
        self.train_data_size = self.config.train_data_size
        self.val_data_size = self.config.val_data_size

    @staticmethod
    def _parse_image_size(image_size):
        """Return ``(width, height)`` as ints from "W,H" text or a sequence."""
        if isinstance(image_size, str):
            parts = image_size.split(",")
        else:
            parts = list(image_size)
        # Only the first two entries are used, as before.
        return int(parts[0]), int(parts[1])

    def get_raw_data(self):
        """Copy every ``<source_dir>/<group>/<folder>`` into ``<raw_data_dir>/<folder>``.

        Folders with the same name from different groups end up in the same
        target folder. Entries that are not directories are skipped.
        """
        source_dir = self.config.source_dir
        target_dir = self.config.raw_data_dir

        for groups in os.listdir(source_dir):
            groups_path = os.path.join(source_dir, groups)
            if not os.path.isdir(groups_path):
                # A stray file next to the group folders would make
                # os.listdir below raise NotADirectoryError.
                continue

            for folder in os.listdir(groups_path):
                files_source_dir = join_path(groups_path, folder)
                if not os.path.isdir(files_source_dir):
                    continue

                files_target_dir = join_path(target_dir, folder)
                create_directories([files_target_dir])

                files = os.listdir(files_source_dir)

                logger.info(f"Start Getting Raw {folder} from {groups_path}.")
                copy_files(files, files_source_dir, files_target_dir, file_extension = None)

    def get_proper_coordinate_line(self, line):
        """Convert one ``x1 y1 x2 y2 class`` line into a YOLO label line.

        Args:
            line: Whitespace separated integers followed by the class id.

        Returns:
            ``"<class> <4 converted values>\\n"`` where the box was converted
            from "voc" to "yolo" for the configured image size.
        """
        # split() without arguments tolerates repeated blanks, tabs, trailing
        # spaces and "\r\n" endings, all of which broke split(" ").
        fields = line.split()
        box_class = fields[-1]
        voc_box = list(map(int, fields[:-1]))

        converted_coordinates = pbx.convert_bbox(
            voc_box,
            from_type="voc",
            to_type="yolo",
            image_size=(self.image_width, self.image_height),
        )

        # YOLO label files put the class first, then the box values.
        yolo_fields = [box_class] + [str(value) for value in converted_coordinates]
        return " ".join(yolo_fields) + "\n"

    def get_processed_labels(self, labels_source_dir, labels_target_dir):
        """Rewrite every label file of ``labels_source_dir`` in YOLO format.

        Every file is regenerated on each call. The previous guard tested the
        target *file* path with ``os.path.isdir``, which is never true for a
        file, so it never skipped anything; a size comparison between the VOC
        source and the YOLO target could not detect staleness either.

        Args:
            labels_source_dir: Directory holding the original label files.
            labels_target_dir: Existing directory receiving the converted files.
        """
        labels_file_count = 0
        for labels_file in os.listdir(labels_source_dir):
            labels_source_file_path = join_path(labels_source_dir, labels_file)
            labels_target_file_path = join_path(labels_target_dir, labels_file)

            with open(labels_source_file_path, "r") as label_source_file:
                # Blank lines carry no box and would crash the conversion.
                save_lines = [
                    self.get_proper_coordinate_line(raw_line)
                    for raw_line in label_source_file
                    if raw_line.strip()
                ]

            # The target is opened only after the conversion succeeded, so a
            # bad source line cannot leave a truncated label file behind.
            with open(labels_target_file_path, "w") as label_target_file:
                label_target_file.writelines(save_lines)

            labels_file_count = labels_file_count + 1

        logger.info(f"{labels_file_count} labels files created from {labels_source_dir} to {labels_target_dir}.")

    def processed_data(self):
        """Fill ``processed_data_dir`` from ``raw_data_dir``.

        ``images`` is copied unchanged, ``labels`` is converted to YOLO
        format; any other folder is ignored.
        """
        source_dir = self.config.raw_data_dir
        target_dir = self.config.processed_data_dir

        for folder in os.listdir(source_dir):

            if folder == "images":
                image_source_dir = join_path(source_dir, folder)
                image_target_dir = join_path(target_dir, folder)

                create_directories([image_target_dir])

                image_files = os.listdir(image_source_dir)

                logger.info(f"Start Getting Processed {folder} from {image_source_dir}.")

                copy_files(image_files, image_source_dir, image_target_dir, file_extension= None)

            elif folder == "labels":
                labels_source_dir = join_path(source_dir, folder)
                labels_target_dir = join_path(target_dir, folder)

                create_directories([labels_target_dir])

                logger.info(f"Start Getting Processed {folder} from {labels_source_dir}")

                self.get_processed_labels(labels_source_dir, labels_target_dir)

    def get_file_names(self, directory):
        """Return the entries of ``directory`` without their final extension.

        ``os.path.splitext`` removes only the last suffix, so a name such as
        ``"board.v2.jpg"`` yields ``"board.v2"`` rather than ``"board"``.
        """
        return [os.path.splitext(entry)[0] for entry in os.listdir(directory)]

    def split_data(self, seed=None):
        """Shuffle the processed files and copy them into train/val folders.

        ``<split_data_dir>/train_data`` and ``<split_data_dir>/val_data`` are
        removed and rebuilt on every call.

        Args:
            seed: Optional seed for a reproducible shuffle. ``None`` (the
                default) uses the module-level ``random`` state as before.
        """
        source_dir = self.config.processed_data_dir
        target_dir = self.config.split_data_dir

        files_names = self.get_file_names(join_path(source_dir, "images"))

        if seed is None:
            random.shuffle(files_names)
        else:
            random.Random(seed).shuffle(files_names)

        train_dir = join_path(target_dir, "train_data")
        if os.path.isdir(train_dir):
            shutil.rmtree(train_dir, ignore_errors=True)

        val_dir = join_path(target_dir, "val_data")
        if os.path.isdir(val_dir):
            shutil.rmtree(val_dir, ignore_errors=True)

        create_directories([train_dir, val_dir])

        train_size = int((len(files_names) * self.train_data_size))
        val_size = int(len(files_names) * self.val_data_size)
        # NOTE(review): the validation set below is everything after the
        # training slice, so it can be larger than the val_size logged here
        # (rounding, or train + val fractions below 1) - confirm the intent.

        logger.info(f"Traning files = {train_size} and Validation files = {val_size}")

        train_names = files_names[:train_size]
        val_names = files_names[train_size:]

        for folders in os.listdir(source_dir):
            file_extension = self._SPLIT_EXTENSIONS.get(folders)
            if file_extension is None:
                # An unexpected folder used to reuse the previous extension,
                # or raise UnboundLocalError when it was listed first.
                continue

            files_source_dir = join_path(source_dir, folders)
            files_train_dir = join_path(train_dir, folders)
            files_val_dir = join_path(val_dir, folders)

            create_directories([files_train_dir, files_val_dir])

            logger.info(f"Splitting {folders} files into {files_train_dir} and {files_val_dir}.")

            logger.info("Creating training files ")
            copy_files(train_names, files_source_dir, files_train_dir, file_extension)

            logger.info("Creating validation files ")
            copy_files(val_names, files_source_dir, files_val_dir, file_extension)



In [28]:

# Run the three data-processing stages in order. The top-level names are kept
# as they are because later cells may refer to them.
try:
    config = ConfigurationManager()
    data_processing_config = config.get_data_process_config()
    data_processing = DataProcessing(data_processing_config)

    logger.info("Gettinng Raw Dataset ---->")
    data_processing.get_raw_data()

    logger.info("Getting Processed Dataset ---->")
    data_processing.processed_data()

    logger.info("Getting Split Dataset ---->")
    data_processing.split_data()

except Exception:
    # Record the failure and its traceback in the log, then re-raise the
    # original exception unchanged with a bare `raise`.
    logger.exception("Data processing pipeline failed")
    raise

[2023-07-09 13:14:38,800: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-07-09 13:14:38,803: INFO: common: yaml file: params.yaml loaded successfully]
[2023-07-09 13:14:38,805: INFO: 2804130089: Gettinng Raw Dataset ---->]
[2023-07-09 13:14:38,808: INFO: common: created directory at: src/YOLO_V8/data/raw_data]
[2023-07-09 13:14:38,811: INFO: common: created directory at: src/YOLO_V8/data/raw_data/labels]
[2023-07-09 13:14:38,813: INFO: 195688972: Start Getting Raw labels from artifacts/data_ingestion/PCBData/group00041.]
[2023-07-09 13:14:38,896: INFO: common: 221 Files copied from artifacts/data_ingestion/PCBData/group00041/labels to src/YOLO_V8/data/raw_data/labels.]
[2023-07-09 13:14:38,899: INFO: common: created directory at: src/YOLO_V8/data/raw_data/images]
[2023-07-09 13:14:38,902: INFO: 195688972: Start Getting Raw images from artifacts/data_ingestion/PCBData/group00041.]
[2023-07-09 13:14:39,037: INFO: common: 221 Files copied from artifacts/data_ingest