In [1]:
%pwd

'c:\\Users\\ENGRACED\\Documents\\Project\\datascience\\farm_copilot\\training\\farm_copilot_model_trainer\\research'

In [2]:
import os
from pathlib import Path

In [3]:
# Move from the notebook's own folder up to the project root so that the
# relative config paths resolve. Guarded on the folder name so that re-running
# this cell does not keep climbing one directory higher each time.
if Path.cwd().name == 'research':
    os.chdir('..')

In [4]:
%pwd

'c:\\Users\\ENGRACED\\Documents\\Project\\datascience\\farm_copilot\\training\\farm_copilot_model_trainer'

In [5]:
from dataclasses import dataclass

@dataclass(frozen=True)
class PrepareDataConfig:
    """Immutable settings consumed by the data-preparation stage."""

    # Root location of the training data that the preparation step walks.
    training_data: Path

In [6]:
from farm_copilot.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from farm_copilot.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH,
        ) -> None:
        """Load both YAML files via ``read_yaml`` and keep the results.

        Args:
            config_filepath: Path of the main configuration file.
            params_filepath: Path of the parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_prepare_data_config(self) -> PrepareDataConfig:
        """Build the data-preparation config.

        Returns:
            A ``PrepareDataConfig`` whose ``training_data`` is the
            ``data_ingestion.local_data_file`` config entry wrapped in ``Path``.
        """
        data_location = self.config.data_ingestion.local_data_file
        return PrepareDataConfig(training_data=Path(data_location))




In [8]:
from PIL import Image

In [9]:
class PrepareData:
    """Clean and normalise the training images in place.

    ``clean`` walks ``config.training_data`` and, for every file found:
    deletes it if its extension is not an accepted image type, deletes it if
    Pillow cannot verify it, and otherwise overwrites it with a resized copy.
    The operation is destructive: rejected files are removed from disk.
    """

    # Accepted image extensions; compared case-insensitively in ``clean``.
    VALID_EXTENSIONS = frozenset({'.jpg', '.png', '.gif', '.bmp'})

    def __init__(self, config: "PrepareDataConfig") -> None:
        """Store the stage configuration (reads ``config.training_data``)."""
        self.config = config

    @staticmethod
    def resize_image(input_image_path, output_image_path, size=(224, 224)):
        """Resize one image and save the result.

        Args:
            input_image_path: Path of the image to read.
            output_image_path: Path to write the resized image to; may be the
                same as ``input_image_path``.
            size: Target ``(width, height)`` passed to ``Image.resize``.

        Raises:
            Whatever Pillow raises when the image cannot be opened, resized
            or saved (``clean`` handles ``IOError`` and ``SyntaxError``).
        """
        # Close the source file before saving so an in-place overwrite never
        # writes to a path that is still held open by the reader.
        with Image.open(input_image_path) as original_image:
            resized_image = original_image.resize(size)
        resized_image.save(output_image_path)

    @staticmethod
    def is_valid_jpeg(image_path):
        """Return True if Pillow can open and verify the file, else False.

        Despite the name, this is applied to every accepted image type, not
        only JPEG files.
        """
        try:
            # Context manager releases the handle explicitly instead of
            # relying on garbage collection, so the caller can delete the file.
            with Image.open(image_path) as img:
                img.verify()
            return True
        except (IOError, SyntaxError):
            return False

    def clean(self):
        """Remove unusable files under the training directory and resize the rest.

        Prints one line for every file that is removed, stating the reason.
        """
        for root, _dirs, files in os.walk(self.config.training_data):
            for file in files:
                image = os.path.join(root, file)
                _, ext = os.path.splitext(image)

                # Case-insensitive match so e.g. ``.Jpg`` is not deleted.
                if ext.lower() not in self.VALID_EXTENSIONS:
                    os.remove(image)
                    print(f"{file} with extension {ext} removed. Reason: Invalid extension")
                    continue

                if not self.is_valid_jpeg(image_path=image):
                    os.remove(image)
                    print(f"{image} removed. Reason: Corrupted")
                    continue

                try:
                    # Overwrite the file with its resized version.
                    self.resize_image(input_image_path=image, output_image_path=image)
                except (IOError, SyntaxError):
                    os.remove(image)
                    print(f"{image} removed. Reason: Failed to resize")

In [10]:
# Run the data-preparation stage: load the configuration, build the stage
# config, then clean the training images in place. Any exception propagates
# unchanged, so no try/except wrapper that merely re-raises is needed.
config = ConfigurationManager()
prepare_data_config = config.get_prepare_data_config()
prepare_data = PrepareData(config=prepare_data_config)
prepare_data.clean()

[2024-03-27 19:33:14,806: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-27 19:33:14,816: INFO: common: yaml file: params.yaml loaded successfully]
healthy189_.jpg removed. Reason: Corrupted
healthy87_.jpg removed. Reason: Corrupted
leaf blight379_.jpg removed. Reason: Corrupted
leaf blight58_.jpg removed. Reason: Corrupted
leaf blight82_.jpg removed. Reason: Corrupted
leaf spot271_.jpg removed. Reason: Corrupted
leaf spot424_.jpg removed. Reason: Corrupted
leaf spot649_.jpg removed. Reason: Corrupted
leaf spot798_.jpg removed. Reason: Corrupted
leaf spot957_.jpg removed. Reason: Corrupted
streak virus421_.jpg removed. Reason: Corrupted
streak virus485_.jpg removed. Reason: Corrupted
streak virus773_.jpg removed. Reason: Corrupted
leaf beetle325_.jpg removed. Reason: Corrupted
leaf beetle457_.jpg removed. Reason: Corrupted
leaf beetle572_.jpg removed. Reason: Corrupted
leaf beetle68_.jpg removed. Reason: Corrupted
healthy76_.jpg removed. Reason: Corrupted
d0b