In [None]:
import os
from PIL import Image
from torchvision import transforms



import os
import torch
from PIL import Image
from torchvision import transforms

# Dataset locations: raw input images and the preprocessed output tree
input_base = "../data/raw/images/Celeb_V2"
output_base = "../data/processed/Celeb_V2"
os.makedirs(output_base, exist_ok=True)

# Pre-create <output_base>/<split>/<label> for every split / class pair so the
# preprocessing step can write into them without further checks.
splits = ["Train", "Val", "Test"]
for split in splits:
    for label in ("real", "fake"):
        split_label_dir = os.path.join(output_base, split, label)
        os.makedirs(split_label_dir, exist_ok=True)



# Side length, in pixels, of the square image produced by the pipeline
img_size = 224

# Preprocessing pipeline: resize -> tensor -> normalize with ImageNet statistics
preprocess_transform = transforms.Compose([
    transforms.Resize(size=(img_size, img_size)),  # Resize to 224x224
    transforms.ToTensor(),                         # Convert to tensor
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # ImageNet mean
        std=[0.229, 0.224, 0.225],   # ImageNet std
    ),
])


# Sanity check: push a single sample through the preprocessing pipeline and
# inspect the shape and value range of the resulting tensor.
img_path = "../data/raw/images/Celeb_V2/Train/real/00000_face_1.jpg"  # adjust path
image = Image.open(img_path).convert("RGB")  # force three colour channels
image_tensor = preprocess_transform(image)

tensor_min = image_tensor.min().item()
tensor_max = image_tensor.max().item()
print("Image tensor shape:", image_tensor.shape)
print("Tensor min/max:", tensor_min, tensor_max)



# Preprocess every raw image and store the resulting tensor in the mirrored
# <output_base>/<split>/<label>/ folder. Files that cannot be read or
# transformed are reported and skipped so one bad image does not abort the run.
for split in splits:
    for label in ["real", "fake"]:
        in_folder = os.path.join(input_base, split, label)
        out_folder = os.path.join(output_base, split, label)

        for img_name in os.listdir(in_folder):
            # Bound before the try so the skip message always names this file.
            img_path = os.path.join(in_folder, img_name)
            try:
                # The context manager releases the file handle even when
                # decoding fails part-way through.
                with Image.open(img_path) as raw_img:
                    img = raw_img.convert("RGB")
                img_tensor = preprocess_transform(img)

                # Strip only the final extension: splitting on the first dot
                # would map "a.1.png" and "a.2.png" to the same output name
                # and silently overwrite one of them.
                stem = os.path.splitext(img_name)[0]

                # NOTE: the output is a tensor serialized with torch.save, not
                # a JPEG image. The ".jpg" suffix is kept only because
                # DeepfakeDataset selects files by that suffix and reads them
                # back with torch.load.
                save_path = os.path.join(out_folder, stem + ".jpg")
                torch.save(img_tensor, save_path)

            except Exception as e:
                # Deliberate best-effort: report and continue with the next file.
                print(f"Skipping {img_path}: {e}")



from torch.utils.data import Dataset, DataLoader



# Custom Dataset class
class DeepfakeDataset(Dataset):
    """Dataset of preprocessed image tensors stored under <data_dir>/<split>/{real,fake}/.

    Every file whose name ends in ``.jpg`` inside the two label folders is
    treated as one sample. The files are read with ``torch.load``, so they must
    be objects written by ``torch.save`` (they are not JPEG-encoded images).
    Samples from ``real`` are labelled 0 and samples from ``fake`` are labelled 1.
    """

    def __init__(self, data_dir, split, transform=None):
        """
        Args:
            data_dir (str): Base directory containing the processed images (e.g., '../data/processed/Celeb_V2').
            split (str): Data split ('Train', 'Val', or 'Test').
            transform (callable, optional): Optional transform to be applied to the loaded tensors.

        Raises:
            OSError: If a label folder is missing or unreadable (raised by ``os.listdir``).
        """
        self.data_dir = os.path.join(data_dir, split)
        self.transform = transform
        self.images = []
        self.labels = []

        # Define label mapping: 'real' -> 0, 'fake' -> 1
        self.label_map = {'real': 0, 'fake': 1}

        # Collect sample paths and labels. Folders are visited in label_map
        # order ('real' first, then 'fake'); each listing is sorted because
        # os.listdir returns entries in arbitrary order, which would make the
        # index -> sample mapping differ between runs and machines.
        for label, label_id in self.label_map.items():
            folder = os.path.join(self.data_dir, label)
            for img_name in sorted(os.listdir(folder)):
                if img_name.endswith('.jpg'):  # saved tensors carry a .jpg extension
                    self.images.append(os.path.join(folder, img_name))
                    self.labels.append(label_id)

    def __len__(self):
        """Return the number of samples found across both label folders."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(tensor, label)`` for sample ``idx``, applying ``transform`` if one was given."""
        img_path = self.images[idx]
        # NOTE(security): torch.load deserializes via pickle; only point this
        # dataset at files produced by a trusted preprocessing run.
        img_tensor = torch.load(img_path)
        label = self.labels[idx]

        # Apply additional transforms if provided
        if self.transform:
            img_tensor = self.transform(img_tensor)

        return img_tensor, label

# Loader configuration
batch_size = 32
num_workers = 4  # Adjust based on your system's capabilities
data_dir = '../data/processed/Celeb_V2'

# One dataset per split, all rooted at the same processed directory
train_dataset = DeepfakeDataset(data_dir=data_dir, split='Train')
val_dataset = DeepfakeDataset(data_dir=data_dir, split='Val')
test_dataset = DeepfakeDataset(data_dir=data_dir, split='Test')

# Settings shared by all three loaders; only the shuffle flag differs.
common_loader_args = {
    "batch_size": batch_size,
    "num_workers": num_workers,
    "pin_memory": True,  # requested for faster host-to-GPU transfer
}

# Shuffle the training data; keep validation and test order fixed.
train_loader = DataLoader(train_dataset, shuffle=True, **common_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **common_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **common_loader_args)

# Demonstration: fetch the first training batch and report its shapes.
for batch_idx, (images, labels) in enumerate(train_loader):
    print(f"Batch {batch_idx}:")
    print(f"Image batch shape: {images.shape}")  # Should be [batch_size, 3, 224, 224]
    print(f"Label batch shape: {labels.shape}")  # Should be [batch_size]
    break  # stop after the first batch

FileNotFoundError: [WinError 3] The system cannot find the path specified: '../data/raw/images/FaceForensics++_C23\\original'