In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

In [2]:
DATA_DIR = "../data/processed"
SPLIT_DIR = "../data/split"

# create split dirs
for split in ["train", "val", "test"]:
    for cls in ["real", "fake"]:
        os.makedirs(os.path.join(SPLIT_DIR, split, cls), exist_ok=True)


In [3]:
# Train-val-test split
def split_dataset(src_dir, dst_dir, train_ratio=0.7, val_ratio=0.15):
    for cls in ["real", "fake"]:
        files = os.listdir(os.path.join(src_dir, cls))
        
        # train/test/val split
        train_files, temp_files = train_test_split(files, test_size=(1-train_ratio), random_state=42)
        val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

        # copy files
        for split, split_files in zip(["train", "val", "test"], [train_files, val_files, test_files]):
            for f in split_files:
                src = os.path.join(src_dir, cls, f)
                dst = os.path.join(dst_dir, split, cls, f)
                shutil.copy(src, dst)

split_dataset(DATA_DIR, SPLIT_DIR)
print("Data splitted into train/val/test")


Data splitted into train/val/test


In [4]:
#Transforms
transform_train = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

transform_eval = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])


In [5]:
train_dataset = datasets.ImageFolder(os.path.join(SPLIT_DIR, "train"), transform=transform_train)
val_dataset   = datasets.ImageFolder(os.path.join(SPLIT_DIR, "val"), transform=transform_eval)
test_dataset  = datasets.ImageFolder(os.path.join(SPLIT_DIR, "test"), transform=transform_eval)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

print(f"Train size: {len(train_dataset)} | Val size: {len(val_dataset)} | Test size: {len(test_dataset)}")


FileNotFoundError: Found no valid file for the classes fake, real. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp

In [6]:
try:
    train_dataset = datasets.ImageFolder(os.path.join(SPLIT_DIR, "train"), transform=transform_train)
    val_dataset = datasets.ImageFolder(os.path.join(SPLIT_DIR, "val"), transform=transform_eval)
    test_dataset = datasets.ImageFolder(os.path.join(SPLIT_DIR, "test"), transform=transform_eval)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

    print(f"Train size: {len(train_dataset)} | Val size: {len(val_dataset)} | Test size: {len(test_dataset)}")
except Exception as e:
    print(f"Error loading datasets: {e}")


Error loading datasets: Found no valid file for the classes fake, real. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp
