In [None]:

from google.colab import drive
import os
# Mount Google Drive into the Colab VM at /content/drive so that files under
# "My Drive" can be read and written like local paths.
drive.mount('/content/drive')


# Drive folder that receives the trained weights and checkpoints.
NEW_MODEL_DIR = '/content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW'

# exist_ok=True keeps this cell safe to re-run once the folder already exists.
os.makedirs(NEW_MODEL_DIR, exist_ok=True)
print(f"‚úÖ Models will be saved in: {NEW_MODEL_DIR}")

Mounted at /content/drive
‚úÖ Models will be saved in: /content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW


In [None]:
# Cell 2: Unzip BOTH Datasets
import os

# Extract both dataset archives from Google Drive onto the Colab VM's local
# disk. Each extraction is skipped when its marker sub-directory already exists.
print("--- Step 1: Unzipping Datasets ---")


# Archive on Drive and local extraction target for the "140k" dataset
# (the REAL images are taken from it in a later cell).
DRIVE_ZIP_PATH_REAL = '/content/drive/My Drive/DeepFakeDataset/140k-real-and-fake-faces.zip'
LOCAL_DATA_PATH_REAL = '/content/dataset_140k'


# Archive on Drive and local extraction target for the openjourney (FAKE) images.
DRIVE_ZIP_PATH_FAKE = '/content/drive/My Drive/DeepFakeDataset/openjourney.zip'
LOCAL_DATA_PATH_FAKE = '/content/dataset_openjourney'


# 'real_vs_fake' is used as the "already extracted" marker for the 140k archive.
if not os.path.exists(os.path.join(LOCAL_DATA_PATH_REAL, 'real_vs_fake')):
    print("Unzipping 140k Real dataset...")
    # Remove whatever is at the target path first, then recreate it empty
    # before extracting into it.
    !rm -rf "{LOCAL_DATA_PATH_REAL}"
    os.makedirs(LOCAL_DATA_PATH_REAL, exist_ok=True)
    # `!` runs a shell command; {NAME} is interpolated from the Python namespace.
    !unzip -q "{DRIVE_ZIP_PATH_REAL}" -d "{LOCAL_DATA_PATH_REAL}"
else:
    print("140k Real dataset already unzipped.")

# Same procedure for the FAKE archive; 'openjourney' is its marker directory.
if not os.path.exists(os.path.join(LOCAL_DATA_PATH_FAKE, 'openjourney')):
    print("Unzipping openjourney FAKE dataset...")
    !rm -rf "{LOCAL_DATA_PATH_FAKE}"
    os.makedirs(LOCAL_DATA_PATH_FAKE, exist_ok=True)
    !unzip -q "{DRIVE_ZIP_PATH_FAKE}" -d "{LOCAL_DATA_PATH_FAKE}"
else:
    print("openjourney FAKE dataset already unzipped.")

# NOTE(review): the exit status of `unzip` is never inspected, so this message
# is printed even if an extraction failed.
print("‚úÖ All data ready for training.\n")

--- Step 1: Unzipping Datasets ---
Unzipping 140k Real dataset...
Unzipping openjourney FAKE dataset...
‚úÖ All data ready for training.



In [None]:

# Sanity check: recursively list the extracted FAKE dataset directory and show
# only the first 50 lines of the listing.
print("--- Verifying top 50 lines of FAKE dataset ---")
!ls -lR '/content/dataset_openjourney' | head -n 50

--- Verifying top 50 lines of FAKE dataset ---
/content/dataset_openjourney:
total 568
drwxrwxrwx 2 root root 577536 Nov  1 16:07 openjourney

/content/dataset_openjourney/openjourney:
total 5740076
-rw-rw-rw- 1 root root 399534 Nov  1 16:04 image_10_0_0.png
-rw-rw-rw- 1 root root 367156 Nov  1 16:04 image_10_0_100.png
-rw-rw-rw- 1 root root 437729 Nov  1 16:04 image_10_0_101.png
-rw-rw-rw- 1 root root 401984 Nov  1 16:04 image_10_0_102.png
-rw-rw-rw- 1 root root 461694 Nov  1 16:04 image_10_0_103.png
-rw-rw-rw- 1 root root 415023 Nov  1 16:04 image_10_0_104.png
-rw-rw-rw- 1 root root 391276 Nov  1 16:04 image_10_0_105.png
-rw-rw-rw- 1 root root 442336 Nov  1 16:04 image_10_0_106.png
-rw-rw-rw- 1 root root 346818 Nov  1 16:04 image_10_0_107.png
-rw-rw-rw- 1 root root 305512 Nov  1 16:04 image_10_0_108.png
-rw-rw-rw- 1 root root 544811 Nov  1 16:04 image_10_0_109.png
-rw-rw-rw- 1 root root 435849 Nov  1 16:04 image_10_0_10.png
-rw-rw-rw- 1 root root 436681 Nov  1 16:04 image_10_0_110.pn

In [None]:
# Cell 4: Define the ImageDataset Class
import torch
from torch.utils.data import Dataset
import cv2

class ImageDataset(Dataset):
    """Map-style dataset over ``(image_path, label)`` pairs loaded with OpenCV.

    Args:
        data_list: Indexable sequence of ``(image_path, label)`` pairs.
        transform: Optional callable applied to the RGB image array returned
            by OpenCV. When it is falsy the raw array is returned unchanged.
        placeholder_shape: Shape of the all-zero tensor returned in place of an
            image that cannot be read. It must match the shape produced by
            ``transform`` so that batches can still be collated; the default
            ``(3, 224, 224)`` is the shape that was previously hard-coded.
    """

    def __init__(self, data_list, transform=None, placeholder_shape=(3, 224, 224)):
        self.data_list = data_list
        self.transform = transform
        self.placeholder_shape = tuple(placeholder_shape)

    def __len__(self):
        """Return the number of ``(image_path, label)`` pairs."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``.

        The label is always returned as a 0-dim ``torch.long`` tensor.
        """
        img_path, label = self.data_list[idx]
        target = torch.tensor(label, dtype=torch.long)

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(img_path)
        if img is None:
            # The sample is NOT dropped: a zero image is returned together with
            # the original label so that batch sizes stay constant.
            print(f"Warning: Could not read image {img_path}. Using an all-zero placeholder.")
            return torch.zeros(self.placeholder_shape), target

        # OpenCV decodes to BGR channel order; convert to RGB before transforming.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform:
            img = self.transform(img)

        return img, target

print("ImageDataset class defined.")

ImageDataset class defined.


In [None]:
# Cell 5: Define the SimpleCNN Model Architecture (with Regularization)
import torch.nn as nn

class SimpleCNN(nn.Module):
    """Two-block convolutional classifier with batch norm and dropout.

    Each convolutional block is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2, 2), so every block halves the spatial size (rounding down).
    The flattened features feed a 128-unit hidden layer with dropout and a
    final linear layer producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
        input_size: Spatial size of the input images, either a single int for
            square inputs or a ``(height, width)`` pair. The default of 224
            yields the ``32 * 56 * 56`` flattened size that was previously
            hard-coded, so existing checkpoints remain loadable.

    Raises:
        ValueError: If ``input_size`` is too small to survive both poolings.
    """

    def __init__(self, num_classes=2, input_size=224):
        super().__init__()

        if isinstance(input_size, int):
            height, width = input_size, input_size
        else:
            height, width = input_size

        # The 3x3 convolutions use padding=1 (size-preserving); only the two
        # MaxPool2d(2, 2) layers shrink the feature map, each by floor(x / 2).
        feat_h, feat_w = height // 2 // 2, width // 2 // 2
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f"input_size {input_size!r} is too small for two 2x2 max-pool layers"
            )

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.flatten = nn.Flatten()

        self.fc_layers = nn.Sequential(
            nn.Linear(32 * feat_h * feat_w, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Map a ``(batch, 3, H, W)`` image batch to ``(batch, num_classes)`` logits."""
        x = self.conv_layers(x)
        x = self.flatten(x)
        x = self.fc_layers(x)
        return x

print("SimpleCNN model class defined.")

SimpleCNN model class defined.


In [None]:
# Cell 6: Create the Data Loaders
import glob
from torchvision import transforms
from torch.utils.data import DataLoader
import os
import random
from sklearn.model_selection import train_test_split

# Build a balanced REAL/FAKE file list, split it into train/validation sets and
# wrap both in DataLoaders. Defines `train_loader` and `valid_loader`.
print("\n--- Step 2: Preparing Data Loaders ---")

# One seed drives both the class balancing and the train/valid split. Without
# it, re-running this cell (e.g. before resuming from a checkpoint) would pick
# different images and move previously-trained images into the validation set.
SEED = 42

# REAL images: the 'real' folders of both the train and valid parts of the 140k set.
REAL_DATA_PATH = '/content/dataset_140k/real_vs_fake/real-vs-fake'
# glob returns files in arbitrary, filesystem-dependent order; sorting makes the
# seeded sampling below select the same files on every run.
real_files = sorted(
    glob.glob(os.path.join(REAL_DATA_PATH, 'train/real', '*.jpg')) +
    glob.glob(os.path.join(REAL_DATA_PATH, 'valid/real', '*.jpg'))
)
print(f"Found {len(real_files)} total REAL images.")

# FAKE images: every PNG of the openjourney set.
FAKE_DATA_PATH = '/content/dataset_openjourney/openjourney'
fake_files = sorted(glob.glob(os.path.join(FAKE_DATA_PATH, '*.png')))
print(f"Found {len(fake_files)} total FAKE (openjourney) images.")

# Fail fast: continuing would leave `train_loader`/`valid_loader` undefined, or
# worse, silently reuse stale loaders from an earlier run of this cell.
if len(fake_files) == 0:
    print("‚ùå ERROR: No fake files found. Cannot continue.")
    print(f"Please check the path: {FAKE_DATA_PATH}")
    raise FileNotFoundError(f"No fake images found in {FAKE_DATA_PATH}")
if len(real_files) == 0:
    raise FileNotFoundError(f"No real images found under {REAL_DATA_PATH}")

# Balance the classes by down-sampling the larger one to the size of the
# smaller one. A dedicated Random instance leaves the global RNG state alone.
sampler = random.Random(SEED)
n_per_class = min(len(real_files), len(fake_files))
real_files_balanced = sampler.sample(real_files, n_per_class)
if len(fake_files) > n_per_class:
    fake_files_balanced = sampler.sample(fake_files, n_per_class)
else:
    fake_files_balanced = fake_files
print(f"Balancing dataset: Using {len(real_files_balanced)} REAL images.")

# Label convention used by this notebook: 0 = FAKE, 1 = REAL.
all_files_list = [(path, 0) for path in fake_files_balanced] + \
                 [(path, 1) for path in real_files_balanced]

# Stratify on the label so both splits keep the 50/50 class ratio.
labels = [label for path, label in all_files_list]
train_list, valid_list = train_test_split(
    all_files_list,
    test_size=0.20,
    random_state=SEED,
    stratify=labels
)

print(f"Found {len(all_files_list)} total balanced images.")
print(f"Split into {len(train_list)} training images.")
print(f"Split into {len(valid_list)} validation images.")

# Square resize target and the per-channel mean/std passed to Normalize.
im_size = 224
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]

# Training pipeline: resize plus light augmentation (horizontal flip, rotation).
train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((im_size, im_size)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# Validation pipeline: no random augmentation.
valid_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((im_size, im_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

train_data = ImageDataset(train_list, transform=train_transforms)
valid_data = ImageDataset(valid_list, transform=valid_transforms)

# Only the training loader shuffles.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False, num_workers=2)

print("‚úÖ Data loaders are ready.\n")


--- Step 2: Preparing Data Loaders ---
Found 60000 total REAL images.
Found 14204 total FAKE (openjourney) images.
Balancing dataset: Using 14204 REAL images.
Found 28408 total balanced images.
Split into 22726 training images.
Split into 5682 validation images.
‚úÖ Data loaders are ready.



In [None]:
# Cell 7: Train the Model (CHANGED: New Path, Weight Decay, Early Stopping)
import torch.optim as optim
from tqdm import tqdm
import time

# Train SimpleCNN with Adam + weight decay, checkpointing after every epoch and
# stopping early once validation accuracy has not improved for `patience` epochs.
print("--- Step 3: Setting Up for Training ---")
device = 'cuda' if torch.cuda.is_available() else 'cpu'


# Drive folder that receives the best weights and the resumable checkpoint.
MODEL_DIR = '/content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW'
# Create it here as well so this cell does not depend on an earlier cell.
os.makedirs(MODEL_DIR, exist_ok=True)

MODEL_NAME = 'openjourney-simple-cnn-v3-full'

# Weights of the best epoch so far vs. full training state of the latest epoch.
BEST_MODEL_PATH = os.path.join(MODEL_DIR, f'{MODEL_NAME}_best_model.pth')
CHECKPOINT_PATH = os.path.join(MODEL_DIR, f'{MODEL_NAME}_checkpoint.pth')

print(f"Models and checkpoints will be saved in: {MODEL_DIR}")


model = SimpleCNN().to(device)
lr = 1e-4
num_epochs = 10

# weight_decay adds L2 regularisation on top of the model's dropout.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

# Early-stopping state.
patience = 4
epochs_no_improve = 0
best_valid_acc = 0.0
start_epoch = 0

if os.path.exists(CHECKPOINT_PATH):
    # map_location lets a checkpoint written on a GPU runtime be resumed on a
    # CPU runtime (and vice versa) instead of failing while deserialising.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # 'epoch' is the 0-based index of the last completed epoch.
    start_epoch = checkpoint['epoch'] + 1
    # float(): older checkpoints stored this value as a 0-dim tensor.
    best_valid_acc = float(checkpoint.get('best_valid_acc', 0.0))
    epochs_no_improve = checkpoint.get('epochs_no_improve', 0)
    # Reported 1-based, consistent with the per-epoch messages below.
    print(f"‚úÖ Checkpoint found. Resuming training from epoch {start_epoch + 1}")
else:
    print("‚ÑπÔ∏è No checkpoint found. Starting training from scratch.")

print(f"Training for {num_epochs} total epochs on device: {device}\n")


# Wall-clock time spent in this run only (not restored from the checkpoint).
total_train_time = 0.0

for epoch in range(start_epoch, num_epochs):
    epoch_start_time = time.time()

    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    running_corrects = 0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        _, preds = torch.max(outputs, 1)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch average stays exact when the last batch is smaller.
        running_loss += loss.item() * inputs.size(0)
        running_corrects += (preds == labels).sum().item()

    train_loss = running_loss / len(train_loader.dataset)
    train_acc = running_corrects / len(train_loader.dataset)

    # ---- Validation pass (no gradients; eval mode for dropout/batch norm) ----
    model.eval()
    running_loss = 0.0
    running_corrects = 0

    with torch.no_grad():
        for inputs, labels in tqdm(valid_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Valid]"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)

            running_loss += loss.item() * inputs.size(0)
            running_corrects += (preds == labels).sum().item()

    valid_loss = running_loss / len(valid_loader.dataset)
    # Plain Python float, so the checkpoint never embeds a device tensor.
    valid_acc = running_corrects / len(valid_loader.dataset)

    epoch_time = time.time() - epoch_start_time
    total_train_time += epoch_time

    print(f"Epoch {epoch+1}/{num_epochs} ({epoch_time:.2f}s) | Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Valid Loss: {valid_loss:.4f} Acc: {valid_acc:.4f}")

    # Only a strict improvement in validation accuracy resets the patience counter.
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"üéâ New best model saved with accuracy: {best_valid_acc:.4f}")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        print(f"Validation accuracy did not improve. Patience: {epochs_no_improve}/{patience}")

    # Persist everything needed to resume after a runtime disconnect.
    checkpoint_data = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_valid_acc': best_valid_acc,
        'epochs_no_improve': epochs_no_improve
    }
    torch.save(checkpoint_data, CHECKPOINT_PATH)
    print(f"üíæ Checkpoint saved for epoch {epoch+1}.\n")

    # Early stopping is checked after the checkpoint has been written.
    if epochs_no_improve >= patience:
        print(f"--- üõë Early stopping triggered after {patience} epochs with no improvement. ---")
        break


print(f"--- TRAINING COMPLETE ---")
print(f"Total training time: {total_train_time:.2f} seconds")
print(f"Best model saved to: {BEST_MODEL_PATH}")

--- Step 3: Setting Up for Training ---
Models and checkpoints will be saved in: /content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW
‚ÑπÔ∏è No checkpoint found. Starting training from scratch.
Training for 10 total epochs on device: cuda



Epoch 1/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:31<00:00,  3.36it/s]
Epoch 1/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:49<00:00,  3.62it/s]


Epoch 1/10 (260.95s) | Train Loss: 0.1384 Acc: 0.9544 | Valid Loss: 0.0516 Acc: 0.9808
üéâ New best model saved with accuracy: 0.9808
üíæ Checkpoint saved for epoch 1.



Epoch 2/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:24<00:00,  3.47it/s]
Epoch 2/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:49<00:00,  3.61it/s]


Epoch 2/10 (254.14s) | Train Loss: 0.0640 Acc: 0.9782 | Valid Loss: 0.0535 Acc: 0.9836
üéâ New best model saved with accuracy: 0.9836
üíæ Checkpoint saved for epoch 2.



Epoch 3/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:28<00:00,  3.41it/s]
Epoch 3/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:47<00:00,  3.73it/s]


Epoch 3/10 (256.14s) | Train Loss: 0.0520 Acc: 0.9828 | Valid Loss: 0.0394 Acc: 0.9866
üéâ New best model saved with accuracy: 0.9866
üíæ Checkpoint saved for epoch 3.



Epoch 4/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:27<00:00,  3.42it/s]
Epoch 4/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:49<00:00,  3.58it/s]


Epoch 4/10 (257.49s) | Train Loss: 0.0452 Acc: 0.9846 | Valid Loss: 0.0349 Acc: 0.9887
üéâ New best model saved with accuracy: 0.9887
üíæ Checkpoint saved for epoch 4.



Epoch 5/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:24<00:00,  3.48it/s]
Epoch 5/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:49<00:00,  3.60it/s]


Epoch 5/10 (253.60s) | Train Loss: 0.0365 Acc: 0.9870 | Valid Loss: 0.0280 Acc: 0.9887
Validation accuracy did not improve. Patience: 1/4
üíæ Checkpoint saved for epoch 5.



Epoch 6/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:22<00:00,  3.52it/s]
Epoch 6/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:49<00:00,  3.62it/s]


Epoch 6/10 (251.41s) | Train Loss: 0.0364 Acc: 0.9880 | Valid Loss: 0.0248 Acc: 0.9908
üéâ New best model saved with accuracy: 0.9908
üíæ Checkpoint saved for epoch 6.



Epoch 7/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:26<00:00,  3.45it/s]
Epoch 7/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:47<00:00,  3.72it/s]


Epoch 7/10 (253.88s) | Train Loss: 0.0285 Acc: 0.9890 | Valid Loss: 0.0247 Acc: 0.9908
Validation accuracy did not improve. Patience: 1/4
üíæ Checkpoint saved for epoch 7.



Epoch 8/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:21<00:00,  3.54it/s]
Epoch 8/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:48<00:00,  3.68it/s]


Epoch 8/10 (249.50s) | Train Loss: 0.0283 Acc: 0.9905 | Valid Loss: 0.0210 Acc: 0.9933
üéâ New best model saved with accuracy: 0.9933
üíæ Checkpoint saved for epoch 8.



Epoch 9/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:19<00:00,  3.56it/s]
Epoch 9/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:47<00:00,  3.72it/s]


Epoch 9/10 (247.88s) | Train Loss: 0.0279 Acc: 0.9901 | Valid Loss: 0.0220 Acc: 0.9940
üéâ New best model saved with accuracy: 0.9940
üíæ Checkpoint saved for epoch 9.



Epoch 10/10 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 711/711 [03:19<00:00,  3.56it/s]
Epoch 10/10 [Valid]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 178/178 [00:46<00:00,  3.84it/s]


Epoch 10/10 (246.32s) | Train Loss: 0.0249 Acc: 0.9918 | Valid Loss: 0.0255 Acc: 0.9912
Validation accuracy did not improve. Patience: 1/4
üíæ Checkpoint saved for epoch 10.

--- TRAINING COMPLETE ---
Total training time: 2531.31 seconds
Best model saved to: /content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW/openjourney-simple-cnn-v3-full_best_model.pth
