In [1]:
# Cell 1: Mount Google Drive & Define Path
from google.colab import drive
import os
# Mount Google Drive so the dataset archives and the model directory are reachable.
drive.mount('/content/drive')

# Directory on Drive where trained weights and checkpoints are written.
NEW_MODEL_DIR = '/content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW'
# Create the folder if it is missing: otherwise the first torch.save would only
# fail after a complete training epoch has already been spent.
os.makedirs(NEW_MODEL_DIR, exist_ok=True)
print(f"✅ Models will be saved in: {NEW_MODEL_DIR}")

Mounted at /content/drive
✅ Models will be saved in: /content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW


In [None]:
# Cell 2: Unzip BOTH Datasets
import os

import shutil
import subprocess

print("--- Step 1: Unzipping Datasets ---")

# --- Path to your REAL images dataset ---
DRIVE_ZIP_PATH_REAL = '/content/drive/My Drive/DeepFakeDataset/140k-real-and-fake-faces.zip'
LOCAL_DATA_PATH_REAL = '/content/dataset_140k'

# --- Path to your FAKE (min-dalle) images dataset ---
DRIVE_ZIP_PATH_FAKE = '/content/drive/My Drive/DeepFakeDataset/min-dalle.zip'
LOCAL_DATA_PATH_FAKE = '/content/dataset_min-dalle'


def extract_dataset(zip_path, dest_dir, marker_subdir, label):
    """Extract ``zip_path`` into ``dest_dir`` unless it was already extracted.

    Args:
        zip_path: Path of the archive on the mounted Drive.
        dest_dir: Local directory that receives the archive contents.
        marker_subdir: Sub-folder of ``dest_dir`` whose presence means the
            archive has already been extracted.
        label: Human-readable dataset name used in the progress messages.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        RuntimeError: If ``unzip`` exits with an error status.
    """
    if os.path.exists(os.path.join(dest_dir, marker_subdir)):
        print(f"{label} dataset already unzipped.")
        return

    if not os.path.isfile(zip_path):
        raise FileNotFoundError(f"Dataset archive not found: {zip_path}")

    print(f"Unzipping {label} dataset...")
    # Start from an empty directory so stale files never mix with fresh ones.
    shutil.rmtree(dest_dir, ignore_errors=True)
    os.makedirs(dest_dir, exist_ok=True)

    try:
        # Argument list (no shell) so spaces in the Drive path need no quoting.
        result = subprocess.run(["unzip", "-q", zip_path, "-d", dest_dir])
    except BaseException:
        # Interrupted mid-extraction: remove the partial tree, otherwise the
        # marker folder would make the next run believe extraction finished.
        shutil.rmtree(dest_dir, ignore_errors=True)
        raise

    # unzip exit status 1 means "completed with warnings"; anything higher is a failure.
    if result.returncode > 1:
        shutil.rmtree(dest_dir, ignore_errors=True)
        raise RuntimeError(
            f"unzip failed for {zip_path} (exit status {result.returncode})"
        )
    if result.returncode == 1:
        print(f"Warning: unzip reported warnings while extracting {zip_path}.")


# Unzip REAL dataset (if not already done)
extract_dataset(DRIVE_ZIP_PATH_REAL, LOCAL_DATA_PATH_REAL, 'real_vs_fake', "140k Real")

# Unzip FAKE dataset (if not already done)
extract_dataset(DRIVE_ZIP_PATH_FAKE, LOCAL_DATA_PATH_FAKE, 'min-dalle', "min-dalle FAKE")

print("✅ All data ready for training.\n")

--- Step 1: Unzipping Datasets ---
Unzipping 140k Real dataset...
Unzipping min-dalle FAKE dataset...
✅ All data ready for training.



In [None]:
# Cell 3: VERIFY Unzipped Structure
print("--- Verifying top 50 lines of FAKE dataset ---")
!ls -lR '/content/dataset_min-dalle' | head -n 50

--- Verifying top 50 lines of FAKE dataset ---
/content/dataset_min-dalle:
total 468
drwxrwxrwx 2 root root 475136 Nov  1 16:04 min-dalle

/content/dataset_min-dalle/min-dalle:
total 1211996
-rw-rw-rw- 1 root root  99587 Nov  1 16:02 image_1_0_0.png
-rw-rw-rw- 1 root root 108529 Nov  1 16:02 image_1_0_100.png
-rw-rw-rw- 1 root root 121898 Nov  1 16:02 image_1_0_101.png
-rw-rw-rw- 1 root root  96017 Nov  1 16:02 image_1_0_102.png
-rw-rw-rw- 1 root root 103403 Nov  1 16:02 image_1_0_103.png
-rw-rw-rw- 1 root root 156462 Nov  1 16:02 image_1_0_104.png
-rw-rw-rw- 1 root root 118729 Nov  1 16:02 image_1_0_105.png
-rw-rw-rw- 1 root root 130406 Nov  1 16:02 image_1_0_106.png
-rw-rw-rw- 1 root root 110302 Nov  1 16:02 image_1_0_107.png
-rw-rw-rw- 1 root root 108706 Nov  1 16:02 image_1_0_108.png
-rw-rw-rw- 1 root root  90299 Nov  1 16:02 image_1_0_109.png
-rw-rw-rw- 1 root root 117343 Nov  1 16:02 image_1_0_10.png
-rw-rw-rw- 1 root root 146733 Nov  1 16:02 image_1_0_110.png
-rw-rw-rw- 1 root r

In [None]:
# Cell 4: Define the ImageDataset Class
import torch
from torch.utils.data import Dataset
import cv2

class ImageDataset(Dataset):
    """Map-style dataset over ``(image_path, label)`` pairs read with OpenCV.

    Args:
        data_list: Sequence of ``(path, label)`` tuples.
        transform: Optional callable applied to the RGB image array returned
            by OpenCV. When omitted, the RGB array is returned as-is.
        fallback_size: Side length of the all-zero placeholder image returned
            when a file cannot be read. It should match the size produced by
            ``transform`` so batches can still be collated. Defaults to 224.
    """

    def __init__(self, data_list, transform=None, fallback_size=224):
        self.data_list = data_list
        self.transform = transform
        self.fallback_size = fallback_size

    def __len__(self):
        """Return the number of ``(path, label)`` pairs."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at ``idx``.

        The label is always a ``torch.long`` tensor. If the image cannot be
        read, a zero tensor of shape ``(3, fallback_size, fallback_size)`` is
        returned in its place together with the sample's own label.
        """
        img_path, label = self.data_list[idx]
        # CrossEntropyLoss requires labels as LongTensor
        target = torch.tensor(label, dtype=torch.long)

        img = cv2.imread(img_path)

        # cv2.imread signals an unreadable file by returning None, not by raising.
        if img is None:
            # The sample is not skipped: a blank image stands in for it so the
            # dataset length and batch shapes stay unchanged.
            print(f"Warning: Could not read image {img_path}. Using a blank image instead.")
            blank = torch.zeros((3, self.fallback_size, self.fallback_size))
            return blank, target

        # OpenCV loads BGR; the transforms downstream expect RGB channel order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            img = self.transform(img)

        return img, target

print("ImageDataset class defined.")

ImageDataset class defined.


In [None]:
# Cell 5: Define the SimpleCNN Model Architecture (with Regularization)
import torch.nn as nn

class SimpleCNN(nn.Module):
    """Two-block convolutional classifier with BatchNorm and Dropout.

    Each convolutional block is Conv(3x3, padding 1) -> BatchNorm -> ReLU ->
    MaxPool(2), so the spatial size is halved twice before the classifier head.

    Args:
        num_classes: Number of output logits. Defaults to 2.
        input_size: Height/width of the square input images. Used to size the
            first fully connected layer. Defaults to 224, which yields the
            ``32 * 56 * 56`` features the model was originally written for.

    Raises:
        ValueError: If ``input_size`` is smaller than 4 (nothing would be left
            after the two pooling layers).
    """

    def __init__(self, num_classes=2, input_size=224):
        super().__init__()

        if input_size < 4:
            raise ValueError(f"input_size must be at least 4, got {input_size}")

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16), # Helps with generalization
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32), # Helps with generalization
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.flatten = nn.Flatten()

        # The convolutions preserve spatial size and each max-pool halves it
        # (rounding down), so the feature map side is input_size // 4.
        feature_side = input_size // 4

        self.fc_layers = nn.Sequential(
            nn.Linear(32 * feature_side * feature_side, 128),
            nn.ReLU(),
            nn.Dropout(0.5), # "Turns off" 50% of neurons to prevent co-adaptation
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.conv_layers(x)
        x = self.flatten(x)
        x = self.fc_layers(x)
        return x

print("SimpleCNN model class defined.")

SimpleCNN model class defined.


In [None]:
# Cell 6: Create the Data Loaders (Balanced, NO Augmentation - FASTEST)
import glob
from torchvision import transforms
from torch.utils.data import DataLoader
import os
import random
from sklearn.model_selection import train_test_split

print("\n--- Step 2: Preparing Data Loaders ---")

# One seed for both the class balancing and the train/validation split.
# The split must be identical in every session: if it changed, resuming from a
# checkpoint would validate on images the model had already been trained on.
SPLIT_SEED = 42

# Class indices used by the classifier.
FAKE_LABEL, REAL_LABEL = 0, 1

# --- 1. Load REAL files from 140k dataset ---
REAL_DATA_PATH = '/content/dataset_140k/real_vs_fake/real-vs-fake'
# glob gives no ordering guarantee, so sort to make the seeded sampling repeatable.
real_files = sorted(
    glob.glob(os.path.join(REAL_DATA_PATH, 'train/real', '*.jpg')) +
    glob.glob(os.path.join(REAL_DATA_PATH, 'valid/real', '*.jpg'))
)
print(f"Found {len(real_files)} total REAL images.")

# --- 2. Load FAKE files (min-dalle ONLY) ---
FAKE_DATA_PATH = '/content/dataset_min-dalle/min-dalle'
fake_files = sorted(glob.glob(os.path.join(FAKE_DATA_PATH, '*.png')))
print(f"Found {len(fake_files)} total FAKE (min-dalle) images.")

# Stop here when data is missing: continuing would only surface later as an
# unrelated NameError on train_loader in the training cell.
if len(fake_files) == 0:
    raise FileNotFoundError(f"No fake files found. Please check the path: {FAKE_DATA_PATH}")
if len(real_files) == 0:
    raise FileNotFoundError(f"No real files found. Please check the path: {REAL_DATA_PATH}")

# NOTE(review): every REAL sample is a .jpg and every FAKE sample is a .png, so
# the classifier may be able to separate the classes from compression artifacts
# alone. Worth confirming the reported accuracy on re-encoded images.

# --- 3. Balance the dataset ---
# Down-sample whichever class is larger so both contribute the same count.
n_per_class = min(len(real_files), len(fake_files))
sampler = random.Random(SPLIT_SEED)
real_files_balanced = sampler.sample(real_files, n_per_class)
fake_files_balanced = sampler.sample(fake_files, n_per_class)
print(f"Balancing dataset: Using {len(real_files_balanced)} REAL images.")

# --- 4. Create master list and split ---
all_files_list = [(path, FAKE_LABEL) for path in fake_files_balanced] + \
                 [(path, REAL_LABEL) for path in real_files_balanced]

labels = [label for path, label in all_files_list]
train_list, valid_list = train_test_split(
    all_files_list,
    test_size=0.20,
    random_state=SPLIT_SEED,
    stratify=labels  # keep the 50/50 class ratio in both subsets
)

print(f"Found {len(all_files_list)} total balanced images.")
print(f"Split into {len(train_list)} training images.")
print(f"Split into {len(valid_list)} validation images.")

# --- 5. Create Transforms (NO Augmentation) ---
im_size = 224
# Per-channel normalization statistics.
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]

# Both train and validation use the same simple transform (no augmentation, for speed).
data_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((im_size, im_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

train_data = ImageDataset(train_list, transform=data_transforms)
valid_data = ImageDataset(valid_list, transform=data_transforms)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False, num_workers=2)

print("✅ Data loaders are ready.\n")


--- Step 2: Preparing Data Loaders ---
Found 60000 total REAL images.
Found 10969 total FAKE (min-dalle) images.
Balancing dataset: Using 10969 REAL images.
Found 21938 total balanced images.
Split into 17550 training images.
Split into 4388 validation images.
✅ Data loaders are ready.



In [None]:
# Cell 7: Train the Model (CHANGED: New Path, Weight Decay, Early Stopping)
import torch.optim as optim
from tqdm import tqdm
import time

print("--- Step 3: Setting Up for Training ---")
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed weight initialisation and DataLoader shuffling so a run can be repeated.
torch.manual_seed(42)

# Reuse the directory chosen in Cell 1 instead of repeating the literal path,
# and make sure it exists so torch.save cannot fail after a full epoch of work.
MODEL_DIR = NEW_MODEL_DIR
os.makedirs(MODEL_DIR, exist_ok=True)

MODEL_NAME = 'min-dalle-simple-cnn-v2'

BEST_MODEL_PATH = os.path.join(MODEL_DIR, f'{MODEL_NAME}_best_model.pth')
CHECKPOINT_PATH = os.path.join(MODEL_DIR, f'{MODEL_NAME}_checkpoint.pth')

print(f"Models and checkpoints will be saved in: {MODEL_DIR}")

# Initialize model and optimizer
model = SimpleCNN().to(device) # This uses the Cell 5 model with Dropout/BatchNorm
lr = 1e-4
num_epochs = 10

# weight_decay adds L2 regularization on top of Dropout/BatchNorm.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

# --- Early stopping state ---
patience = 7 # Stop if no improvement after 7 epochs
epochs_no_improve = 0
best_valid_acc = 0.0
start_epoch = 0

if os.path.exists(CHECKPOINT_PATH):
    # map_location lets a checkpoint written on GPU be resumed on a CPU runtime.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    # float() also accepts the 0-dim tensor that older checkpoints stored here.
    best_valid_acc = float(checkpoint.get('best_valid_acc', 0.0))
    epochs_no_improve = checkpoint.get('epochs_no_improve', 0)
    print(f"✅ Checkpoint found. Resuming training from epoch {start_epoch}")
else:
    print("ℹ️ No checkpoint found. Starting training from scratch.")

print(f"Training for {num_epochs} total epochs on device: {device}\n")


def run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise it is put in eval mode and gradients are
    disabled. Both returned values are plain Python floats averaged over
    ``len(loader.dataset)``.
    """
    is_train = optimizer is not None
    model.train(is_train)

    total_loss = 0.0
    total_correct = 0

    with torch.set_grad_enabled(is_train):
        for inputs, labels in tqdm(loader, desc=desc):
            inputs, labels = inputs.to(device), labels.to(device)
            if is_train:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)

            if is_train:
                loss.backward()
                optimizer.step()

            # criterion returns the batch mean; weight it by batch size so the
            # smaller final batch does not skew the epoch average.
            total_loss += loss.item() * inputs.size(0)
            total_correct += (preds == labels).sum().item()

    n_samples = len(loader.dataset)
    return total_loss / n_samples, total_correct / n_samples


# --- Training loop ---
total_train_time = 0.0

for epoch in range(start_epoch, num_epochs):
    epoch_start_time = time.time()

    train_loss, train_acc = run_epoch(
        model, train_loader, criterion, device,
        desc=f"Epoch {epoch+1}/{num_epochs} [Train]",
        optimizer=optimizer,
    )
    valid_loss, valid_acc = run_epoch(
        model, valid_loader, criterion, device,
        desc=f"Epoch {epoch+1}/{num_epochs} [Valid]",
    )

    epoch_time = time.time() - epoch_start_time
    total_train_time += epoch_time

    print(f"Epoch {epoch+1}/{num_epochs} ({epoch_time:.2f}s) | Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Valid Loss: {valid_loss:.4f} Acc: {valid_acc:.4f}")

    # --- Best-model tracking and early-stopping counter ---
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"🎉 New best model saved with accuracy: {best_valid_acc:.4f}")
        epochs_no_improve = 0 # Reset patience
    else:
        epochs_no_improve += 1
        print(f"Validation accuracy did not improve. Patience: {epochs_no_improve}/{patience}")

    # --- Save checkpoint after every epoch ---
    # Metrics are stored as plain Python numbers so the checkpoint does not
    # carry device-bound tensors.
    checkpoint_data = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_valid_acc': best_valid_acc,
        'epochs_no_improve': epochs_no_improve # Save patience counter
    }
    torch.save(checkpoint_data, CHECKPOINT_PATH)
    print(f"💾 Checkpoint saved for epoch {epoch+1}.\n")

    # --- Check for early stopping ---
    if epochs_no_improve >= patience:
        print(f"--- 🛑 Early stopping triggered after {patience} epochs with no improvement. ---")
        break


print(f"--- TRAINING COMPLETE ---")
print(f"Total training time: {total_train_time:.2f} seconds")
print(f"Best model saved to: {BEST_MODEL_PATH}")

--- Step 3: Setting Up for Training ---
Models and checkpoints will be saved in: /content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW
ℹ️ No checkpoint found. Starting training from scratch.
Training for 10 total epochs on device: cuda



Epoch 1/10 [Train]: 100%|██████████| 549/549 [01:14<00:00,  7.36it/s]
Epoch 1/10 [Valid]: 100%|██████████| 138/138 [00:16<00:00,  8.13it/s]


Epoch 1/10 (91.66s) | Train Loss: 0.1032 Acc: 0.9667 | Valid Loss: 0.0422 Acc: 0.9868
🎉 New best model saved with accuracy: 0.9868
💾 Checkpoint saved for epoch 1.



Epoch 2/10 [Train]: 100%|██████████| 549/549 [01:10<00:00,  7.75it/s]
Epoch 2/10 [Valid]: 100%|██████████| 138/138 [00:17<00:00,  7.91it/s]


Epoch 2/10 (88.30s) | Train Loss: 0.0243 Acc: 0.9929 | Valid Loss: 0.0123 Acc: 0.9954
🎉 New best model saved with accuracy: 0.9954
💾 Checkpoint saved for epoch 2.



Epoch 3/10 [Train]: 100%|██████████| 549/549 [01:12<00:00,  7.52it/s]
Epoch 3/10 [Valid]: 100%|██████████| 138/138 [00:16<00:00,  8.53it/s]


Epoch 3/10 (89.15s) | Train Loss: 0.0103 Acc: 0.9966 | Valid Loss: 0.0043 Acc: 0.9995
🎉 New best model saved with accuracy: 0.9995
💾 Checkpoint saved for epoch 3.



Epoch 4/10 [Train]: 100%|██████████| 549/549 [01:12<00:00,  7.57it/s]
Epoch 4/10 [Valid]: 100%|██████████| 138/138 [00:16<00:00,  8.62it/s]


Epoch 4/10 (88.55s) | Train Loss: 0.0124 Acc: 0.9952 | Valid Loss: 0.0073 Acc: 0.9977
Validation accuracy did not improve. Patience: 1/7
💾 Checkpoint saved for epoch 4.



Epoch 5/10 [Train]: 100%|██████████| 549/549 [01:11<00:00,  7.69it/s]
Epoch 5/10 [Valid]: 100%|██████████| 138/138 [00:17<00:00,  7.94it/s]


Epoch 5/10 (88.76s) | Train Loss: 0.0063 Acc: 0.9981 | Valid Loss: 0.0028 Acc: 0.9991
Validation accuracy did not improve. Patience: 2/7
💾 Checkpoint saved for epoch 5.



Epoch 6/10 [Train]: 100%|██████████| 549/549 [01:10<00:00,  7.80it/s]
Epoch 6/10 [Valid]: 100%|██████████| 138/138 [00:16<00:00,  8.62it/s]


Epoch 6/10 (86.44s) | Train Loss: 0.0057 Acc: 0.9983 | Valid Loss: 0.0020 Acc: 0.9995
Validation accuracy did not improve. Patience: 3/7
💾 Checkpoint saved for epoch 6.



Epoch 7/10 [Train]: 100%|██████████| 549/549 [01:11<00:00,  7.65it/s]
Epoch 7/10 [Valid]: 100%|██████████| 138/138 [00:15<00:00,  8.64it/s]


Epoch 7/10 (87.73s) | Train Loss: 0.0081 Acc: 0.9966 | Valid Loss: 0.0015 Acc: 0.9998
🎉 New best model saved with accuracy: 0.9998
💾 Checkpoint saved for epoch 7.



Epoch 8/10 [Train]: 100%|██████████| 549/549 [01:13<00:00,  7.46it/s]
Epoch 8/10 [Valid]: 100%|██████████| 138/138 [00:15<00:00,  8.76it/s]


Epoch 8/10 (89.35s) | Train Loss: 0.0063 Acc: 0.9978 | Valid Loss: 0.0048 Acc: 0.9986
Validation accuracy did not improve. Patience: 1/7
💾 Checkpoint saved for epoch 8.



Epoch 9/10 [Train]: 100%|██████████| 549/549 [01:10<00:00,  7.73it/s]
Epoch 9/10 [Valid]: 100%|██████████| 138/138 [00:17<00:00,  7.94it/s]


Epoch 9/10 (88.36s) | Train Loss: 0.0090 Acc: 0.9968 | Valid Loss: 0.0087 Acc: 0.9973
Validation accuracy did not improve. Patience: 2/7
💾 Checkpoint saved for epoch 9.



Epoch 10/10 [Train]: 100%|██████████| 549/549 [01:10<00:00,  7.77it/s]
Epoch 10/10 [Valid]: 100%|██████████| 138/138 [00:16<00:00,  8.56it/s]


Epoch 10/10 (86.79s) | Train Loss: 0.0027 Acc: 0.9991 | Valid Loss: 0.0046 Acc: 0.9993
Validation accuracy did not improve. Patience: 3/7
💾 Checkpoint saved for epoch 10.

--- TRAINING COMPLETE ---
Total training time: 885.07 seconds
Best model saved to: /content/drive/My Drive/DeepFakeDataset/NEW MODELS/NEW/min-dalle-simple-cnn-v2_best_model.pth
