In [1]:
import os
import cv2
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import ViTFeatureExtractor, ViTForImageClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask PyTorch to release any cached, currently unused GPU memory before the run starts.
torch.cuda.empty_cache()

In [3]:
# Define a custom dataset class
class VideoFrameDataset(Dataset):
    """Dataset yielding the first frame of each video file together with a class label.

    ``root_dir`` is expected to contain one sub-directory per category. Every
    entry found in a category directory is recorded as a video path, and its
    label is the position of that category in ``categories`` (with the
    defaults: ``original`` -> 0, ``altered`` -> 1).

    Args:
        root_dir: Directory that holds the category sub-directories.
        transform: Optional callable applied to the RGB frame before it is
            returned from ``__getitem__``.
        categories: Category sub-directory names, in label order.

    Attributes:
        frames: List of video file paths (one per sample).
        labels: List of integer labels, aligned with ``frames``.
    """

    def __init__(self, root_dir, transform=None, categories=('original', 'altered')):
        self.root_dir = root_dir
        self.transform = transform
        self.frames = []
        self.labels = []

        # Load data paths and labels
        for label, category in enumerate(categories):
            category_dir = os.path.join(root_dir, category)
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # index -> sample mapping identical from run to run.
            for video_file in sorted(os.listdir(category_dir)):
                self.frames.append(os.path.join(category_dir, video_file))
                self.labels.append(label)

    def __len__(self):
        """Return the number of video files discovered under ``root_dir``."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Read the first frame of video ``idx`` and return ``(frame, label)``.

        Raises:
            ValueError: If no frame could be read from the video file.
        """
        video_path = self.frames[idx]
        cap = cv2.VideoCapture(video_path)
        try:
            ret, frame = cap.read()
        finally:
            # Always give the capture handle back, even if read() fails.
            cap.release()

        if not ret or frame is None:
            raise ValueError(f"Unable to read frame from {video_path}")

        # OpenCV decodes to BGR; the rest of the pipeline works on RGB.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Apply transformations (explicit None check so a falsy callable still runs)
        if self.transform is not None:
            frame = self.transform(frame)

        return frame, self.labels[idx]

In [4]:
# Per-frame preprocessing pipeline: frame -> PIL image -> 224x224 resize -> tensor
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

In [5]:
# Initialize ViT feature extractor
# Loads the preprocessing configuration associated with the
# "google/vit-base-patch16-224" checkpoint; the training cells call it on PIL
# frames and read `.pixel_values` from the result to feed the model.
# NOTE(review): newer transformers releases appear to deprecate
# ViTFeatureExtractor in favour of ViTImageProcessor — confirm against the
# installed version before switching.
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")



In [6]:
# Helper that transforms a collection of frames and stacks them into one batch
def preprocess_and_batch(frames):
    """Apply the module-level ``transform`` to every frame and stack the results.

    Args:
        frames: Iterable of frames accepted by ``transform``.

    Returns:
        One tensor produced by ``torch.stack`` over the transformed frames.
    """
    tensors = list(map(transform, frames))
    return torch.stack(tensors)

In [7]:
# Dataset locations.
# The data is read from a local ./dataset/ folder with train/val/test splits.
# (An earlier Colab setup used /content/drive/MyDrive/Deepfake/ with
# trainnew/valnew/testnew split folders instead.)
datadir = './dataset/'

train_dir = f"{datadir}train"
val_dir = f"{datadir}val"
test_dir = f"{datadir}test"

In [8]:
# One dataset per split, all sharing the same per-frame preprocessing
train_dataset = VideoFrameDataset(train_dir, transform=transform)
val_dataset = VideoFrameDataset(val_dir, transform=transform)
test_dataset = VideoFrameDataset(test_dir, transform=transform)

# Batch each split; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

# Load the pretrained ViT checkpoint with a 2-class classification head.
# The checkpoint's classifier has a different size, so the mismatch is
# ignored and the head is newly initialized.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    ignore_mismatched_sizes=True,
    num_labels=2,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Cross-entropy over the class logits, optimised with Adam on every model parameter
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [10]:
# Training loop: fine-tune `model` on `train_loader` and report the running
# loss every 10 batches plus the training accuracy after every epoch.
num_epochs = 10
to_pil = transforms.ToPILImage()  # built once instead of once per frame

for epoch in range(num_epochs):
    model.train()  # switch to training mode (was previously called twice per epoch)
    correct = 0
    total = 0
    for batch_idx, (frames, labels) in enumerate(train_loader):
        optimizer.zero_grad()

        # The feature extractor is fed PIL images, so convert each frame
        # tensor of the batch back into one.
        frames = [to_pil(frame) for frame in frames]

        # Process frames through feature extractor.
        # The former `.squeeze(1)` was dropped: dim 1 is the channel axis
        # (3 for the RGB frames produced by the dataset), so it had no effect.
        inputs = feature_extractor(images=frames, return_tensors="pt").pixel_values
        outputs = model(inputs).logits

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Calculate accuracy; detach() replaces the legacy `.data` access.
        predicted = outputs.detach().argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        if batch_idx % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item()}")

    # Print accuracy after each epoch (guarded so an empty loader does not
    # raise ZeroDivisionError)
    accuracy = 100 * correct / total if total else 0.0
    print(f'Accuracy of the model after epoch {epoch+1} is: {accuracy}%')

Epoch [1/10], Batch [1/44], Loss: 0.6924560070037842
Epoch [1/10], Batch [11/44], Loss: 0.7323794960975647
Epoch [1/10], Batch [21/44], Loss: 0.7128832936286926
Epoch [1/10], Batch [31/44], Loss: 0.7148308753967285
Epoch [1/10], Batch [41/44], Loss: 0.7348440289497375
Accuracy of the model after epoch 1 is: 45.80965909090909%
Epoch [2/10], Batch [1/44], Loss: 0.7213559150695801
Epoch [2/10], Batch [11/44], Loss: 0.6941457986831665
Epoch [2/10], Batch [21/44], Loss: 0.6998175382614136
Epoch [2/10], Batch [31/44], Loss: 0.6839178204536438
Epoch [2/10], Batch [41/44], Loss: 0.7202833294868469
Accuracy of the model after epoch 2 is: 50.28409090909091%
Epoch [3/10], Batch [1/44], Loss: 0.6749675869941711
Epoch [3/10], Batch [11/44], Loss: 0.6948589086532593
Epoch [3/10], Batch [21/44], Loss: 0.6302136778831482
Epoch [3/10], Batch [31/44], Loss: 0.6979066133499146
Epoch [3/10], Batch [41/44], Loss: 0.7203617691993713
Accuracy of the model after epoch 3 is: 58.38068181818182%
Epoch [4/10], Ba

In [11]:
import torch

# Training loop with per-epoch validation, best-checkpoint saving and test evaluation.
# NOTE(review): `model` and `optimizer` are reused from earlier cells, so if a
# previous training cell has already run, this continues fine-tuning from that
# state rather than starting from the pretrained checkpoint.
num_epochs = 10
best_val_accuracy = 0  # Variable to track the best validation accuracy


def _frames_to_inputs(frames):
    """Convert a batch of frame tensors into ViT ``pixel_values`` via the feature extractor."""
    to_pil = transforms.ToPILImage()
    pil_frames = [to_pil(frame) for frame in frames]
    # No `.squeeze(1)`: dim 1 is the channel axis (3 for RGB frames), so it had no effect.
    return feature_extractor(images=pil_frames, return_tensors="pt").pixel_values


def _percentage(correct, total):
    """Return ``100 * correct / total``, or 0.0 when ``total`` is zero (empty loader)."""
    return 100 * correct / total if total else 0.0


def _evaluate(loader):
    """Return the model's accuracy in percent over ``loader``, in eval mode and without gradients."""
    model.eval()  # set explicitly so every evaluation pass is self-contained
    correct = 0
    total = 0
    with torch.no_grad():
        for frames, labels in loader:
            outputs = model(_frames_to_inputs(frames)).logits
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return _percentage(correct, total)


for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_correct = 0
    train_total = 0
    for batch_idx, (frames, labels) in enumerate(train_loader):
        optimizer.zero_grad()

        outputs = model(_frames_to_inputs(frames)).logits

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Calculate training accuracy; detach() replaces the legacy `.data` access.
        predicted = outputs.detach().argmax(dim=1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = _percentage(train_correct, train_total)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Accuracy: {train_accuracy}%')

    # Validation phase
    val_accuracy = _evaluate(val_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Accuracy: {val_accuracy}%')

    # Save the model if it has the best validation accuracy so far
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), f"best_model_epoch_{epoch+1}.pth")
        print(f"Best model saved at epoch {epoch+1} with validation accuracy: {val_accuracy}%")

    # Test phase.
    # NOTE(review): the test split is scored every epoch for monitoring only;
    # checkpoint selection above is driven solely by validation accuracy.
    test_accuracy = _evaluate(test_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {test_accuracy}%')


Epoch [1/10], Train Accuracy: 93.67897727272727%
Epoch [1/10], Validation Accuracy: 65.56291390728477%
Best model saved at epoch 1 with validation accuracy: 65.56291390728477%
Epoch [1/10], Test Accuracy: 64.78405315614619%
Epoch [2/10], Train Accuracy: 94.74431818181819%
Epoch [2/10], Validation Accuracy: 69.86754966887418%
Best model saved at epoch 2 with validation accuracy: 69.86754966887418%
Epoch [2/10], Test Accuracy: 65.4485049833887%
Epoch [3/10], Train Accuracy: 94.38920454545455%
Epoch [3/10], Validation Accuracy: 67.21854304635761%
Epoch [3/10], Test Accuracy: 62.458471760797345%
Epoch [4/10], Train Accuracy: 96.5909090909091%
Epoch [4/10], Validation Accuracy: 69.5364238410596%
Epoch [4/10], Test Accuracy: 65.4485049833887%
Epoch [5/10], Train Accuracy: 97.51420454545455%
Epoch [5/10], Validation Accuracy: 68.54304635761589%
Epoch [5/10], Test Accuracy: 64.45182724252491%
Epoch [6/10], Train Accuracy: 96.73295454545455%
Epoch [6/10], Validation Accuracy: 67.54966887417218%

In [14]:
import torch
# Report whether a CUDA device is visible to PyTorch
print(
    f"GPU: {torch.cuda.get_device_name(0)} is available."
    if torch.cuda.is_available()
    else "No GPU available. Training will run on CPU."
)

No GPU available. Training will run on CPU.
