In [1]:
# Clone the project repository; the next cell installs from its requirements.txt.
!git clone https://github.com/kiyoshi2000/automathon-2024-B.git

Cloning into 'automathon-2024-B'...
remote: Enumerating objects: 401, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 401 (delta 143), reused 142 (delta 142), pack-reused 254[K
Receiving objects: 100% (401/401), 947.36 KiB | 18.22 MiB/s, done.
Resolving deltas: 100% (229/229), done.


In [2]:
!pip install -r automathon-2024-B/requirements.txt

Collecting appnope==0.1.4 (from -r automathon-2024-B/requirements.txt (line 2))
  Downloading appnope-0.1.4-py2.py3-none-any.whl.metadata (908 bytes)
Collecting comm==0.2.2 (from -r automathon-2024-B/requirements.txt (line 7))
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting contourpy==1.2.1 (from -r automathon-2024-B/requirements.txt (line 8))
  Downloading contourpy-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting debugpy==1.8.1 (from -r automathon-2024-B/requirements.txt (line 10))
  Downloading debugpy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting exceptiongroup==1.2.1 (from -r automathon-2024-B/requirements.txt (line 13))
  Downloading exceptiongroup-1.2.1-py3-none-any.whl.metadata (6.6 kB)
Collecting filelock==3.13.4 (from -r automathon-2024-B/requirements.txt (line 15))
  Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting fonttools==4.51.0 (fr

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
import torch.optim as optim
import torchvision.io as io
import os
import json
from tqdm import tqdm
import csv
import timm
import wandb

from PIL import Image
import torchvision.transforms as transforms

In [4]:
class VideoDataset(Dataset):
    """
    Dataset of ``.mp4`` videos that yields a fixed number of frames per video.

    Each item is decoded with ``torchvision.io.read_video``, permuted so the
    channel axis comes second (``[frames, channels, height, width]``),
    sub-sampled to ``nb_frames`` frames, divided by 255 and finally passed
    through the optional ``trans`` callable.

    No resizing happens in this class (the resize call is commented out), so
    the spatial size of the returned tensor is whatever the decoder and
    ``trans`` produce.

    Args:
        root_dir: Directory containing ``dataset.csv`` and the ``dataset/``
            folder with the ``train_dataset``, ``test_dataset`` and
            ``experimental_dataset`` sub-folders.
        dataset_choice: One of ``"train"``, ``"test"`` or ``"experimental"``.
        nb_frames: Number of frames sampled from every video.
        trans: Optional callable applied to the whole stack of frames.

    Raises:
        ValueError: If ``dataset_choice`` is unknown or ``nb_frames`` < 1.
    """

    # Sub-folder (relative to ``root_dir``) holding the videos of each split.
    _SPLIT_DIRS = {
        "train": "dataset/train_dataset",
        "test": "dataset/test_dataset",
        "experimental": "dataset/experimental_dataset",
    }

    def __init__(
        self, root_dir, dataset_choice="train", nb_frames=10, trans=None
    ):
        super().__init__()
        if dataset_choice not in self._SPLIT_DIRS:
            raise ValueError("choice must be 'train', 'test' or 'experimental'")
        if nb_frames < 1:
            raise ValueError("nb_frames must be a positive integer")

        self.dataset_choice = dataset_choice
        self.transforms = trans
        # Keep the frame count on the instance: __getitem__ must not depend on
        # a module-level ``nb_frames`` variable happening to exist.
        self.nb_frames = nb_frames
        self.root_dir = os.path.join(root_dir, self._SPLIT_DIRS[dataset_choice])

        # dataset.csv: the second column is the key (looked up with the video
        # file name in __getitem__), the first column is the associated id.
        # newline='' is what the csv module expects for files it parses.
        with open(os.path.join(root_dir, "dataset.csv"), 'r', newline='') as file:
            reader = csv.reader(file)
            # Rows with fewer than two columns (e.g. blank lines) are skipped.
            self.ids = {row[1]: row[0] for row in reader if len(row) >= 2}

        if self.dataset_choice == "test":
            # The test split has no labels.
            self.data = None
        else:
            with open(os.path.join(self.root_dir, "metadata.json"), 'r') as file:
                raw_labels = json.load(file)
            # 'FAKE' -> 1.0, anything else -> 0.0
            self.data = {
                name: torch.tensor(1.0 if tag == 'FAKE' else 0.0)
                for name, tag in raw_labels.items()
            }

        # os.listdir returns entries in arbitrary order; sorting makes the
        # index -> video mapping reproducible across runs and machines.
        self.video_files = sorted(
            f for f in os.listdir(self.root_dir) if f.endswith('.mp4')
        )

    def __len__(self):
        """Return the number of ``.mp4`` files found in the split folder."""
        return len(self.video_files)

    def __getitem__(self, idx):
        """
        Load, sub-sample and transform the video at position ``idx``.

        Returns:
            ``(video, ID)`` for the test split, ``(video, label, ID)`` otherwise.

        Raises:
            ValueError: If no frame could be decoded from the file.
            KeyError: If the file name is missing from ``dataset.csv`` or
                (for labelled splits) from ``metadata.json``.
        """
        video_name = self.video_files[idx]
        video_path = os.path.join(self.root_dir, video_name)
        video, _audio, _info = io.read_video(video_path, pts_unit='sec')

        # Move the channel axis next to the frame axis.
        video = video.permute(0, 3, 1, 2)
        length = video.shape[0]
        if length == 0:
            raise ValueError(f"no frames could be decoded from {video_path}")
        video = video[self._frame_indices(length, self.nb_frames)]

        # Scale pixel values by 1/255; resizing is intentionally left out here.
        video = video / 255
        video = self._apply_transforms(video)

        ID = self.ids[video_name]
        if self.dataset_choice == "test":
            return video, ID
        label = self.data[video_name]
        return video, label, ID

    @staticmethod
    def _frame_indices(length, nb_frames):
        """
        Return ``nb_frames`` indices spaced ``length // nb_frames`` apart from 0.

        When ``length < nb_frames`` the stride is 0, so index 0 is repeated.
        """
        stride = length // nb_frames
        return [i * stride for i in range(nb_frames)]

    def _apply_transforms(self, stack):
        """Apply the configured transforms to the stack of frames, if any."""
        if self.transforms is None:
            return stack
        return self.transforms(stack)
    


In [7]:
# Experiment settings (timm is already imported in the imports cell).
dataset_dir = "/kaggle/input/automathon-deepfake"
nb_frames   = 10
batch_size  = 64
learning_rate = 1e-3


def drop_channel_axis(frames):
    """Remove the single channel axis left by Grayscale: [T, 1, H, W] -> [T, H, W].

    Squeezing axis 1 explicitly (instead of a bare ``torch.squeeze``) keeps the
    frame axis intact even when ``nb_frames == 1``.
    """
    return frames.squeeze(1)


# Grayscale every frame, stack the frames as channels, then randomly flip.
trans = transforms.Compose([
        transforms.Grayscale(),
        drop_channel_axis,
        transforms.RandomHorizontalFlip(),
    ])

experimental_dataset = VideoDataset(
    dataset_dir,
    dataset_choice="experimental",
    nb_frames=nb_frames,
    trans=trans
)

train_dataloader = DataLoader(experimental_dataset, batch_size=batch_size, shuffle=True)
# The frames of a video are fed to the network as input channels.
model = timm.create_model("resnet18", pretrained=True, num_classes=2, in_chans=nb_frames)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# NOTE(review): nn.BCELoss expects probabilities in [0, 1] with the same shape
# as the target, while the model is built with num_classes=2 and the dataset
# yields scalar 0/1 labels. Trainer is defined outside this cell -- confirm it
# applies a sigmoid and reshapes the output; otherwise switch to num_classes=1
# together with nn.BCEWithLogitsLoss.
loss_fn = nn.BCELoss()

trainer = Trainer(model, train_dataloader, loss_fn, optimizer, None)
trainer.train(1)

model.safetensors:   0%|          | 0.00/379M [00:00<?, ?B/s]

Beit(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(10, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (drop