In [1]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from pytorchvideo.data.encoded_video import EncodedVideo
from transformers import BertTokenizer
from torchvision.transforms import Compose, Lambda, Resize, Normalize, ColorJitter

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)



### [IGNORE] Initial Transform

In [2]:
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 8
sampling_rate = 8
frames_per_second = 30

# Note that this transform is specific to the slow_R50 model.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(8),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size=(crop_size, crop_size))
        ]
    ),
)

### [IGNORE] Sampling frames per clip divided over the video

In [3]:
video_root = '../train'
index = 0
data = pd.read_csv('../train.csv')


In [4]:
data.iloc[index]['video_id']+".mp4"

'KabnUV5luJ8.mp4'

In [5]:
video_path = os.path.join(video_root, data.iloc[index]['video_id']+".mp4")
label = data.iloc[index]['label']
text = data.iloc[index]['text']

In [6]:
print(f'video_path: {video_path}')

video_path: ../train/KabnUV5luJ8.mp4


In [7]:
print(f'label: {label}')

label: 1


In [8]:
print(f'text: {text}')

text: please name of the software deep fake try putting great thunberg on hitler true it wud be hard to tell dude you are too good cuz she looks like melania trump


In [9]:
# Load video using PyTorchVideo
video = EncodedVideo.from_path(video_path)

In [10]:
print(video)

<pytorchvideo.data.encoded_video_pyav.EncodedVideoPyAV object at 0x786f383f5580>


In [11]:
# Get video duration and calculate the step size for frame sampling
duration = video.duration
step = duration / num_frames
print(f'duration: {duration}')
print(f'step: {step}')

duration: 561152/11025
step: 70144/11025


In [12]:
# Sample frames at regular intervals
video_data = []
for i in range(num_frames):
    start_sec = i * step
    end_sec = start_sec + step
    clip = video.get_clip(start_sec=start_sec, end_sec=end_sec)
    video_data.append(transform(clip))

KeyboardInterrupt: 

In [None]:
print(len(video_data))

32


In [None]:
print(video_data[0]['video'].shape)

torch.Size([3, 1, 256, 256])


In [None]:
# Stack the sampled frames
# Extract video tensors from each dictionary
video_tensors = [item['video'] for item in video_data]

# Stack video tensors along the frames dimension
stacked_video = torch.stack(video_tensors)

In [None]:
stacked_video = stacked_video.squeeze(2).permute(1, 0, 2, 3)

In [None]:
stacked_video.shape

torch.Size([3, 32, 256, 256])

In [None]:
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)

Using cache found in /home/davendra/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [None]:
input = stacked_video.unsqueeze(0)
input.shape

torch.Size([1, 3, 32, 256, 256])

In [None]:
pred = model(input)
pred.shape

torch.Size([1, 400])

In [None]:
feature_extractor = torch.nn.Sequential(*list(model.blocks.children())[:-1])

In [None]:
pred2 = feature_extractor(input)
pred2.shape

torch.Size([1, 2048, 32, 8, 8])

In [None]:
pred3 = pred2.permute(0, 2, 3, 4, 1)
pred3.shape

torch.Size([1, 32, 8, 8, 2048])

### [IGNORE] Sampling over the entire video

In [None]:
video_root = '../train'
index = 0
data = pd.read_csv('../train.csv')

In [None]:
video_path = os.path.join(video_root, data.iloc[index]['video_id']+".mp4")
label = data.iloc[index]['label']
text = data.iloc[index]['text']

In [None]:
print(f'video_path: {video_path}')

video_path: ../train/KabnUV5luJ8.mp4


In [None]:
print(f'label: {label}')

label: 1


In [None]:
print(f'text: {text}')

text: please name of the software deep fake try putting great thunberg on hitler true it wud be hard to tell dude you are too good cuz she looks like melania trump


In [None]:
clip_duration = (num_frames * sampling_rate)/frames_per_second
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)

print(video)

<pytorchvideo.data.encoded_video_pyav.EncodedVideoPyAV object at 0x78e3398e71d0>


In [None]:
# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

print(video_data['video'].shape)

# Apply a transform to normalize the video input
video_data = transform(video_data)

print(video_data['video'].shape)

torch.Size([3, 64, 720, 1280])
torch.Size([3, 8, 256, 256])


### Dataset Object

In [2]:
class DeepFakeDataset(Dataset):
    def __init__(self, video_path_file, text_csv_file, text_transforms=None, video_transforms=None, num_frames=8, sampling_rate=8, frames_per_second=30):
        # self.root = video_root
        self.video_annotation = pd.read_csv(video_path_file)
        self.text_df = pd.read_csv(text_csv_file)
        self.text_transforms = text_transforms
        self.video_transforms = video_transforms
        self.num_frames = num_frames
        self.sampling_rate = sampling_rate
        self.frames_per_second = frames_per_second

    def __len__(self):
        return len(self.video_annotation)

    def __getitem__(self, index):

        video_path = self.video_annotation.iloc[index]['video_path']
        label = self.video_annotation.iloc[index]['label']
        text = self.text_df.iloc[index]['text']

        try:
            # Load video using PyTorchVideo
            video = EncodedVideo.from_path(video_path)

            # Get video duration and calculate the step size for frame sampling
            duration = video.duration
            step = duration / self.num_frames
            print(f'Video length: {duration}')

            # Sample frames at regular intervals
            video_data = []
            for i in range(self.num_frames):
                start_sec = i * step
                end_sec = start_sec + step
                clip = video.get_clip(start_sec=start_sec, end_sec=end_sec)
                print(f'clip shape: {clip['video'].shape}')
                transformed_clip = self.video_transforms(clip['video'])
                print(f'Transformed Clip: {transformed_clip.shape}')
                video_data.append(transformed_clip)

            # Stack the sampled frames
            video_data = torch.stack(video_data)
            # print(video_data)
            print(f'Video_data shape: {video_data.shape}')
            print('--------------------------------------------\n')
            # Stack the sampled frames
            # Extract video tensors from each dictionary
            # video_tensors = [item['video'] for item in video_data]

            # Stack video tensors along the frames dimension
            # video_frames = torch.stack(video_tensors).squeeze(2).permute(1, 0, 2, 3)

            # clip_duration = min((num_frames * sampling_rate) / frames_per_second, video.duration)
            # start_sec = 0
            # clip_duration = video.duration
            # end_sec = start_sec + clip_duration

            # video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
            # print(f'video_data shape: {video_data['video'].shape}')

            # # Apply a transform to normalize the video input
            # if self.video_transforms:
            #     video_data = self.video_transforms(video_data['video'])
            #     print(f'Transformed Video data shape: {video_data.shape}')

        except Exception as e:
            print(f'Error Processing video {video_path}: {e}')
            # print(f'Clip Duration: {clip_duration}')
            # print(f'Video Duration: {video.duration}')

        # Apply text transforms
        # if self.text_transforms:
        #     text_data = self.text_transforms(text)
        # else:
        
        text_data = text

        return {
            'video': video_data,
            'text': text_data,
            'label': torch.tensor(label, dtype=torch.long)
        }

### Example using Dataset

In [3]:
# Create instances of text and video transforms
text_transforms = Compose([
    BertTokenizer.from_pretrained('bert-base-uncased'),
    # Add more text transformations as needed
])

side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 30
sampling_rate = 8
frames_per_second = 30


video_transforms = Compose(
        [
            UniformTemporalSubsample(1),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size=(crop_size, crop_size))
        ]
    )

# Create an instance of the dataset
video_path_file = '../annotations/video_train_path.csv'
text_csv_file = '../annotations/text_train.csv'

dataset = DeepFakeDataset(
                            video_path_file=video_path_file,
                            text_csv_file=text_csv_file,
                            text_transforms=text_transforms,
                            video_transforms=video_transforms,
                            num_frames=num_frames,
                            sampling_rate=sampling_rate,
                            frames_per_second=frames_per_second
                        )

In [4]:
# Create a dataloader
batch_size = 8
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=1)

In [5]:
cnt = 0
video_train=list()
for i, data in enumerate(dataloader):
    video_train.append(data['video'])
    print(f'Finish Batch {i}')
    print('--------------------------------------------\n')
    cnt += 1

    if cnt == 4:
        break

Video length: 349349/7500
clip shape: torch.Size([3, 47, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 47, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 46, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 47, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 46, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 47, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 46, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 47, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 46, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 47, 720, 1280])
Transformed Clip: torch.Size([3, 1, 256, 256])
clip shape: torch.Size([3, 46, 720, 1280])
Transformed Clip: torch.Size([3

In [37]:
video_train[0].shape

torch.Size([8, 30, 3, 1, 256, 256])

In [38]:
video = video_train[0].squeeze(3).permute(0, 2, 1, 3, 4)
video.shape

torch.Size([8, 3, 30, 256, 256])

In [32]:
import torch
# Choose the `slow_r50` model 
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)

Using cache found in /home/davendra/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [39]:
model = model.eval()


In [40]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [41]:
input = video[0].unsqueeze(0)
input.shape

torch.Size([1, 3, 30, 256, 256])

In [42]:
pred = model(input)

In [43]:
pred.shape

torch.Size([1, 400])

In [44]:
input.shape

torch.Size([1, 3, 30, 256, 256])

In [45]:
resnet_feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])

In [46]:
output = resnet_feature_extractor(input)

In [47]:
output.shape

torch.Size([1, 3, 30, 256, 256])

In [13]:
inputs = video.to(device)

In [14]:
model = model.to(device)

In [15]:
next(model.parameters()).is_cuda

True

In [16]:
inputs.is_cuda

True

In [None]:
inputs.to("cpu")

In [21]:
inputs.cpu()

tensor([[[[[ 4.0425e-01,  4.0425e-01,  4.0425e-01,  ..., -1.5449e+00,
            -1.5449e+00, -1.5449e+00],
           [ 3.6529e-01,  3.6529e-01,  3.6529e-01,  ..., -1.5007e+00,
            -1.5007e+00, -1.5007e+00],
           [ 3.4475e-01,  3.4475e-01,  3.4475e-01,  ..., -1.4946e+00,
            -1.4946e+00, -1.4946e+00],
           ...,
           [-9.8696e-01, -1.1350e+00, -1.2726e+00,  ..., -8.1645e-01,
            -8.3224e-01, -8.3224e-01],
           [-1.0240e+00, -1.2354e+00, -1.4418e+00,  ..., -8.2414e-01,
            -8.3224e-01, -8.3224e-01],
           [-1.1967e+00, -1.4319e+00, -1.5001e+00,  ..., -8.4459e-01,
            -8.4459e-01, -8.4459e-01]],

          [[ 1.2258e-01,  1.4953e-01,  1.9194e-01,  ..., -1.3352e+00,
            -1.1919e+00, -1.1583e+00],
           [ 3.3662e-01,  3.5989e-01,  3.6172e-01,  ..., -1.3240e+00,
            -1.1820e+00, -1.1583e+00],
           [ 4.6741e-01,  4.6161e-01,  4.6161e-01,  ..., -1.3137e+00,
            -1.2048e+00, -1.1552e+00],
 

In [23]:
input = inputs[0].unsqueeze(0)
input.shape

torch.Size([1, 3, 30, 256, 256])

In [24]:
input = input.to(device)

In [None]:
pred = model(input)