In [1]:
!ls /kaggle/input/dfdc-sample/dfdc-sample | wc

  16210   16210  243149


### kreiranje balansiranog dataseta za train/validation split

In [2]:
import os
import glob
import json
import cv2
import pandas as pd
import numpy as np

In [3]:
# Location of the dataset's metadata file.
file_path = "/kaggle/input/dfdc-sample/dfdc-sample/metadata.json"

# Parse the metadata JSON into `data` (the later cells index it by filename
# and read each entry's 'label' field).
with open(file_path, "r") as file:
    data = json.load(file)

In [4]:
# Show the Python type of the parsed metadata.
type(data)

dict

In [5]:
# Partition the metadata keys (video filenames) by their 'label' field.
realVideos = [name for name, meta in data.items() if meta['label'] == 'REAL']
fakeVideos = [name for name, meta in data.items() if meta['label'] == 'FAKE']

In [6]:
# Number of REAL and FAKE videos, as a (real, fake) pair.
len(realVideos), len(fakeVideos)

(2450, 13759)

In [7]:
# One row per metadata entry; reset_index() moves the dict keys (filenames)
# out of the index into a regular column named 'index'.
df = pd.DataFrame.from_dict(data, orient='index').reset_index()

In [8]:
# Preview the first rows of the metadata table.
df.head()

Unnamed: 0,index,label,split,original
0,hsypgwsufp.mp4,FAKE,train,nbnipejygk.mp4
1,ntzgbkzofo.mp4,FAKE,train,cqlarprtdy.mp4
2,ataulynpgd.mp4,FAKE,train,uzrkbzwdvi.mp4
3,idzntwkkjy.mp4,FAKE,train,lvnjzrvzwy.mp4
4,rdqokuannd.mp4,FAKE,train,mujubwlspn.mp4


In [9]:
# Give the former index column a descriptive name.
df = df.rename(columns={'index': 'filename'})

In [10]:
# (rows, columns) of the metadata table.
df.shape

(16209, 4)

In [11]:
# Relative frequency of each label, to see how imbalanced the classes are.
df.label.value_counts(normalize=True)

label
FAKE    0.848849
REAL    0.151151
Name: proportion, dtype: float64

In [12]:
# Split the table into one frame per label.
real_df = df[df['label'] == "REAL"]
fake_df = df[df['label'] == "FAKE"]

In [13]:
# Undersample the FAKE rows to exactly as many rows as there are REAL rows.
# random_state=42 fixes the seed so the same rows are drawn on every run.
fake_df_sampled = fake_df.sample(n=len(real_df), random_state=42)

In [14]:
# Stack all REAL rows followed by the sampled FAKE rows and renumber the index.
# The rows are not shuffled here, so the result is ordered by label.
balanced_df = pd.concat([real_df, fake_df_sampled]).reset_index(drop=True)

In [15]:
# Check that both labels now have the same number of rows.
balanced_df.label.value_counts()

label
REAL    2450
FAKE    2450
Name: count, dtype: int64

In [16]:
def saveToJSON(df, path):
    """Write a metadata frame to ``path`` as JSON keyed by filename.

    The output maps each value of the 'filename' column to a dict of that
    row's remaining columns, e.g.
    ``{"abc.mp4": {"label": "FAKE", "split": "train", "original": "xyz.mp4"}}``.

    The 'original' key is omitted from an entry when the row is labelled
    'REAL' or when its value is missing (``None`` or NaN). Testing with
    ``pd.isna`` matters because pandas represents missing values as NaN,
    which an ``is None`` check does not catch and which ``json`` would
    serialise as the non-standard token ``NaN``.

    Args:
        df: DataFrame with a 'filename' column; 'label' and 'original'
            columns are honoured when present. The frame is not modified.
        path: Destination file path; an existing file is overwritten.

    Raises:
        KeyError: if ``df`` has no 'filename' column.
        ValueError: if 'filename' values are not unique (raised by
            ``DataFrame.to_dict(orient='index')``).
    """
    # set_index returns a new frame, so the caller's DataFrame is untouched.
    records = df.set_index('filename').to_dict(orient='index')

    for meta in records.values():
        if 'original' not in meta:
            continue
        if meta.get('label') == 'REAL' or pd.isna(meta['original']):
            del meta['original']

    with open(path, 'w') as file:
        json.dump(records, file, indent=2)

In [17]:
# Persist the balanced metadata. The path is relative to the current working
# directory; a later cell reads it back from /kaggle/working/balanced.json.
# NOTE(review): this presumably relies on the notebook's working directory
# being /kaggle/working — confirm.
saveToJSON(balanced_df.copy(), 'balanced.json')

In [18]:
# Preview the first rows of the balanced table.
balanced_df.head()

Unnamed: 0,filename,label,split,original
0,jickjfbicd.mp4,REAL,train,
1,kclfhzfwpn.mp4,REAL,train,
2,ibgrtmlmjk.mp4,REAL,train,
3,rcttjovqdv.mp4,REAL,train,
4,ruraoelttp.mp4,REAL,train,


In [19]:
# Directory that holds the source videos.
video_dir = "/kaggle/input/dfdc-sample/dfdc-sample"
# The balanced metadata is read from the working directory, not from video_dir
# (alternative kept for reference: os.path.join(video_dir, "balanced.json")).
json_path = "/kaggle/working/balanced.json"

In [20]:
# Load the balanced metadata. This rebinds `data`, which previously held the
# contents of the full metadata.json.
with open(json_path, "r") as f:
    data = json.load(f)

In [21]:
# The JSON keys are the video filenames.
video_files = list(data.keys())

print("Number of videos in balanced.json file:", len(video_files))

Number of videos in balanced.json file: 4900


In [22]:
# Check which of the videos listed in balanced.json exist in video_dir.
existing_videos = []
missing_videos = []
for vf in video_files:
    full_path = os.path.join(video_dir, vf)
    if os.path.exists(full_path):
        existing_videos.append(vf)
    else:
        missing_videos.append(vf)

# Report how many listed videos were found on disk; the missing ones are
# printed only when there are any.
# NOTE(review): the message says "U" (union), but the number printed is the
# count of listed videos that are also present in video_dir (an intersection).
print("video dir U balanced.json: ", len(existing_videos))
if missing_videos:
    print("missing videos: ", missing_videos)

video dir U balanced.json:  4900


In [23]:
# Root directory for the extracted face crops and labels.json.
output_dir = "/kaggle/working/preprocessed_data"
# exist_ok=True makes this cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [24]:
# Reload the balanced metadata into `data`.
# NOTE(review): an earlier cell already loads the same json_path into `data`;
# this reload looks redundant unless `data` is changed in between — confirm.
with open(json_path, "r") as f:
    data = json.load(f)

In [25]:
# Load the Haar cascade used for frontal-face detection.
face_cascade_path = "/kaggle/input/haarcascade/haarcascade_frontalface_default.xml"
face_cascade = cv2.CascadeClassifier(face_cascade_path)

# CascadeClassifier does not raise when the file is missing or invalid; it just
# stays empty and fails later inside detectMultiScale. Fail here with a clear
# message instead.
if face_cascade.empty():
    raise FileNotFoundError(f"Could not load Haar cascade from {face_cascade_path}")

In [26]:
# Step between sampled frame indices: the extraction loop visits frames
# 0, 30, 60, ... of each video.
frame_interval = 30
# Size passed to cv2.resize for every saved face crop.
output_size = (224, 224)

# Filled by the extraction loop: video filename -> label, for videos that
# produced at least one saved crop.
labels = {}

In [27]:
def get_closest_face(previous_bbox, current_faces):
    """Pick the detection whose centre is nearest to the previous box's centre.

    Args:
        previous_bbox: Indexable ``(x, y, w, h)`` box from the previous frame.
        current_faces: Iterable of ``(x, y, w, h)`` boxes for the current frame.

    Returns:
        The nearest box as an ``(x, y, w, h)`` tuple, or ``None`` when
        ``current_faces`` is empty. On equal distances the earliest box in
        ``current_faces`` wins.
    """
    candidates = list(current_faces)
    if not candidates:
        return None

    # Centre of the reference box.
    ref_x = previous_bbox[0] + previous_bbox[2] / 2
    ref_y = previous_bbox[1] + previous_bbox[3] / 2

    best_box, best_score = None, float('inf')
    for box in candidates:
        x, y, w, h = box
        # Squared Euclidean distance between centres; the square root is not
        # needed to rank candidates.
        score = (ref_x - (x + w / 2)) ** 2 + (ref_y - (y + h / 2)) ** 2
        if score < best_score:
            best_box, best_score = (x, y, w, h), score

    return best_box

In [None]:
# Extract face crops from every video listed in `data`.
#
# For each video, every `frame_interval`-th frame is read, faces are detected
# with the Haar cascade, and one face per frame is cropped, resized to
# `output_size` and saved as <output_dir>/<video stem>/frame_<index>.jpg.
# Videos that yield at least one saved crop are recorded in `labels`, which is
# written to <output_dir>/labels.json at the end.

def _largest_face(faces):
    """Return the detection with the largest area as (x, y, w, h), or None if there is none."""
    if len(faces) == 0:
        return None
    x, y, w, h = max(faces, key=lambda box: box[2] * box[3])
    return (x, y, w, h)


for video_name, info in data.items():
    label = info["label"]

    video_path = os.path.join(video_dir, video_name)
    if not os.path.exists(video_path):
        print(f"Video {video_name} not present {video_dir}")
        continue

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"error occured while trying to open a video: {video_path}")
        cap.release()
        continue

    video_output_dir = os.path.join(output_dir, os.path.splitext(video_name)[0])
    os.makedirs(video_output_dir, exist_ok=True)

    previous_face = None
    saved_frames_count = 0

    # try/finally guarantees the capture handle is released even if a frame
    # fails to process.
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        for f_idx in range(0, frame_count, frame_interval):
            cap.set(cv2.CAP_PROP_POS_FRAMES, f_idx)
            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(50,50))

            if previous_face is None:
                # No face is being tracked yet: start with the largest
                # detection. This applies to any frame, not only the first
                # sampled one, so a video whose first sampled frame has no
                # detection is not discarded as a whole.
                face_to_use = _largest_face(faces)
            elif len(faces) > 0:
                # Follow the same face across frames.
                face_to_use = get_closest_face(previous_face, faces)
            else:
                # Nothing detected in this frame: reuse the last known box.
                face_to_use = previous_face

            if face_to_use is None:
                continue

            previous_face = face_to_use
            (x, y, w_, h_) = face_to_use

            face_img = frame[y:y+h_, x:x+w_]
            if face_img.shape[0] == 0 or face_img.shape[1] == 0:
                continue

            face_img = cv2.resize(face_img, output_size)

            out_path = os.path.join(video_output_dir, f"frame_{f_idx}.jpg")
            # cv2.imwrite expects BGR channel order, which is what the decoded
            # frame already has. Converting to RGB first would store the image
            # with red and blue swapped.
            if cv2.imwrite(out_path, face_img):
                saved_frames_count += 1
            else:
                print(f"failed to write frame: {out_path}")
    finally:
        cap.release()

    if saved_frames_count > 0:
        labels[video_name] = label
    else:
        # No crop was saved: remove the empty per-video directory (best effort).
        try:
            os.rmdir(video_output_dir)
        except OSError as exc:
            print(f"could not remove {video_output_dir}: {exc}")

labels_path = os.path.join(output_dir, "labels.json")
with open(labels_path, "w") as f:
    json.dump(labels, f, indent=2)

print("done")