In [9]:
from pathlib import Path
import pandas as pd
import os 
import json
pd.options.mode.chained_assignment = None

In [10]:
# Root of the drive that holds both datasets. The default is the original mount point;
# set the DEEPFAKE_DRIVE_PATH environment variable to run the notebook on another machine
# without editing this cell.
DRIVE_PATH = Path(os.environ.get("DEEPFAKE_DRIVE_PATH", "/mnt/h/"))
# Dataset roots; every path stored in the generated metadata is relative to DRIVE_PATH.
KODF_PATH = DRIVE_PATH / "kodf_release"
DFDC_PATH = DRIVE_PATH / "dfdc_extracted"

# Pre-Proc KODF

In [11]:
# Builds KODF master_metadata.json; comment this cell out to skip re-processing
from deepfake_detection.defaults import RANDOM_STATE
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

def get_kodf_train_val_people(unique_persons: list[str], train_size: float = 0.8) -> tuple[list[str], list[str]]:
    """Partition person IDs into disjoint train and validation groups.

    Splitting by person (rather than by video) lets the caller assign every
    video of a given person to the same split.

    Args:
        unique_persons: Person IDs to partition; each ID should appear once.
        train_size: Forwarded to ``train_test_split`` as ``train_size``.

    Returns:
        ``(train_people, val_people)`` exactly as returned by
        ``train_test_split``.
    """
    shuffled_people = shuffle(unique_persons, random_state=RANDOM_STATE)
    # train_test_split reshuffles by default, so it must be seeded too; otherwise the
    # person-level split differs on every run despite the seeded shuffle above.
    train_people, val_people = train_test_split(
        shuffled_people,
        train_size=train_size,
        random_state=RANDOM_STATE,
    )

    return train_people, val_people

# --- Synthesized (FAKE) videos ---------------------------------------------------------
kodf_synthesized_metadata = pd.read_csv(KODF_PATH / "synthesized_video_metadata.csv")

# Drop the "fo" and "audio-driven" models. The explicit .copy() makes the filtered frame
# independent of the raw metadata, so adding columns below is not a chained assignment.
kodf_synthesized_videos_to_keep = kodf_synthesized_metadata[
    ~kodf_synthesized_metadata['model'].isin(["fo", "audio-driven"])
].copy()

# Path relative to DRIVE_PATH:
#   kodf_release/synthesized_videos/<model>/<date_time without dashes>/<target_person_id>/<video>
kodf_synthesized_videos_to_keep['video_path'] = (
    KODF_PATH.relative_to(DRIVE_PATH)
    / "synthesized_videos"
    / kodf_synthesized_videos_to_keep['model']
    / kodf_synthesized_videos_to_keep['date_time'].str.replace("-", "", regex=False)
    / kodf_synthesized_videos_to_keep['target_person_id']
    / kodf_synthesized_videos_to_keep['video']
).map(str)  # store plain strings so the frame can be serialized to JSON

kodf_synthesized_videos_to_keep = (
    kodf_synthesized_videos_to_keep
    .assign(label="FAKE")
    .drop(columns=["sex", "date_time", "model", "target_person_id", "target_video"])
    # The source person becomes the person_id used for the person-level split below.
    .rename(columns={"source_person_id": "person_id"})
    .set_index("video")
)

# --- Original (REAL) videos ------------------------------------------------------------
kodf_original_metadata = pd.read_csv(KODF_PATH / "original_video_metadata.csv")

# Path relative to DRIVE_PATH: kodf_release/original_videos/<person_id>/<video>
kodf_original_metadata['video_path'] = (
    KODF_PATH.relative_to(DRIVE_PATH)
    / "original_videos"
    / kodf_original_metadata["person_id"]
    / kodf_original_metadata["video"]
).map(str)

kodf_original_metadata = (
    kodf_original_metadata
    .assign(label="REAL")
    .drop(columns=["sex", "date_time", "studio"])
    .set_index("video")
)

# --- Combine and split by person -------------------------------------------------------
kodf_combined = pd.concat([kodf_synthesized_videos_to_keep, kodf_original_metadata])

train_people, val_people = get_kodf_train_val_people(kodf_combined['person_id'].unique())

kodf_combined.loc[kodf_combined['person_id'].isin(train_people), "split"] = "train"
kodf_combined.loc[kodf_combined['person_id'].isin(val_people), "split"] = "validation"

# --- Drop rows whose video file is missing on disk -------------------------------------
bad_paths = [
    path
    for path in kodf_combined['video_path']
    if not (DRIVE_PATH / path).exists()
]
for path in bad_paths:
    print(f"Bad path: {path}")

kodf_combined = kodf_combined[~kodf_combined['video_path'].isin(bad_paths)]

# One JSON record per video, keyed by the video file name (the frame's index).
kodf_dict = kodf_combined.to_dict(orient="index")

with open(KODF_PATH / "master_metadata.json", 'w', encoding="utf-8") as f:
    json.dump(kodf_dict, f)

Bad path: kodf_release/synthesized_videos/fsgan/20201014/39549/39549_177877_3_40.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0600.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0640.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0680.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0690.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0700.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0710.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0720.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0730.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0740.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0750.mp4
Bad path: kodf_release/synthesized_videos/2020-09-24/여성/175261/175261__1_0760.mp4
Bad path: kod

In [12]:
# Builds DFDC train_metadata.json; comment this cell out to skip re-processing

# Part numbers to process; the loop expects one folder per number (0-49) under DFDC_PATH.
parts = list(range(50))

# Maps video file name -> metadata record, merged across all parts.
master_metadata = {}
for part in parts:
    print(f"Processing Part: {part}")
    # The outer folder name is zero-padded to two digits, the nested folder name is not.
    folder_name = f"dfdc_train_part_{part:02d}"
    inner_folder_name = f"dfdc_train_part_{part}"
    current_part_path = DFDC_PATH / folder_name / inner_folder_name
    metadata_path = current_part_path / "metadata.json"

    with open(metadata_path, encoding="utf-8") as f:
        meta_data = json.load(f)

    for video, data in meta_data.items():
        video_path = current_part_path / video
        if not video_path.exists():
            # Listed in metadata.json but missing on disk: report it and leave it out.
            print("CURRENT VIDEO PATH", video_path, video, data)
            continue

        # The 'original' key is not carried into the merged metadata.
        data.pop('original', None)

        # Stored relative to DRIVE_PATH.
        data['video_path'] = str(video_path.relative_to(DRIVE_PATH))
        master_metadata[video] = data

with open(DFDC_PATH / "train_metadata.json", 'w', encoding="utf-8") as f:
    json.dump(master_metadata, f)

Processing Part: 0
Processing Part: 1
Processing Part: 2
Processing Part: 3
Processing Part: 4
Processing Part: 5
Processing Part: 6
Processing Part: 7
Processing Part: 8
Processing Part: 9
Processing Part: 10
Processing Part: 11
Processing Part: 12
Processing Part: 13
Processing Part: 14
Processing Part: 15
Processing Part: 16
Processing Part: 17
Processing Part: 18
CURRENT VIDEO PATH /mnt/h/dfdc_extracted/dfdc_train_part_18/dfdc_train_part_18/wipjitfmta.mp4 wipjitfmta.mp4 {'label': 'FAKE', 'split': 'train', 'original': 'suybcasguz.mp4'}
CURRENT VIDEO PATH /mnt/h/dfdc_extracted/dfdc_train_part_18/dfdc_train_part_18/wpuxmawbkj.mp4 wpuxmawbkj.mp4 {'label': 'FAKE', 'split': 'train', 'original': 'paupkdijut.mp4'}
CURRENT VIDEO PATH /mnt/h/dfdc_extracted/dfdc_train_part_18/dfdc_train_part_18/pvohowzowy.mp4 pvohowzowy.mp4 {'label': 'FAKE', 'split': 'train', 'original': 'bgcvbayfhn.mp4'}
Processing Part: 19
Processing Part: 20
Processing Part: 21
Processing Part: 22
Processing Part: 23
Proce

In [None]:
# Reload the two metadata files into frames indexed by the JSON's top-level keys.
kodf_df = pd.read_json(KODF_PATH.joinpath("master_metadata.json"), orient="index")
dfdc_df = pd.read_json(DFDC_PATH.joinpath("train_metadata.json"), orient="index")
# Display the KODF frame as the cell output.
kodf_df

In [None]:
# Merge both datasets into one table and export it.
# The combined frame has more than three columns after reset_index (KODF also carries
# person_id and split), so assigning a 3-name list to .columns raises a length-mismatch
# ValueError. Name the index "video" and select the exported columns by name instead.
# NOTE(review): "split" and "person_id" are dropped here to match the original 3-column
# layout; add them to the selection if downstream consumers need them.
final_dataset = (
    pd.concat([kodf_df, dfdc_df])
    .rename_axis("video")
    .reset_index()
    [["video", "video_path", "label"]]
)
final_dataset.to_csv("metadata.csv", index=False)