In [2]:
import os
import shutil
from google.colab import drive

try:
    drive.flush_and_unmount()
except:
    pass

if os.path.exists('/content/drive'):
    shutil.rmtree('/content/drive')

drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.


MessageError: Error: credential propagation was unsuccessful

In [3]:

import os
import re
import random
import pandas as pd
from sklearn.model_selection import train_test_split

ORIG_DIRS = [
    "/content/drive/MyDrive/faceforensics++/original_sequences/actors/c40/videos",
    "/content/drive/MyDrive/faceforensics++/original_sequences/youtube/c40/videos"
]
MANIP_DIRS = [
    "/content/drive/MyDrive/faceforensics++/manipulated_sequences/DeepFakeDetection/c40/videos",
    "/content/drive/MyDrive/faceforensics++/manipulated_sequences/Deepfakes/c40/videos",
    "/content/drive/MyDrive/faceforensics++/manipulated_sequences/Face2Face/c40/videos"
]

ADV_DIR = "/content/drive/MyDrive/faceforensics++/Adversarial_attacked_sequences"


OUTPUT_DIR = "/content/drive/MyDrive/csc490/code_and_datasets/video_splits_final"
os.makedirs(OUTPUT_DIR, exist_ok=True)

RANDOM_STATE = 42
TEST_SIZE = 0.20
VAL_SIZE  = 0.20


def list_videos(path):
    if not os.path.exists(path): return {}
    return {f: os.path.join(path, f) for f in os.listdir(path) if f.lower().endswith((".mp4",".avi",".mov",".mkv"))}

def extract_id(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    patterns = [r'^(\d+_\d+)', r'^(\d+)', r'^([A-Za-z0-9]+?)(?:[_-]\d+)']
    for p in patterns:
        m = re.match(p, base)
        if m: return m.group(1)
    return base

print(" Collecting files...")
orig_files, manip_files, adv_files = {}, {}, {}

for d in ORIG_DIRS: orig_files.update(list_videos(d))
for d in MANIP_DIRS: manip_files.update(list_videos(d))
for root, _, files in os.walk(ADV_DIR):
    for f in files:
        if f.lower().endswith((".mp4",".avi",".mov",".mkv")):
            adv_files[f] = os.path.join(root, f)

print(f"Found: Orig({len(orig_files)}), Manip({len(manip_files)}), Adv({len(adv_files)})")

# Group by Identity
groups = {}
def add_to_group(gid, kind, path):
    if gid not in groups: groups[gid] = {"original":[], "manipulated":[], "adversarial":[]}
    groups[gid][kind].append(path)

for f, p in orig_files.items(): add_to_group(extract_id(f), "original", p)
for f, p in manip_files.items(): add_to_group(extract_id(f), "manipulated", p)
for f, p in adv_files.items(): add_to_group(extract_id(f), "adversarial", p)

all_ids = sorted(list(groups.keys()))
print(f"Total Identities: {len(all_ids)}")

# Split IDs
train_ids, temp_ids = train_test_split(all_ids, test_size=(TEST_SIZE+VAL_SIZE), random_state=RANDOM_STATE, shuffle=True)
val_ratio = VAL_SIZE / (TEST_SIZE + VAL_SIZE)
val_ids, test_ids = train_test_split(temp_ids, test_size=(1-val_ratio), random_state=RANDOM_STATE, shuffle=True)

# Generate Lists
def get_paths(id_list, orig=False, manip=False, adv=False):
    paths = []
    for gid in id_list:
        if orig: paths.extend(groups[gid]['original'])
        if manip: paths.extend(groups[gid]['manipulated'])
        if adv: paths.extend(groups[gid]['adversarial'])
    return paths

splits = {
    "train.txt": get_paths(train_ids, orig=True, manip=True, adv=True),
    "val_clean.txt": get_paths(val_ids, orig=True, manip=True, adv=False),
    "val_adv.txt": get_paths(val_ids, orig=False, manip=False, adv=True),
    "test_clean.txt": get_paths(test_ids, orig=True, manip=True, adv=False),
    "test_adv.txt": get_paths(test_ids, orig=False, manip=False, adv=True)
}

for name, paths in splits.items():
    random.shuffle(paths)
    with open(os.path.join(OUTPUT_DIR, name), "w") as f:
        f.write("\n".join(paths))
    print(f" Saved {name}: {len(paths)} videos")

 Collecting files...
Found: Orig(0), Manip(0), Adv(0)
Total Identities: 0


ValueError: With n_samples=0, test_size=0.4 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.