In [1]:
# Mount Google Drive at /content/drive; every dataset and output path used
# later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Installation
!pip install pandas scikit-learn



In [3]:
# Imports
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [39]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

# ---------------------------
# Folder paths
# ---------------------------
# Directories scanned for videos that go into the "original" slot of the map.
ORIG_DIRS = [
    "/content/drive/MyDrive/faceforensics++/original_sequences/actors/c40/videos",
    "/content/drive/MyDrive/faceforensics++/original_sequences/youtube/c40/videos"
]
# Directories scanned for videos that go into the "manipulated" slot of the map.
MANIP_DIRS = [
    "/content/drive/MyDrive/faceforensics++/manipulated_sequences/DeepFakeDetection/c40/videos",
    "/content/drive/MyDrive/faceforensics++/manipulated_sequences/Deepfakes/c40/videos"
]
# Root that is walked recursively for adversarial videos.
ADV_DIR = "/content/drive/MyDrive/faceforensics++/Adversarial_attacked_sequences/TransferAttacks/FGSM"

# Output folders for the split lists and CSVs; created up front if missing.
BASELINE_OUTPUT_DIR = "/content/drive/MyDrive/deepfake_detection_project/Dataset_split/baseline_splits"
ADV_OUTPUT_DIR      = "/content/drive/MyDrive/deepfake_detection_project/Dataset_split/adversarial_splits"
os.makedirs(BASELINE_OUTPUT_DIR, exist_ok=True)
os.makedirs(ADV_OUTPUT_DIR, exist_ok=True)

# ---------------------------
# Reproducibility
# ---------------------------
# The split code in this cell draws from Python's global RNG (random.shuffle /
# random.sample). Without a fixed seed every run writes different split files.
# NOTE: a fixed seed only gives identical splits when the order in which the
# video paths are collected is stable as well.
SEED = 42
random.seed(SEED)

# ---------------------------
# List videos
# ---------------------------
def list_videos(path):
    """Return ``{filename: full_path}`` for the video files directly inside ``path``.

    Only the immediate directory entries are inspected (no recursion). An entry
    counts as a video when its name ends, case-insensitively, with ``.mp4``,
    ``.avi``, ``.mov`` or ``.mkv``.

    Args:
        path: Directory to scan.

    Returns:
        dict mapping each matching filename to ``os.path.join(path, filename)``,
        with keys in sorted filename order. Empty when ``path`` does not exist
        or is not a directory.
    """
    # isdir (not exists): a path that points at a regular file would otherwise
    # reach os.listdir and raise NotADirectoryError.
    if not os.path.isdir(path):
        return {}
    video_exts = (".mp4", ".avi", ".mov", ".mkv")
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and anything derived from its key order) stable between runs.
    return {
        name: os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.lower().endswith(video_exts)
    }

# Merge the per-directory listings into one {filename: full_path} dict per
# class. When a filename occurs in more than one directory, the directory
# listed later supplies the path.
orig_files = {
    name: full_path
    for folder in ORIG_DIRS
    for name, full_path in list_videos(folder).items()
}
manip_files = {
    name: full_path
    for folder in MANIP_DIRS
    for name, full_path in list_videos(folder).items()
}

# Adversarial folders
ADV_DFD_DIR = os.path.join(ADV_DIR, "VGGFace2/InceptionResnetV1/DeepFakeDetection/Epsilon0.05")
ADV_DF_DIR  = os.path.join(ADV_DIR, "VGGFace2/InceptionResnetV1/Deepfakes/Epsilon0.05")
adv_dfd_files, adv_df_files = list_videos(ADV_DFD_DIR), list_videos(ADV_DF_DIR)

# ---------------------------
# Map all videos
# ---------------------------
# Every filename seen in any listing gets an entry keyed by that filename.
# The "adversarial" slot starts empty and is filled by the walk further down.
all_files = set([*orig_files, *manip_files, *adv_dfd_files, *adv_df_files])
video_map = {
    name: {
        "original": orig_files.get(name),
        "manipulated": manip_files.get(name),
        "adversarial": None
    }
    for name in all_files
}

# Add all adversarial videos
# The whole ADV_DIR tree is walked. If a filename occurs in several
# subfolders, the one visited last supplies the stored path.
for folder, _subdirs, filenames in os.walk(ADV_DIR):
    for name in filenames:
        if not name.lower().endswith((".mp4",".avi",".mov",".mkv")):
            continue
        # Filenames not seen before get a fresh entry with empty slots.
        entry = video_map.setdefault(
            name, {"original": None, "manipulated": None, "adversarial": None}
        )
        entry["adversarial"] = os.path.join(folder, name)

# ---------------------------
# Function to enforce balance
# ---------------------------
def sample_videos(paths, ratio, n):
    """Randomly pick a share of ``paths`` without replacement.

    Args:
        paths: list of full paths to draw from.
        ratio: fraction of ``n`` this draw should contribute.
        n: total number of samples the caller is assembling.

    Returns:
        A new list of ``int(n * ratio)`` distinct elements of ``paths``, or
        every element (in random order) when ``paths`` holds fewer than that.
    """
    wanted = int(n*ratio)
    available = len(paths)
    # Never ask random.sample for more items than the pool contains.
    count = wanted if wanted < available else available
    return random.sample(paths, count)

# ---------------------------
# Generate splits
# ---------------------------
def generate_splits(video_map, train_size=1000, val_size=200, test_size=200, seed=None):
    """Build baseline and adversarial train/val/test path lists that do not overlap.

    The paths of each class ("original", "manipulated", "adversarial") are
    shuffled once and cut into consecutive train, val and test regions. Every
    split draws only from its own region, so a path can never appear in two
    different splits. Sampling train/val/test independently from the same pool
    (the previous behaviour) let the same video land in several splits.
    Both split families use the same regions, so a path used for baseline
    training also never shows up in the adversarial val/test lists, and vice
    versa.

    Composition per split:
        baseline:    50% original / 50% manipulated
        adversarial: 50% original / 25% manipulated / 25% adversarial

    Args:
        video_map: dict whose values are dicts with the keys "original",
            "manipulated" and "adversarial"; each holds a path or a falsy
            value when that variant is missing.
        train_size, val_size, test_size: target number of items per split.
        seed: optional seed for a private RNG. When None, the global
            ``random`` module state is used.

    Returns:
        Tuple ``(baseline_train, baseline_val, baseline_test,
        adv_train, adv_val, adv_test)`` of shuffled path lists. A split is
        shorter than requested when a class pool is too small; a warning is
        emitted in that case.

    Note:
        Disjointness is guaranteed per path only. Entries of ``video_map``
        that carry several variants are not kept together in one split.
    """
    import warnings

    rng = random if seed is None else random.Random(seed)
    sizes = (train_size, val_size, test_size)

    def _regions(kind, max_ratio):
        """Shuffle the paths of one class and cut them into train/val/test regions."""
        # Sort before shuffling so a given seed gives the same order no matter
        # how video_map happened to be ordered.
        pool = sorted(entry[kind] for entry in video_map.values() if entry[kind])
        rng.shuffle(pool)
        needed = sum(int(size * max_ratio) for size in sizes)
        if len(pool) < needed:
            warnings.warn(
                f"Only {len(pool)} '{kind}' videos available but {needed} are "
                "needed for disjoint splits; later splits will be short."
            )
        regions, start = [], 0
        for size in sizes:
            stop = start + int(size * max_ratio)
            regions.append(pool[start:stop])
            start = stop
        return regions

    def _take(regions, ratio):
        """Take the leading ``int(size * ratio)`` paths from each region."""
        return [region[:int(size * ratio)] for region, size in zip(regions, sizes)]

    # Regions are sized for the largest share any family needs from that class.
    orig_regions = _regions("original", 0.5)
    manip_regions = _regions("manipulated", 0.5)
    adv_regions = _regions("adversarial", 0.25)

    # --- Baseline: 50% orig / 50% manip ---
    baseline = [
        o + m
        for o, m in zip(_take(orig_regions, 0.5), _take(manip_regions, 0.5))
    ]

    # --- Adv/fine-tune: 50% orig / 25% manip / 25% adv ---
    adversarial = [
        o + m + a
        for o, m, a in zip(
            _take(orig_regions, 0.5),
            _take(manip_regions, 0.25),
            _take(adv_regions, 0.25),
        )
    ]

    # Shuffle final lists so the classes are interleaved.
    for lst in baseline + adversarial:
        rng.shuffle(lst)

    return (*baseline, *adversarial)

# Build all six path lists from video_map using the default split sizes.
base_train, base_val, base_test, adv_train, adv_val, adv_test = generate_splits(video_map)

# ---------------------------
# Save function
# ---------------------------
def save_list(path, items):
    """Write ``items`` to ``path`` separated by newlines and report the count.

    Args:
        path: destination file; overwritten if it already exists.
        items: sequence of strings. No newline is written after the last one.
    """
    with open(path, "w") as out_file:
        out_file.write("\n".join(items))
    count = len(items)
    print(f"Saved → {path} ({count} items)")

# Save baseline
# Write train/val/test lists: baseline family first, then adversarial/fine-tune.
for out_dir, split_lists in (
    (BASELINE_OUTPUT_DIR, (base_train, base_val, base_test)),
    (ADV_OUTPUT_DIR, (adv_train, adv_val, adv_test)),
):
    for file_name, split_items in zip(("train.txt", "val.txt", "test.txt"), split_lists):
        save_list(os.path.join(out_dir, file_name), split_items)

# ---------------------------
# Save CSVs
# ---------------------------
def save_csv(path, video_map, train_list, val_list, test_list):
    """Write one CSV row per ``video_map`` entry, tagged with the split it belongs to.

    An entry is assigned to the first of train, val, test (checked in that
    order) that contains any of its original, manipulated or adversarial
    paths. Entries found in none of the lists get the split ``"other"``.

    Args:
        path: destination CSV file.
        video_map: dict mapping an identity (filename) to a dict with the keys
            "original", "manipulated" and "adversarial".
        train_list, val_list, test_list: path lists defining the splits.
    """
    columns = ["identity", "original_path", "manipulated_path", "adversarial_path", "split"]
    # Sets make each membership test O(1); scanning the lists for every row
    # was quadratic in the number of videos.
    split_members = [
        ("train", set(train_list)),
        ("val", set(val_list)),
        ("test", set(test_list)),
    ]
    rows = []
    for vid, paths in video_map.items():
        candidates = (paths["original"], paths["manipulated"], paths["adversarial"])
        split = "other"
        for name, members in split_members:
            if any(p in members for p in candidates):
                split = name
                break
        rows.append({
            "identity": vid,
            "original_path": paths["original"],
            "manipulated_path": paths["manipulated"],
            "adversarial_path": paths["adversarial"],
            "split": split
        })
    # Explicit columns keep the header row even when video_map is empty.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(path, index=False)
    print(f"Saved CSV → {path}")

# Save CSVs
# One split.csv per family: baseline first, then adversarial/fine-tune.
for csv_dir, (train_items, val_items, test_items) in (
    (BASELINE_OUTPUT_DIR, (base_train, base_val, base_test)),
    (ADV_OUTPUT_DIR, (adv_train, adv_val, adv_test)),
):
    save_csv(os.path.join(csv_dir,"split.csv"), video_map, train_items, val_items, test_items)


Saved → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/baseline_splits/train.txt (1000 items)
Saved → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/baseline_splits/val.txt (200 items)
Saved → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/baseline_splits/test.txt (200 items)
Saved → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/adversarial_splits/train.txt (1000 items)
Saved → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/adversarial_splits/val.txt (200 items)
Saved → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/adversarial_splits/test.txt (200 items)
Saved CSV → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/baseline_splits/split.csv
Saved CSV → /content/drive/MyDrive/deepfake_detection_project/Dataset_split/adversarial_splits/split.csv
