In [None]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.audio_utils import get_sample_rate

from pathlib import Path
from configs.config import DATASET_PATH, ITW_DATASET_PATH, ELEVEN_LABS_DATASET_PATH

# Executable name placed first in every ffmpeg command line built in this notebook.
FFMPEG = "ffmpeg"

In [None]:
def process_file(args):
    """Look up the sample rate of a single audio file.

    Args:
        args: ``(filepath, filename, category)`` tuple, packed into one
            argument so the function can be submitted to an executor as-is.

    Returns:
        dict with the keys ``filename``, ``category``, ``sample_rate``,
        ``filepath`` and ``error``. On success ``sample_rate`` is whatever
        ``get_sample_rate`` returned and ``error`` is ``None``. If the lookup
        raises, ``sample_rate`` is the sentinel string ``"error"`` and
        ``error`` holds ``str(exception)``.
    """
    filepath, filename, category = args
    # Start from the failure-shaped record and overwrite the fields that differ.
    record = {"filename": filename, "category": category, "sample_rate": "error", "filepath": filepath, "error": None}
    try:
        record["sample_rate"] = get_sample_rate(filepath)
    except Exception as exc:
        # Deliberately broad: one unreadable file must not abort the whole scan.
        record["error"] = str(exc)
    return record

def analyze_sample_rates(base_path, max_workers=12):
    """Collect the sample rate of the audio files found under ``base_path``.

    For each category (``real``, ``fake``) the FIRST existing directory among
    ``training/<category>``, ``testing/<category>``, ``validation/<category>``
    and ``<category>`` (all relative to ``base_path``) is scanned; once one is
    found the remaining candidates are not looked at. Files are selected by
    extension (.wav, .mp3, .flac, .ogg, case-insensitive) and each one is
    passed to ``process_file`` on a thread pool.

    Args:
        base_path: Dataset root directory.
        max_workers: Number of threads in the pool.

    Returns:
        pandas.DataFrame with one row per selected file and the columns
        ``filename``, ``category``, ``sample_rate``, ``filepath``, ``error``.
        The columns are present even when no file was found, so downstream
        ``groupby`` / column selection does not fail on an empty scan.
    """
    categories = ("real", "fake")
    audio_extensions = (".wav", ".mp3", ".flac", ".ogg")
    columns = ["filename", "category", "sample_rate", "filepath", "error"]

    tasks = []
    for category in categories:
        # Candidate layouts in priority order; the first existing one wins.
        candidates = [
            os.path.join(base_path, "training", category),
            os.path.join(base_path, "testing", category),
            os.path.join(base_path, "validation", category),
            os.path.join(base_path, category),
        ]
        folder_path = next((path for path in candidates if os.path.isdir(path)), None)
        if folder_path is None:
            print(f"Warning: {base_path} does not contain a {category} directory")
            continue

        audio_files = [
            name for name in os.listdir(folder_path)
            if name.lower().endswith(audio_extensions)
        ]
        tasks.extend((os.path.join(folder_path, name), name, category) for name in audio_files)
        # Report only the files that will actually be processed for this category.
        print(f"Found {len(audio_files)} files in {category}")

    # Printed once, after every category has been collected.
    print(f"Total files to process: {len(tasks)}")

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_file, task) for task in tasks]
        for future in tqdm(as_completed(futures), total=len(tasks), desc="Processing"):
            results.append(future.result())

    return pd.DataFrame(results, columns=columns)


**Analyzing FoR Dataset**

In [None]:
# Scan the dataset rooted at DATASET_PATH; the result has one row per audio file.
df = analyze_sample_rates(base_path=DATASET_PATH)


In [None]:
# Count files per (category, sample_rate) pair, then pivot the sample rates
# into columns; combinations that never occur are shown as 0.
summary = (
    df.groupby(["category", "sample_rate"])
    .size()
    .unstack(fill_value=0)
)
print("Sample Rate Distribution by Category:")
summary


In [None]:
print(f"Total files analyzed: {len(df)}")
print(f"Unique sample rates found: {df['sample_rate'].nunique()}")
# Files that failed to load carry the string "error" in this column next to the
# (presumably numeric) rates. A plain sorted() would raise TypeError when
# comparing str with int, so non-strings are ordered first and strings last.
print("\nSample rates:", sorted(df["sample_rate"].unique(), key=lambda rate: (isinstance(rate, str), rate)))


In [None]:
# Rows whose sample-rate lookup failed are tagged with the sentinel string "error".
errors = df.loc[df["sample_rate"] == "error"]
print(f"Total errors: {len(errors)}")
errors[["filename", "category", "error"]]


**Analyzing In The Wild Audio Deepfake Dataset**

In [None]:
# Same scan as above, this time rooted at ITW_DATASET_PATH.
df_itw = analyze_sample_rates(base_path=ITW_DATASET_PATH)

In [None]:
# Count files per (category, sample_rate) pair, then pivot the sample rates
# into columns; combinations that never occur are shown as 0.
summary = (
    df_itw.groupby(["category", "sample_rate"])
    .size()
    .unstack(fill_value=0)
)
print("Sample Rate Distribution by Category:")
summary

In [None]:
print(f"Total files analyzed: {len(df_itw)}")
print(f"Unique sample rates found: {df_itw['sample_rate'].nunique()}")
# Files that failed to load carry the string "error" in this column next to the
# (presumably numeric) rates. A plain sorted() would raise TypeError when
# comparing str with int, so non-strings are ordered first and strings last.
print("\nSample rates:", sorted(df_itw["sample_rate"].unique(), key=lambda rate: (isinstance(rate, str), rate)))

# Rows whose sample-rate lookup failed are tagged with the sentinel string "error".
errors = df_itw[df_itw["sample_rate"] == "error"]
print(f"Total errors: {len(errors)}")
errors[["filename", "category", "error"]]

### Trim Silence

In [None]:
# -------- CONFIG --------
# Show which dataset root is in use before any file is written.
print(ITW_DATASET_PATH)

# The "fake" clips are read from the release folder and written to a sibling
# "trimmed" tree.
INPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild", "fake")
OUTPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild_trimmed", "fake")

# Size of the thread pool; each worker runs one ffmpeg process at a time.
MAX_WORKERS = 10

def trim_file(args):
    """Run ffmpeg's ``silenceremove`` filter on one file.

    Args:
        args: ``(in_path, out_path, ffmpeg)`` tuple, where ``ffmpeg`` is the
            executable to invoke. Packed into one argument so the function
            can be submitted to an executor directly.

    Returns:
        ``None`` when ffmpeg exits successfully, otherwise the basename of
        ``in_path`` so the caller can list the files that failed.

    NOTE(review): per the FFmpeg filter documentation, a positive
    ``stop_periods`` makes the filter stop copying audio once the configured
    silence (here 0.3 s below -50 dB) is detected after sound has started.
    That may be a pause in the middle of a clip rather than the trailing
    silence - confirm this truncation is intended.
    """
    in_path, out_path, ffmpeg = args
    silence_filter = (
        "silenceremove="
        "start_periods=1:start_duration=0.3:start_threshold=-50dB:"
        "stop_periods=1:stop_duration=0.3:stop_threshold=-50dB"
    )
    # Use the executable supplied with the task (as normalize_file does)
    # instead of silently ignoring it in favour of a global.
    cmd = [ffmpeg, "-y", "-i", in_path, "-af", silence_filter, out_path]

    try:
        # ffmpeg's console output is discarded; only the exit status matters.
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
    except subprocess.CalledProcessError:
        return os.path.basename(in_path)
    return None

# The destination folder must exist before ffmpeg writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One (input path, output path, ffmpeg executable) task per .wav file;
# every other directory entry is skipped.
tasks = [
    (os.path.join(INPUT_DIR, f), os.path.join(OUTPUT_DIR, f), FFMPEG)
    for f in os.listdir(INPUT_DIR)
    if f.lower().endswith(".wav")
]

failed = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(trim_file, task): task for task in tasks}
    for future in tqdm(as_completed(futures), total=len(tasks), desc="Trimming fake"):
        # trim_file returns the input basename on failure and None on success.
        result = future.result()
        if result:
            failed.append(result)

if failed:
    # This step trims silence; the message previously said "normalize".
    print(f"Failed to trim {len(failed)} files: {failed[:5]}{'...' if len(failed) > 5 else ''}")


C:/Users/geon9/MSc/audio-deepfake-detection/in-the-wild-audio-deepfake


Trimming fake: 100%|██████████| 11816/11816 [00:56<00:00, 209.84it/s]


In [None]:
# -------- CONFIG --------
# The "real" clips are read from the release folder and written to a sibling
# "trimmed" tree.
INPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild", "real")
OUTPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild_trimmed", "real")
MAX_WORKERS = 10
# ------------------------

# trim_file is defined in the previous ("fake") trimming cell and is reused
# here instead of being redefined with an identical body.

# The destination folder must exist before ffmpeg writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One (input path, output path, ffmpeg executable) task per .wav file;
# every other directory entry is skipped.
tasks = [
    (os.path.join(INPUT_DIR, f), os.path.join(OUTPUT_DIR, f), FFMPEG)
    for f in os.listdir(INPUT_DIR)
    if f.lower().endswith(".wav")
]

failed = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(trim_file, task): task for task in tasks}
    for future in tqdm(as_completed(futures), total=len(tasks), desc="Trimming real"):
        # trim_file returns the input basename on failure and None on success.
        result = future.result()
        if result:
            failed.append(result)

if failed:
    # This step trims silence; the message previously said "normalize".
    print(f"Failed to trim {len(failed)} files: {failed[:5]}{'...' if len(failed) > 5 else ''}")


Trimming real: 100%|██████████| 19963/19963 [01:35<00:00, 209.57it/s]


### Loudness Normalization

In [None]:
# -------- CONFIG --------
# Input is the silence-trimmed "fake" folder; output goes to a sibling
# "trimmed_normalized" tree.
INPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild_trimmed", "fake")
OUTPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild_trimmed_normalized", "fake")
MAX_WORKERS = 23

# Passed to ffmpeg's loudnorm filter as its I= and TP= options respectively.
TARGET_LUFS = -16
TRUE_PEAK = -1.5
# ------------------------

def normalize_file(args):
    """Run ffmpeg's ``loudnorm`` filter on a single file.

    Args:
        args: ``(in_path, out_path, ffmpeg, target_lufs, true_peak)`` tuple.
            ``ffmpeg`` is the executable to invoke; ``target_lufs`` and
            ``true_peak`` are inserted into the filter string as its ``I``
            and ``TP`` options.

    Returns:
        ``None`` when ffmpeg exits successfully, otherwise the basename of
        ``in_path`` so the caller can list the files that failed.

    NOTE(review): the FFmpeg documentation states that loudnorm, in dynamic
    mode, upsamples the stream to 192 kHz and recommends ``-ar`` / ``aresample``
    to set the output rate. No output rate is passed here - confirm that the
    sample rate of the written files is what downstream code expects.
    """
    in_path, out_path, ffmpeg, target_lufs, true_peak = args
    loudnorm_filter = f"loudnorm=I={target_lufs}:TP={true_peak}"
    try:
        # ffmpeg's console output is discarded; only the exit status matters.
        subprocess.run(
            [ffmpeg, "-y", "-i", in_path, "-filter:a", loudnorm_filter, out_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except subprocess.CalledProcessError:
        return os.path.basename(in_path)
    return None

# The destination folder must exist before ffmpeg writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One task tuple per .wav file, in the argument order normalize_file unpacks.
tasks = [
    (os.path.join(INPUT_DIR, f), os.path.join(OUTPUT_DIR, f), FFMPEG, TARGET_LUFS, TRUE_PEAK)
    for f in os.listdir(INPUT_DIR)
    if f.lower().endswith(".wav")
]

failed = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(normalize_file, task): task for task in tasks}
    for future in tqdm(as_completed(futures), total=len(tasks), desc="Normalizing fake"):
        # normalize_file returns the input basename on failure and None on success.
        result = future.result()
        if result:
            failed.append(result)

if failed:
    print(f"Failed to normalize {len(failed)} files: {failed[:5]}{'...' if len(failed) > 5 else ''}")


Normalizing fake: 100%|██████████| 11816/11816 [01:05<00:00, 181.61it/s]


In [None]:
# -------- CONFIG --------
# Input is the silence-trimmed "real" folder; output goes to a sibling
# "trimmed_normalized" tree.
INPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild_trimmed", "real")
OUTPUT_DIR = os.path.join(ITW_DATASET_PATH, "release_in_the_wild_trimmed_normalized", "real")
MAX_WORKERS = 8

# Passed to ffmpeg's loudnorm filter as its I= and TP= options respectively.
TARGET_LUFS = -16
TRUE_PEAK = -1.5
# ------------------------

# The destination folder must exist before ffmpeg writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One task tuple per .wav file, in the argument order normalize_file unpacks.
tasks = [
    (os.path.join(INPUT_DIR, f), os.path.join(OUTPUT_DIR, f), FFMPEG, TARGET_LUFS, TRUE_PEAK)
    for f in os.listdir(INPUT_DIR)
    if f.lower().endswith(".wav")
]

failed = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(normalize_file, task): task for task in tasks}
    for future in tqdm(as_completed(futures), total=len(tasks), desc="Normalizing real"):
        # normalize_file returns the input basename on failure and None on success.
        result = future.result()
        if result:
            failed.append(result)

if failed:
    print(f"Failed to normalize {len(failed)} files: {failed[:5]}{'...' if len(failed) > 5 else ''}")


Normalizing real: 100%|██████████| 19963/19963 [02:50<00:00, 116.93it/s]


### ElevenLabs audio file preprocessing

In [None]:
# -------- CONFIG --------
INPUT_DIR = os.path.join(ELEVEN_LABS_DATASET_PATH, "fake")
OUTPUT_DIR = os.path.join(ELEVEN_LABS_DATASET_PATH, "normalized_fake_real", "fake")
MAX_WORKERS = 8

# Passed to ffmpeg's loudnorm filter as its I= and TP= options respectively.
# NOTE(review): the "real" cell below targets -18 while this one targets -15.0;
# confirm the loudness gap between the two classes is intentional.
TARGET_LUFS = -15.0
TRUE_PEAK = -1.5
# ------------------------

# The destination folder must exist before ffmpeg writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One task tuple per .wav file, in the argument order normalize_file unpacks.
tasks = [
    (os.path.join(INPUT_DIR, f), os.path.join(OUTPUT_DIR, f), FFMPEG, TARGET_LUFS, TRUE_PEAK)
    for f in os.listdir(INPUT_DIR)
    if f.lower().endswith(".wav")
]

failed = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(normalize_file, task): task for task in tasks}
    # This cell processes the "fake" folder; the bar was previously labelled "real".
    for future in tqdm(as_completed(futures), total=len(tasks), desc="Normalizing fake"):
        # normalize_file returns the input basename on failure and None on success.
        result = future.result()
        if result:
            failed.append(result)

if failed:
    print(f"Failed to normalize {len(failed)} files: {failed[:5]}{'...' if len(failed) > 5 else ''}")


In [None]:
# -------- CONFIG --------
INPUT_DIR = os.path.join(ELEVEN_LABS_DATASET_PATH, "real")
OUTPUT_DIR = os.path.join(ELEVEN_LABS_DATASET_PATH, "normalized_real_real", "real")
MAX_WORKERS = 8

# Passed to ffmpeg's loudnorm filter as its I= and TP= options respectively.
TARGET_LUFS = -18
TRUE_PEAK = -1.5
# ------------------------

# The destination folder must exist before ffmpeg writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One task tuple per .wav file, in the argument order normalize_file unpacks.
tasks = [
    (os.path.join(INPUT_DIR, f), os.path.join(OUTPUT_DIR, f), FFMPEG, TARGET_LUFS, TRUE_PEAK)
    for f in os.listdir(INPUT_DIR)
    if f.lower().endswith(".wav")
]

failed = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(normalize_file, task): task for task in tasks}
    for future in tqdm(as_completed(futures), total=len(tasks), desc="Normalizing real"):
        # normalize_file returns the input basename on failure and None on success.
        result = future.result()
        if result:
            failed.append(result)

if failed:
    print(f"Failed to normalize {len(failed)} files: {failed[:5]}{'...' if len(failed) > 5 else ''}")
