In [13]:
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.audio_utils import get_sample_rate
from configs.config_local import DATASET_PATH, ITW_DATASET_PATH

In [25]:
from pathlib import Path

def process_file(args):
    filepath, filename, category = args
    try:
        sample_rate = get_sample_rate(filepath)
        return {"filename": filename, "category": category, "sample_rate": sample_rate, "filepath": filepath, "error": None}
    except Exception as e:
        return {"filename": filename, "category": category, "sample_rate": "error", "filepath": filepath, "error": str(e)}

def analyze_sample_rates(base_path, max_workers=12):
    categories = ["real", "fake"]
    tasks = []
    
    for category in categories:
        if Path(os.path.join(base_path, "training", category)).is_dir():
            folder_path = os.path.join(base_path, "training", category)
        elif Path(os.path.join(base_path, "testing", category)).is_dir():
            folder_path = os.path.join(base_path, "testing", category)
        elif Path(os.path.join(base_path, "validation", category)).is_dir():
            folder_path = os.path.join(base_path, "validation", category)
        elif Path(os.path.join(base_path, category)).is_dir():
            folder_path = os.path.join(base_path, category)
        else:
            print(f"Warning: {base_path} does not contain a {category} directory")
            continue
        if not os.path.exists(folder_path):
            print(f"Warning: {folder_path} does not exist")
            continue
        files = os.listdir(folder_path)
        
        for filename in files:
            if not filename.lower().endswith(('.wav', '.mp3', '.flac', '.ogg')):
                continue
            filepath = os.path.join(folder_path, filename)
            tasks.append((filepath, filename, category))
        
        print(f"Total files to process: {len(tasks)}")
        print(f"Found {len(files)} files in {category}")
    
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_file, task): task for task in tasks}
        for future in tqdm(as_completed(futures), total=len(tasks), desc="Processing"):
            results.append(future.result())
    
    return pd.DataFrame(results)


**Analyzing FoR Dataset**

In [8]:
df = analyze_sample_rates(DATASET_PATH)


Found 26941 files in real
Found 26927 files in fake


Processing: 100%|██████████| 53868/53868 [00:40<00:00, 1333.99it/s]


In [9]:
summary = df.groupby(["category", "sample_rate"]).size().unstack(fill_value=0)
print("Sample Rate Distribution by Category:")
summary


Sample Rate Distribution by Category:


sample_rate,16000
category,Unnamed: 1_level_1
fake,26927
real,26941


In [10]:
print(f"Total files analyzed: {len(df)}")
print(f"Unique sample rates found: {df['sample_rate'].nunique()}")
print(f"\nSample rates: {sorted(df['sample_rate'].unique())}")


Total files analyzed: 53868
Unique sample rates found: 1

Sample rates: [np.int64(16000)]


In [11]:
errors = df[df["sample_rate"] == "error"]
print(f"Total errors: {len(errors)}")
errors[["filename", "category", "error"]]


Total errors: 0


Unnamed: 0,filename,category,error


**Analyzing In The Wild Audio Deepfake Dataset**

In [26]:
df_itw = analyze_sample_rates(ITW_DATASET_PATH)

Total files to process: 19963
Found 39926 files in real
Total files to process: 31779
Found 23632 files in fake


Processing: 100%|██████████| 31779/31779 [00:24<00:00, 1311.03it/s]


In [27]:
summary = df_itw.groupby(["category", "sample_rate"]).size().unstack(fill_value=0)
print("Sample Rate Distribution by Category:")
summary

Sample Rate Distribution by Category:


sample_rate,16000
category,Unnamed: 1_level_1
fake,11816
real,19963


In [29]:
print(f"Total files analyzed: {len(df_itw)}")
print(f"Unique sample rates found: {df_itw['sample_rate'].nunique()}")
print(f"\nSample rates: {sorted(df_itw['sample_rate'].unique())}")

errors = df_itw[df_itw["sample_rate"] == "error"]
print(f"Total errors: {len(errors)}")
errors[["filename", "category", "error"]]

Total files analyzed: 31779
Unique sample rates found: 1

Sample rates: [np.int64(16000)]
Total errors: 0


Unnamed: 0,filename,category,error
