In [1]:
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.audio_utils import get_sample_rate

In [None]:
def process_file(args):
    """Look up the sample rate of one file and report it as a flat record.

    Args:
        args: A ``(filepath, filename, category)`` tuple. It is unpacked
            outside the ``try`` block, so a malformed tuple raises instead
            of being recorded as a per-file failure.

    Returns:
        dict with keys ``filename``, ``category``, ``sample_rate``,
        ``filepath`` and ``error``. On success ``sample_rate`` holds the value
        returned by ``get_sample_rate`` and ``error`` is ``None``; on failure
        ``sample_rate`` is the string ``"error"`` and ``error`` is the
        exception message.
    """
    filepath, filename, category = args

    # Start from the failure shape; only a successful lookup overwrites the sentinel.
    record = {
        "filename": filename,
        "category": category,
        "sample_rate": "error",
        "filepath": filepath,
        "error": None,
    }
    try:
        record["sample_rate"] = get_sample_rate(filepath)
    except Exception as exc:
        # Best effort by design: one unreadable file must not abort the whole scan.
        record["error"] = str(exc)
    return record

def analyze_sample_rates(base_path, max_workers=12, split="training", categories=("real", "fake")):
    """Collect the sample rate of every file under ``<base_path>/<split>/<category>``.

    Every regular file in each category folder is passed to ``process_file``
    on a thread pool and the per-file records are assembled into a DataFrame.

    Args:
        base_path: Root directory of the dataset.
        max_workers: Number of worker threads used for the per-file lookups.
        split: Sub-directory of ``base_path`` that contains the category folders.
        categories: Names of the category folders to scan, in scan order.

    Returns:
        pandas.DataFrame with one row per file, built from the records that
        ``process_file`` returns. Rows follow the order of ``categories`` and,
        within a category, sorted filename order. When no files are found the
        frame is empty but still has the columns ``filename``, ``category``,
        ``sample_rate``, ``filepath`` and ``error``.
    """
    tasks = []

    for category in categories:
        folder_path = os.path.join(base_path, split, category)
        # isdir (not exists): a plain file at this path cannot be listed either.
        if not os.path.isdir(folder_path):
            print(f"Warning: {folder_path} does not exist")
            continue

        # Keep regular files only, and sort so the row order does not depend
        # on the arbitrary order in which the OS lists directory entries.
        with os.scandir(folder_path) as entries:
            files = sorted(entry.name for entry in entries if entry.is_file())
        print(f"Found {len(files)} files in {category}")

        tasks.extend(
            (os.path.join(folder_path, filename), filename, category)
            for filename in files
        )

    if not tasks:
        # Return the expected columns so later column lookups do not raise KeyError.
        return pd.DataFrame(columns=["filename", "category", "sample_rate", "filepath", "error"])

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so repeated runs
        # produce identically ordered rows (as_completed yields in finish order).
        results = list(
            tqdm(executor.map(process_file, tasks), total=len(tasks), desc="Processing")
        )

    return pd.DataFrame(results)


In [None]:
# Scan the dataset rooted at this path and keep the per-file records in `df`.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
base_path = "/mnt/d/for-dataset/for-original/for-original"
df = analyze_sample_rates(base_path)


Found 26941 files in real
Found 26941 files in fake


Processing:  99%|█████████▉| 53535/53882 [00:43<00:00, 1191.74it/s]

In [None]:
# Count files per (category, sample_rate) pair, then pivot the rates into
# columns; combinations that never occur are shown as 0.
summary = (
    df.groupby(by=["category", "sample_rate"])
    .size()
    .unstack(level="sample_rate", fill_value=0)
)
print("Sample Rate Distribution by Category:")
summary


In [None]:
# Files that failed are stored with the string "error" in `sample_rate`, so the
# column can mix numbers and a string; a plain sorted() would then raise
# TypeError. Sort numeric rates first and put string sentinels after them.
# .tolist() converts NumPy scalars to plain Python values for a clean printout.
sample_rates = sorted(
    df["sample_rate"].unique().tolist(),
    key=lambda rate: (isinstance(rate, str), rate),
)

print(f"Total files analyzed: {len(df)}")
print(f"Unique sample rates found: {df['sample_rate'].nunique()}")
print(f"\nSample rates: {sample_rates}")


In [None]:
# Rows carrying the "error" sentinel in `sample_rate` are the files whose
# lookup failed; show which files they are and the recorded message.
errors = df.loc[df["sample_rate"].eq("error")]
print(f"Total errors: {len(errors)}")
errors.loc[:, ["filename", "category", "error"]]


In [None]:
# Repeat the scan for a second dataset root. This rebinds `base_path` and `df`,
# so every cell below this one reports on this dataset, not the previous one.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
base_path = "/mnt/d/for-dataset/for-norm/for-norm"
df = analyze_sample_rates(base_path)

In [None]:
# Count files per (category, sample_rate) pair, then pivot the rates into
# columns; combinations that never occur are shown as 0.
summary = (
    df.groupby(by=["category", "sample_rate"])
    .size()
    .unstack(level="sample_rate", fill_value=0)
)
print("Sample Rate Distribution by Category:")
summary


In [None]:
# Files that failed are stored with the string "error" in `sample_rate`, so the
# column can mix numbers and a string; a plain sorted() would then raise
# TypeError. Sort numeric rates first and put string sentinels after them.
# .tolist() converts NumPy scalars to plain Python values for a clean printout.
sample_rates = sorted(
    df["sample_rate"].unique().tolist(),
    key=lambda rate: (isinstance(rate, str), rate),
)

print(f"Total files analyzed: {len(df)}")
print(f"Unique sample rates found: {df['sample_rate'].nunique()}")
print(f"\nSample rates: {sample_rates}")

In [None]:
# Rows carrying the "error" sentinel in `sample_rate` are the files whose
# lookup failed; show which files they are and the recorded message.
errors = df.loc[df["sample_rate"].eq("error")]
print(f"Total errors: {len(errors)}")
errors.loc[:, ["filename", "category", "error"]]