In [1]:
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.audio_utils import get_sample_rate

In [4]:
def process_file(args):
    filepath, filename, category = args
    try:
        sample_rate = get_sample_rate(filepath)
        return {"filename": filename, "category": category, "sample_rate": sample_rate, "filepath": filepath, "error": None}
    except Exception as e:
        return {"filename": filename, "category": category, "sample_rate": "error", "filepath": filepath, "error": str(e)}

def analyze_sample_rates(base_path, max_workers=12):
    categories = ["real", "fake"]
    tasks = []
    
    for category in categories:
        folder_path = os.path.join(base_path, "training", category)
        if not os.path.exists(folder_path):
            print(f"Warning: {folder_path} does not exist")
            continue
        
        files = os.listdir(folder_path)
        print(f"Found {len(files)} files in {category}")
        
        for filename in files:
            filepath = os.path.join(folder_path, filename)
            tasks.append((filepath, filename, category))
    
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_file, task): task for task in tasks}
        for future in tqdm(as_completed(futures), total=len(tasks), desc="Processing"):
            results.append(future.result())
    
    return pd.DataFrame(results)


In [3]:
base_path = "/mnt/d/for-dataset/for-original/for-original"
df = analyze_sample_rates(base_path)


Found 26941 files in real
Found 26941 files in fake


Processing: 100%|██████████| 53882/53882 [00:43<00:00, 1241.60it/s]


In [5]:
summary = df.groupby(["category", "sample_rate"]).size().unstack(fill_value=0)
print("Sample Rate Distribution by Category:")
summary


Sample Rate Distribution by Category:


sample_rate,16000,22050,24000,44100,error
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fake,16702,6888,3337,0,14
real,13863,10174,0,2904,0


In [6]:
print(f"Total files analyzed: {len(df)}")
print(f"Unique sample rates found: {df['sample_rate'].nunique()}")
print(f"\nSample rates: {sorted(df['sample_rate'].unique())}")


Total files analyzed: 53882
Unique sample rates found: 5


TypeError: '<' not supported between instances of 'str' and 'int'

In [7]:
errors = df[df["sample_rate"] == "error"]
print(f"Total errors: {len(errors)}")
errors[["filename", "category", "error"]]


Total errors: 14


Unnamed: 0,filename,category,error
30009,file13424.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
32064,file15746.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
32856,file16643.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
33511,file17407.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
33551,file17450.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
35654,file19851.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
42070,file27206.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
42469,file27643.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
42632,file27839.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...
45402,file30959.mp3,fake,soxi failed for /mnt/d/for-dataset/for-origina...


In [8]:
base_path = "/mnt/d/for-dataset/for-norm/for-norm"
df = analyze_sample_rates(base_path)

Found 26941 files in real
Found 26927 files in fake


Processing: 100%|██████████| 53868/53868 [00:45<00:00, 1175.38it/s]


In [9]:
summary = df.groupby(["category", "sample_rate"]).size().unstack(fill_value=0)
print("Sample Rate Distribution by Category:")
summary


Sample Rate Distribution by Category:


sample_rate,16000
category,Unnamed: 1_level_1
fake,26927
real,26941


In [10]:
print(f"Total files analyzed: {len(df)}")
print(f"Unique sample rates found: {df['sample_rate'].nunique()}")
print(f"\nSample rates: {sorted(df['sample_rate'].unique())}")

Total files analyzed: 53868
Unique sample rates found: 1

Sample rates: [np.int64(16000)]


In [11]:
errors = df[df["sample_rate"] == "error"]
print(f"Total errors: {len(errors)}")
errors[["filename", "category", "error"]]

Total errors: 0


Unnamed: 0,filename,category,error
