# Data Exploration 

In [2]:
import pandas as pd
import os

In [4]:
# Load the dataset metadata (meta.csv) into a pandas DataFrame.
# The path is resolved against the current working directory, so the notebook
# must be started from the folder that contains "in-the-wild-audio-deepfake".
data_path = os.path.join(os.getcwd(), "in-the-wild-audio-deepfake", "meta.csv")
if not os.path.exists(data_path):
    # Raise instead of calling exit(): inside a notebook kernel exit() asks the
    # kernel to shut down rather than cleanly stopping this cell, whereas an
    # exception halts execution right here with a clear message.
    raise FileNotFoundError(f"File not found: {data_path}")
df = pd.read_csv(data_path)

In [5]:
# Count the rows carrying each of the two label values
total_spoof = df['label'].eq('spoof').sum()
total_bonafide = df['label'].eq('bona-fide').sum()

# Report both totals, one per line
print(
    f"Total spoofed samples: {total_spoof}",
    f"Total bona-fide samples: {total_bonafide}",
    sep="\n",
)

Total spoofed samples: 11816
Total bona-fide samples: 19963


In [6]:


# Build a per-speaker table with one count column per label.
# Steps of the chain:
#   1. pivot_table with aggfunc='size' counts the rows for every
#      speaker/label pair; pairs that never occur become 0 instead of NaN
#   2. reset_index turns 'speaker' back into an ordinary column
#   3. rename_axis drops the 'label' name that pivot_table puts on the columns
#   4. rename gives the two label columns descriptive names
result = (
    df.pivot_table(index='speaker', columns='label', aggfunc='size', fill_value=0)
    .reset_index()
    .rename_axis(None, axis='columns')
    .rename(columns={'bona-fide': 'num_bona_fide', 'spoof': 'num_spoof'})
)

# Show the per-speaker counts
print(result)


                     speaker  num_bona_fide  num_spoof
0                       2Pac             59        101
1                Adam Driver             80        137
2                 Alan Watts             86        292
3              Alec Guinness           1907       1718
4   Alexandria Ocasio-Cortez            290        100
5      Arnold Schwarzenegger            243        108
6                   Ayn Rand           1389       1104
7               Barack Obama           3302        334
8             Bernie Sanders           1623       1254
9                  Bill Burr            144         57
10              Bill Clinton           1129        703
11             Billie Eilish              9          5
12                  Bob Ross             60         48
13             Boris Johnson            209         86
14           Calvin Coolidge             58         15
15      Christopher Hitchens            798        541
16            Dave Chappelle             13          9
17        

In [7]:
import pandas as pd

# Works on the per-speaker count table `result` built in the previous cell.
# Two helper columns are added to `result` itself ('total' and 'min_ratio').

# Number of samples per speaker across both labels
result['total'] = result['num_bona_fide'].add(result['num_spoof'])

# Share of each speaker's samples that belongs to the smaller of the two classes
result['min_ratio'] = result[['num_spoof', 'num_bona_fide']].min(axis=1).div(result['total'])

# Keep only speakers whose smaller class is below 20% of their samples,
# and leave the helper columns out of the displayed table
significantly_imbalanced = (
    result
    .loc[result['min_ratio'].lt(0.2)]
    .drop(columns=['total', 'min_ratio'])
)

# Show the imbalanced speakers
print(significantly_imbalanced)


                 speaker  num_bona_fide  num_spoof
7           Barack Obama           3302        334
17          Donald Trump           3268        155
26         Jeff Goldblum             56          4
27        Jerry Seinfeld              8         43
30         Kamala Harris             25          4
32            Louis C.K.              9         57
34        Lyndon Johnson             95         12
39         Mitch Hedberg              3         28
42         Nick Offerman            158         11
44          Orson Welles             26        176
50  The Notorious B.I.G.             14        315
52    William F. Buckley              4         22


In [None]:

# Downsample the imbalanced speakers: both counts become the size of the
# smaller class. The assignments are applied in order, so the second one sees
# the already-updated 'num_spoof' and still resolves to the same minimum.
corrected_imbalanced = significantly_imbalanced.assign(
    num_spoof=lambda frame: frame[['num_spoof', 'num_bona_fide']].min(axis=1),
    num_bona_fide=lambda frame: frame[['num_spoof', 'num_bona_fide']].min(axis=1),
)

# Speakers that were not flagged as imbalanced keep their original counts
balanced_speakers = (
    result
    .drop(columns=['total', 'min_ratio'])
    .loc[lambda frame: ~frame['speaker'].isin(significantly_imbalanced['speaker'])]
)

# Stack untouched and corrected speakers into one table
final_df = pd.concat([balanced_speakers, corrected_imbalanced], ignore_index=True)

# Dataset-wide totals after the correction
total_spoof = final_df.loc[:, 'num_spoof'].sum()
total_bona_fide = final_df.loc[:, 'num_bona_fide'].sum()

print('---------------------------------------------------------------------------')
print(f"Total spoofed samples after correcting speaker imbalance: {total_spoof}")
print(f"Total bona-fide samples after correcting speaker imbalance: {total_bona_fide}")


                 speaker  num_bona_fide  num_spoof
7           Barack Obama            334        334
17          Donald Trump            155        155
26         Jeff Goldblum              4          4
27        Jerry Seinfeld              8          8
30         Kamala Harris              4          4
32            Louis C.K.              9          9
34        Lyndon Johnson             12         12
39         Mitch Hedberg              3          3
42         Nick Offerman             11         11
44          Orson Welles             26         26
50  The Notorious B.I.G.             14         14
52    William F. Buckley              4          4
                     speaker  num_bona_fide  num_spoof
0                       2Pac             59        101
1                Adam Driver             80        137
2                 Alan Watts             86        292
3              Alec Guinness           1907       1718
4   Alexandria Ocasio-Cortez            290        100
5      

In [25]:
from utils.utils import get_file_path
from utils.audio_utils import get_wav_duration
from tqdm import tqdm

# Folder that holds the audio files, relative to the working directory
dataset = os.path.join("in-the-wild-audio-deepfake", "release_in_the_wild")

# Maps speaker -> summed duration per label ('bona-fide' / 'spoof') plus the
# number of files whose duration could be read ('files_num').
speaker_durations = {}

# Accumulate the duration of every file listed in the metadata DataFrame
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing audio files"):
    speaker = row['speaker']
    label = row['label']
    file = row['file']

    # First time this speaker is seen: start all counters at zero
    if speaker not in speaker_durations:
        speaker_durations[speaker] = {'bona-fide': 0.0, 'spoof': 0.0, 'files_num': 0}

    try:
        audio_file_path = get_file_path(file=file, dataset_pathing=dataset, label=label)
        # presumably a duration in seconds (the output columns end in _sec) —
        # TODO confirm against utils.audio_utils.get_wav_duration
        duration = get_wav_duration(audio_file_path)
        speaker_durations[speaker][label] += duration
        speaker_durations[speaker]['files_num'] += 1
    except FileNotFoundError as e:
        # Best effort: a missing file is reported and left out of the totals
        print(f"Warning: {e}")
        continue

# One row per speaker, with the summed durations rounded to 4 decimals
duration_df = pd.DataFrame([
    {
        'speaker': speaker,
        'total_bona_fide_duration_sec': round(durations['bona-fide'], 4),
        'total_spoof_duration_sec': round(durations['spoof'], 4),
        'files_num': durations['files_num']
    }
    for speaker, durations in speaker_durations.items()
])

# Largest spoofed duration first, with a fresh 0..n-1 index
duration_df = duration_df.sort_values(by='total_spoof_duration_sec', ascending=False).reset_index(drop=True)

# Make sure the output folder exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs("metadata", exist_ok=True)
duration_df.to_csv("metadata/duration_df.csv", index=True)

# Let wide frames render on a single line instead of wrapping
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

duration_df



Processing audio files: 100%|██████████| 31779/31779 [00:10<00:00, 3141.30it/s]


Unnamed: 0,speaker,total_bona_fide_duration_sec,total_spoof_duration_sec,files_num
0,Alec Guinness,5719.9129,10019.1668,3625
1,Bernie Sanders,6476.4183,6926.1252,2877
2,Ayn Rand,5097.5028,5469.5592,2493
3,Ronald Reagan,2063.5556,4423.3716,1536
4,Bill Clinton,3194.2682,3035.303,1832
5,Christopher Hitchens,3119.7987,2817.7714,1339
6,Martin Luther King,1671.6901,1558.9021,799
7,Queen Elizabeth II,627.5351,1486.3294,464
8,Mark Zuckerberg,2068.5109,1480.4574,582
9,Milton Friedman,1276.0489,1474.8289,589


In [None]:
# Hand-maintained list of the speakers treated as female in this analysis
female_speakers = [
    'Alexandria Ocasio-Cortez', 'Ayn Rand', 'Billie Eilish',
    'Kamala Harris', 'Queen Elizabeth II', 'Scarlett Johansson',
]

# Rows of the duration table that belong to those speakers
female_df = duration_df.loc[duration_df['speaker'].isin(female_speakers)]
female_df

Unnamed: 0,speaker,total_bona_fide_duration_sec,total_spoof_duration_sec,files_num
2,Ayn Rand,5097.5028,5469.5592,2493
7,Queen Elizabeth II,627.5351,1486.3294,464
19,Alexandria Ocasio-Cortez,1354.3207,876.3967,390
36,Scarlett Johansson,200.2859,327.187,73
51,Billie Eilish,71.9954,66.869,14
53,Kamala Harris,126.2048,25.8331,29


In [33]:
# Every speaker that is not in the female_speakers list
male_df = duration_df.loc[lambda frame: ~frame['speaker'].isin(female_speakers)]
male_df

Unnamed: 0,speaker,total_bona_fide_duration_sec,total_spoof_duration_sec,files_num
0,Alec Guinness,5719.9129,10019.1668,3625
1,Bernie Sanders,6476.4183,6926.1252,2877
3,Ronald Reagan,2063.5556,4423.3716,1536
4,Bill Clinton,3194.2682,3035.303,1832
5,Christopher Hitchens,3119.7987,2817.7714,1339
6,Martin Luther King,1671.6901,1558.9021,799
8,Mark Zuckerberg,2068.5109,1480.4574,582
9,Milton Friedman,1276.0489,1474.8289,589
10,Barack Obama,13285.9256,1329.0767,3636
11,JFK,1150.0427,1179.6457,669


In [31]:
# Column-wise totals over the female speakers: spoof duration, bona-fide duration, file count
print(
    female_df.loc[:, ['total_spoof_duration_sec', 'total_bona_fide_duration_sec', 'files_num']].sum()
)

total_spoof_duration_sec        8252.1744
total_bona_fide_duration_sec    7477.8447
files_num                       3463.0000
dtype: float64


<h3>Comments</h3>
<p>The dataset is moderately imbalanced. The proposed solution described above identifies the imbalanced speakers and downsamples only those speakers.
