# Data Exploration

In [1]:
import pandas as pd

In [2]:

# Read the dataset's metadata CSV (relative path) into the DataFrame `df`
df = pd.read_csv(filepath_or_buffer="../../in-the-wild-audio-deepfake/meta.csv")

In [3]:
# Count rows per class: .eq() yields a boolean Series, and summing it counts the True values
total_spoof = df['label'].eq('spoof').sum()
total_bonafide = df['label'].eq('bona-fide').sum()

# Report the two class totals
print(f"Total spoofed samples: {total_spoof}")
print(f"Total bona-fide samples: {total_bonafide}")

Total spoofed samples: 11816
Total bona-fide samples: 19963


In [4]:


# Build the speaker-by-label count table in a single chained expression
result = (
    df.pivot_table(
        index='speaker',    # one row per distinct speaker
        columns='label',    # one column per distinct label value
        aggfunc='size',     # each cell = number of rows for that speaker/label pair
        fill_value=0,       # pairs with no rows become 0 rather than NaN
    )
    .reset_index()                # move 'speaker' out of the index into a regular column
    .rename_axis(columns=None)    # clear the columns-axis name left behind by the pivot
    .rename(columns={
        'spoof': 'num_spoof',
        'bona-fide': 'num_bona_fide',
    })
)

# Show the per-speaker counts
print(result)


                     speaker  num_bona_fide  num_spoof
0                       2Pac             59        101
1                Adam Driver             80        137
2                 Alan Watts             86        292
3              Alec Guinness           1907       1718
4   Alexandria Ocasio-Cortez            290        100
5      Arnold Schwarzenegger            243        108
6                   Ayn Rand           1389       1104
7               Barack Obama           3302        334
8             Bernie Sanders           1623       1254
9                  Bill Burr            144         57
10              Bill Clinton           1129        703
11             Billie Eilish              9          5
12                  Bob Ross             60         48
13             Boris Johnson            209         86
14           Calvin Coolidge             58         15
15      Christopher Hitchens            798        541
16            Dave Chappelle             13          9
17        

In [5]:
import pandas as pd

# Minority-class share of a speaker's samples below which the speaker is
# treated as significantly imbalanced (0.2 == 20 %).
IMBALANCE_THRESHOLD = 0.2

# Add two helper columns to `result`:
#   total     - number of samples for the speaker across both classes
#   min_ratio - share of that total contributed by the smaller class
# They are intentionally left on `result`: the later cell that builds
# `balanced_speakers` drops them by name.
result['total'] = result['num_spoof'] + result['num_bona_fide']
result['min_ratio'] = result[['num_spoof', 'num_bona_fide']].min(axis=1) / result['total']

# Keep only the speakers under the threshold, without the helper columns
significantly_imbalanced = (
    result.loc[result['min_ratio'] < IMBALANCE_THRESHOLD]
    .drop(columns=['total', 'min_ratio'])
)

# Show the flagged speakers
print(significantly_imbalanced)


                 speaker  num_bona_fide  num_spoof
7           Barack Obama           3302        334
17          Donald Trump           3268        155
26         Jeff Goldblum             56          4
27        Jerry Seinfeld              8         43
30         Kamala Harris             25          4
32            Louis C.K.              9         57
34        Lyndon Johnson             95         12
39         Mitch Hedberg              3         28
42         Nick Offerman            158         11
44          Orson Welles             26        176
50  The Notorious B.I.G.             14        315
52    William F. Buckley              4         22


In [None]:

# For each flagged speaker, cap both classes at the size of the smaller class.
# The per-speaker minimum is taken once and written to both count columns.
minority_count = significantly_imbalanced[['num_spoof', 'num_bona_fide']].min(axis=1)
corrected_imbalanced = significantly_imbalanced.assign(
    num_spoof=minority_count,
    num_bona_fide=minority_count,
)

# Speakers that were not flagged keep their original counts; the helper
# columns added to `result` earlier are removed so both frames line up.
not_flagged = ~result['speaker'].isin(significantly_imbalanced['speaker'])
balanced_speakers = result.loc[not_flagged].drop(columns=['total', 'min_ratio'])

# Stack the untouched and the corrected speakers into one table
final_df = pd.concat([balanced_speakers, corrected_imbalanced], ignore_index=True)

# Dataset-wide class totals after the correction
total_spoof = final_df['num_spoof'].sum()
total_bona_fide = final_df['num_bona_fide'].sum()

print('---------------------------------------------------------------------------')
print(f"Total spoofed samples after correcting speaker imbalance: {total_spoof}")
print(f"Total bona-fide samples after correcting speaker imbalance: {total_bona_fide}")


                 speaker  num_bona_fide  num_spoof
7           Barack Obama            334        334
17          Donald Trump            155        155
26         Jeff Goldblum              4          4
27        Jerry Seinfeld              8          8
30         Kamala Harris              4          4
32            Louis C.K.              9          9
34        Lyndon Johnson             12         12
39         Mitch Hedberg              3          3
42         Nick Offerman             11         11
44          Orson Welles             26         26
50  The Notorious B.I.G.             14         14
52    William F. Buckley              4          4
                     speaker  num_bona_fide  num_spoof
0                       2Pac             59        101
1                Adam Driver             80        137
2                 Alan Watts             86        292
3              Alec Guinness           1907       1718
4   Alexandria Ocasio-Cortez            290        100
5      

In [None]:
# Cap both classes of every speaker (not only the flagged ones) at that
# speaker's smaller class count, working on a new frame so `result` is untouched.
per_speaker_min = result[['num_spoof', 'num_bona_fide']].min(axis=1)
test = result.assign(num_spoof=per_speaker_min, num_bona_fide=per_speaker_min)

# Dataset-wide class totals once every speaker is balanced
total_spoof = test['num_spoof'].sum()
total_bona_fide = test['num_bona_fide'].sum()

print(f"Total spoofed samples after correcting speaker imbalance: {total_spoof}")
print(f"Total bona-fide samples after correcting speaker imbalance: {total_bona_fide}")

Total spoofed samples after correcting speaker imbalance: 9820
Total bona-fide samples after correcting speaker imbalance: 9820


: 

<h3>Comments</h3>
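<p>The data set is moderately imbalanced. The proposed solution displayed above identifies the imbalanced speakers and oversamples or downsamples only those speakers. (The counts shown above simulate downsampling: each affected speaker's larger class is reduced to the size of the smaller one.)</p>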
