In [2]:
#%pip install librosa
import pandas as pd 
import os 
import librosa 
import numpy as np 


In [3]:

meta = pd.read_csv("release_in_the_wild/meta.csv")
meta.head()
meta['label'].unique()
real_data = meta[meta['label']== "bona-fide"]
spoof_data = meta[meta['label']== "spoof"]

def load_bf_audio_files(sr = 22050):
    audio_files = []
    for file in real_data['file']: 
        y, _ = librosa.load(f'release_in_the_wild/{file}', sr = sr)
        audio_files.append(y)

    return audio_files

real_audio = load_bf_audio_files()
len(real_audio)
label = ["bona-fide"] * len(real_audio)
label[:10]

def load_spoof_audio_files(sr = 22050):
    audio_files = []
    for file in spoof_data['file']: 
        y, _ = librosa.load(f'release_in_the_wild/{file}', sr = sr)
        audio_files.append(y)

    return audio_files
spoof_audio = load_spoof_audio_files()
spoof_label = ['spoof'] * len(spoof_audio)
len(spoof_label)
label.extend(spoof_label)
len(label)
audio_data = real_audio.copy()
audio_data.extend(spoof_audio)
len(audio_data)

31779

In [6]:
array = pd.Series(label)
value_counts = array.value_counts()
print(value_counts)

bona-fide    19963
spoof        11816
dtype: int64


## Audio Feature Extraction

### Audio Features Description

#### Time-Domain Features 
These features focus on analyzing the audio waveform directly over time. 
1. **Zero Crossing Rate:**  Counts how often the audio signal crosses the zero amplitude line. It’s a measure of **noisiness** or **percussiveness**, with higher rates indicating noisier sounds.

2. **RMS (Root Mean Square Energy):**  Measures the **energy** (or loudness) of the audio signal over time. It provides a sense of how strong or weak the audio is at different moments.


#### Frequency-Domain Features 
These techniques convert audio from the time domain to the frequency domain, where more information about pitch and timbre is revealed.

1. **MFCCs (Mel-Frequency Cepstral Coefficients):**  A set of features that represent the **short-term power spectrum** of sound based on the Mel scale, which approximates human hearing. MFCCs are widely used in speech and audio processing, especially for tasks like **speech recognition** and **audio classification**. The first few coefficients capture the general shape of the spectrum (timbre), while the higher ones capture more detailed characteristics.

2. **Spectral Centroid:**  Indicates the "center of mass" of the spectrum, representing the **brightness** of the sound. Higher values correspond to brighter sounds with more high frequencies.

3. **Spectral Bandwidth:**  Measures the range of frequencies around the spectral centroid. It represents the **spread** or **width** of the spectrum, capturing whether the sound is concentrated in a narrow or wide range of frequencies.

4. **Spectral Rolloff:**  The frequency below which a certain percentage (e.g., 85%) of the total spectral energy lies. It’s often used to differentiate between **tonal** and **noisy** signals.


#### Time-Frequency Domain Features
These techniques combine both time and frequency information for better detection.

1. **Chroma STFT (Short-Time Fourier Transform):**  Represents the **pitch class profile**, showing how energy is distributed across different pitch classes (like musical notes) over time. Useful for identifying harmonic and melodic content in the audio.




In [12]:
# Sampling rate (standardize to 16 kHz)
target_sr = 16000

# Initialize an empty list to hold feature dictionaries
features_list = []

# Loop through each audio file path
for i in range(len(audio_data)):
    # Load audio file and resample to target_sr
    y = audio_data[i]
    sr = target_sr
    
    # Extract features
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))  # Chroma STFT
    rms = np.mean(librosa.feature.rms(y=y))  # Root Mean Square Energy
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))  # Spectral Centroid
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))  # Spectral Bandwidth
    rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))  # Spectral Rolloff
    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=y))  # Zero Crossing Rate
    
    # MFCCs (first 20 coefficients)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfccs_mean = np.mean(mfccs, axis=1)  # Take mean of each coefficient over time
    
    # Prepare feature dictionary for the current audio file
    features_dict = {
        'chroma_stft': chroma_stft,
        'rms': rms,
        'spectral_centroid': spectral_centroid,
        'spectral_bandwidth': spectral_bandwidth,
        'rolloff': rolloff,
        'zero_crossing_rate': zero_crossing_rate
    }
    
    # Add MFCCs (mfcc1, mfcc2, ..., mfcc20)
    for j in range(1, 21):
        features_dict[f'mfcc{j}'] = mfccs_mean[j-1]
    
    # Optionally, you can also add a label if you're working on a supervised task
    features_dict['label'] = label[i] # Example label
    print(i,label[i])
    
    # Append feature dictionary to the list
    features_list.append(features_dict)

# Convert the list of feature dictionaries to a pandas DataFrame
features_df = pd.DataFrame(features_list)

# Now you have a DataFrame with all extracted features
print(features_df.head())

0 bona-fide
1 bona-fide
2 bona-fide
3 bona-fide
4 bona-fide
5 bona-fide
6 bona-fide
7 bona-fide
8 bona-fide
9 bona-fide
10 bona-fide
11 bona-fide
12 bona-fide
13 bona-fide
14 bona-fide
15 bona-fide
16 bona-fide
17 bona-fide
18 bona-fide
19 bona-fide
20 bona-fide
21 bona-fide
22 bona-fide
23 bona-fide
24 bona-fide
25 bona-fide
26 bona-fide
27 bona-fide
28 bona-fide
29 bona-fide
30 bona-fide
31 bona-fide
32 bona-fide
33 bona-fide
34 bona-fide
35 bona-fide
36 bona-fide
37 bona-fide
38 bona-fide
39 bona-fide
40 bona-fide
41 bona-fide
42 bona-fide
43 bona-fide
44 bona-fide
45 bona-fide
46 bona-fide
47 bona-fide
48 bona-fide
49 bona-fide
50 bona-fide
51 bona-fide
52 bona-fide
53 bona-fide
54 bona-fide
55 bona-fide
56 bona-fide
57 bona-fide
58 bona-fide
59 bona-fide
60 bona-fide
61 bona-fide
62 bona-fide
63 bona-fide
64 bona-fide
65 bona-fide
66 bona-fide
67 bona-fide
68 bona-fide
69 bona-fide
70 bona-fide
71 bona-fide
72 bona-fide
73 bona-fide
74 bona-fide
75 bona-fide
76 bona-fide
77 bona-f

  return pitch_tuning(


9494 bona-fide
9495 bona-fide
9496 bona-fide
9497 bona-fide
9498 bona-fide
9499 bona-fide
9500 bona-fide
9501 bona-fide
9502 bona-fide
9503 bona-fide
9504 bona-fide
9505 bona-fide
9506 bona-fide
9507 bona-fide
9508 bona-fide
9509 bona-fide
9510 bona-fide
9511 bona-fide
9512 bona-fide
9513 bona-fide
9514 bona-fide
9515 bona-fide
9516 bona-fide
9517 bona-fide
9518 bona-fide
9519 bona-fide
9520 bona-fide
9521 bona-fide
9522 bona-fide
9523 bona-fide
9524 bona-fide
9525 bona-fide
9526 bona-fide
9527 bona-fide
9528 bona-fide
9529 bona-fide
9530 bona-fide
9531 bona-fide
9532 bona-fide
9533 bona-fide
9534 bona-fide
9535 bona-fide
9536 bona-fide
9537 bona-fide
9538 bona-fide
9539 bona-fide
9540 bona-fide
9541 bona-fide
9542 bona-fide
9543 bona-fide
9544 bona-fide
9545 bona-fide
9546 bona-fide
9547 bona-fide
9548 bona-fide
9549 bona-fide
9550 bona-fide
9551 bona-fide
9552 bona-fide
9553 bona-fide
9554 bona-fide
9555 bona-fide
9556 bona-fide
9557 bona-fide
9558 bona-fide
9559 bona-fide
9560 bona-

In [16]:
len(features_df)
features_df.to_excel("audio_features1.xlsx")

In [15]:
features_df['label'].unique()

array(['bona-fide', 'spoof'], dtype=object)