In [10]:
from datasets import load_dataset
import librosa
import numpy as np
import pandas as pd

# Load the dataset
dataset = load_dataset("audiofolder", data_dir="/kaggle/input/the-fake-or-real-dataset/for-2sec/for-2seconds")

Resolving data files:   0%|          | 0/13956 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2826 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1088 [00:00<?, ?it/s]

In [11]:
import librosa
import numpy as np

def extract_features(audio_file):
    y, sr = librosa.load(audio_file, sr=None)
    
    # Adjust n_fft based on the length of the signal
    n_fft = 64  # Use a smaller n_fft for shorter signals
    
    # MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=60, n_fft=n_fft)
    mfcc_mean = np.mean(mfcc, axis=1)
    
    # Delta1 and Delta2
    delta1 = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    delta1_mean = np.mean(delta1, axis=1)
    delta2_mean = np.mean(delta2, axis=1)
    
    # ZCR
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    
    # Spectral features
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=n_fft))
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=n_fft))
    rms = np.mean(librosa.feature.rms(y=y, frame_length=n_fft))
    
    # Chroma features
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr, n_fft=n_fft))
    chroma_cqt = np.mean(librosa.feature.chroma_cqt(y=y, sr=sr))
    chroma_cens = np.mean(librosa.feature.chroma_cens(y=y, sr=sr))
    
    # Entropy
    # Filter out non-positive values from mfcc_mean
    positive_mfcc_mean = mfcc_mean[mfcc_mean > 0]
    entropy = -np.sum(np.log(positive_mfcc_mean) * positive_mfcc_mean)
    
    # Spectral Flatness
    spectral_flatness = np.mean(librosa.feature.spectral_flatness(y=y, n_fft=n_fft))
    
    # Spectral Bandwidth
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=n_fft))
    
    # Spectral Contrast
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=n_fft))
    
    # Poly features
    poly = np.mean(librosa.feature.poly_features(y=y, sr=sr))
    
    # Tempogram
    tempogram = np.mean(librosa.feature.tempogram(y=y, sr=sr))
    
    # Tonnetz
    tonnetz = np.mean(librosa.feature.tonnetz(y=y, sr=sr))
    
    # Combine all features into a single array
    features = np.hstack([mfcc_mean, delta1_mean, delta2_mean, zcr, spectral_rolloff, spectral_centroid, rms, 
                          chroma_stft, chroma_cqt, chroma_cens, entropy, spectral_flatness, spectral_bandwidth, 
                          spectral_contrast, poly, tempogram, tonnetz])
    print (f"Length of Features extracted: {len(features)}")
    return features

In [12]:
def save_features_to_csv(dataset, subset_name):
    features_list = []
    labels = []
    
    for item in dataset[subset_name]:
        audio_file = item["audio"]["path"]  # Accessing the path within the 'audio' dictionary
        label = item["label"]
        features = extract_features(audio_file)
        features_list.append(features)
        labels.append(label)
    
    # Convert to DataFrame
    features_df = pd.DataFrame(features_list)
    features_df['label'] = labels
    
    # Save to CSV
    features_df.to_csv(f'{subset_name}_features.csv', index=False)

In [13]:
save_features_to_csv(dataset, "train")

Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Feat

  return pitch_tuning(


Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Feat

In [14]:
save_features_to_csv(dataset, "validation")

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Feat

  return pitch_tuning(


Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Feat

In [15]:
save_features_to_csv(dataset, "test")

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Features extracted: 194
Length of Feat