# Préparer le dataset

In [None]:
import os

# Root folder of the raw dataset; it is expected to hold one sub-folder per genre.
raw_data_folder = './data/genres_original/'

# Keep only directories: stray files in the root folder (e.g. ".DS_Store")
# are not genres and would make the later os.listdir(<genre folder>) calls fail.
# Sorting makes the genre order reproducible across runs and machines.
genres = sorted(
    entry
    for entry in os.listdir(raw_data_folder)
    if os.path.isdir(os.path.join(raw_data_folder, entry))
)

print(genres)

In [None]:
from pydub import AudioSegment
import os

def split_wav_file(input_file_path, output_folder, index, clip_duration_ms=30 * 1000):
    """Split a WAV file into consecutive clips of equal length.

    Clips are written to ``output_folder`` as ``"<index>_clip_<k>.wav"`` with
    ``k`` starting at 1. Audio left over after the last full clip is
    discarded, so a file shorter than ``clip_duration_ms`` produces no clip.

    Args:
        input_file_path: Path of the WAV file to split.
        output_folder: Folder receiving the clips; created if missing.
        index: Prefix used in the clip file names to tell source files apart.
        clip_duration_ms: Length of each clip in milliseconds (default 30 s).

    Raises:
        ValueError: If ``clip_duration_ms`` is not strictly positive.

    Errors raised while reading or exporting audio are deliberately not
    propagated: they are printed (with the file name) and the file is skipped,
    so one bad file does not stop a batch run.
    """
    if clip_duration_ms <= 0:
        raise ValueError(f"clip_duration_ms must be positive, got {clip_duration_ms}")

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_folder, exist_ok=True)

    try:
        sound = AudioSegment.from_wav(input_file_path)

        # len() of the loaded segment is compared against a millisecond
        # duration, i.e. it is the total duration in milliseconds.
        total_clips = len(sound) // clip_duration_ms
        if total_clips == 0:
            print(f"No clip written for {input_file_path}: shorter than {clip_duration_ms} ms")

        for clip_number in range(1, total_clips + 1):
            end_time = clip_number * clip_duration_ms
            clip = sound[end_time - clip_duration_ms:end_time]
            clip_path = os.path.join(output_folder, f"{index}_clip_{clip_number}.wav")
            clip.export(clip_path, format="wav")
    except Exception as e:
        # Best effort: report which file failed and carry on.
        print(f"Error splitting {input_file_path}: {e}")

# Destination root for the clips; one sub-folder per genre is created below.
thirty_seconds_folder = './data/genres_30s/'

for genre in genres:
    genre_path = os.path.join(raw_data_folder, genre)
    if not os.path.isdir(genre_path):
        # Not a genre folder (e.g. a stray file in the raw data root).
        continue

    # makedirs also creates the destination root itself when it is missing,
    # which a plain os.mkdir would not do.
    genre_folder = os.path.join(thirty_seconds_folder, genre)
    os.makedirs(genre_folder, exist_ok=True)

    # Sorted so that the index given to each source file (and therefore the
    # clip file names) is the same on every run.
    music_files = sorted(os.listdir(genre_path))
    print (music_files)

    # Split every music file of the genre into clips (30 s by default).
    for i, music_file in enumerate(music_files):
        music_file_path = os.path.join(genre_path, music_file)
        split_wav_file(music_file_path, genre_folder, i)
    

In [None]:
import librosa
import pandas as pd
import numpy as np

def _summary_stats(prefix, values, include_std=True):
    """Return min/max/mean (and optionally std) of *values*, keyed by *prefix*."""
    stats = {
        f'{prefix}_min': np.min(values),
        f'{prefix}_max': np.max(values),
        f'{prefix}_mean': np.mean(values),
    }
    if include_std:
        stats[f'{prefix}_std'] = np.std(values)
    return stats


def extract_audio_features(file_path, label):
    """Compute summary statistics of several audio features for one file.

    The features are the STFT magnitude spectrogram, the estimated tempo,
    the chroma, the MFCCs and the spectral contrast. Each array feature is
    reduced to its global min / max / mean / std (the MFCCs have no std entry).

    Args:
        file_path: Path of the audio file to analyse.
        label: Value stored under the ``'label'`` key (the genre).

    Returns:
        A dict with the ``'label'`` and one scalar per statistic, or ``None``
        when any step fails (the error is printed with the file path).
    """
    try:
        # Load the audio, asking librosa for a 44.1 kHz sample rate.
        y, sr = librosa.load(file_path, sr=44100)

        spectrogram = np.abs(librosa.stft(y))

        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        # Depending on the librosa version the tempo may come back as a
        # one-element array instead of a scalar; store a plain float so the
        # CSV column holds numbers rather than strings like "[123.0]".
        tempo = float(np.atleast_1d(tempo)[0])

        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        mfccs = librosa.feature.mfcc(y=y, sr=sr)
        spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

        # Insertion order mirrors the column order used when the table is built.
        audio_features = {'label': label}
        audio_features.update(_summary_stats('spectrogram', spectrogram))
        audio_features['tempo'] = tempo
        audio_features.update(_summary_stats('chroma', chroma))
        # No 'mfccs_std': the feature table written by this file has no such column.
        audio_features.update(_summary_stats('mfccs', mfccs, include_std=False))
        audio_features.update(_summary_stats('spectral_contrast', spectral_contrast))
        return audio_features
    except Exception as e:
        # Best effort: a file that cannot be processed is reported and skipped.
        print(f"Error processing {file_path}: {e}")
        return None

# # test extract_audio_features pour 1 fichier
# file_test = './data/genres_3s/blues/clip_1.wav'

# extract_audio_features(file_test, 'blues')

In [None]:
# One feature dict per clip, across all genres.
df_list = []

for genre in genres:
    wav_folder = os.path.join(thirty_seconds_folder, genre)
    if not os.path.isdir(wav_folder):
        print(f"Skipping genre {genre}: {wav_folder} is not a directory.")
        continue

    # Sorted so that the row order of the resulting table is reproducible.
    wav_files = [
        os.path.join(wav_folder, file)
        for file in sorted(os.listdir(wav_folder))
        if file.endswith('.wav')
    ]

    for wav_file in wav_files:
        audio_features = extract_audio_features(wav_file, genre)
        if audio_features is not None:
            df_list.append(audio_features)
        else:
            print(f"Skipping {wav_file} due to processing error.")

    print('Done for genre: ', genre, ' with ', len(wav_files), ' files')

version = 2
feature_columns = [
    'label',
    'spectrogram_min', 'spectrogram_max', 'spectrogram_mean', 'spectrogram_std',
    'tempo',
    'chroma_min', 'chroma_max', 'chroma_mean', 'chroma_std',
    'mfccs_min', 'mfccs_max', 'mfccs_mean',
    'spectral_contrast_min', 'spectral_contrast_max', 'spectral_contrast_mean', 'spectral_contrast_std',
]
df = pd.DataFrame(df_list, columns=feature_columns)

# NOTE(review): the file name says "3s" although the clips are read from
# thirty_seconds_folder; it is left unchanged because other code may read
# this exact path -- confirm and rename if it is a leftover.
features_csv_path = './clean_data/v{version}_audio_features_3s.csv'.format(version=version)
# Make sure the output folder exists so the long extraction run above is not
# lost to a missing-directory error at save time.
os.makedirs(os.path.dirname(features_csv_path), exist_ok=True)
df.to_csv(features_csv_path, index=False)

# Run time of the cell above: about 17 min for the 3 s clips,
# about 15 min for the 30 s clips.

In [None]:
# Balance the dataset: every genre is randomly down-sampled to the size of
# the smallest genre, then the per-genre samples are stacked into df_final.

# Number of rows of the least represented genre (0 when df is empty).
label_counts = df['label'].value_counts()
min_rows = int(label_counts.min()) if not label_counts.empty else 0

# Genres without any row in df (e.g. every file failed to process) are
# skipped: sampling min_rows rows from an empty frame would raise.
balanced_parts = [
    df[df['label'] == genre].sample(min_rows)
    for genre in genres
    if genre in label_counts.index
]

# A single concat avoids growing a DataFrame inside the loop; with nothing to
# stack, fall back to an empty frame that keeps the columns of df.
if balanced_parts:
    df_final = pd.concat(balanced_parts, ignore_index=True)
else:
    df_final = df.iloc[0:0].copy()

# Save the balanced table, creating the output folder if needed.
balanced_csv_path = './clean_data/v{version}_audio_features_30s_equal.csv'.format(version=version)
os.makedirs(os.path.dirname(balanced_csv_path), exist_ok=True)
df_final.to_csv(balanced_csv_path, index=False)