Songs - The participant's mood/color/emotion perception of songs
    SongID - ID (also filename) of song in our dataset
    Perceived - A list of emotions which the song perceived by the participant
        V, A - The position of the chosen perceived emotion in the valence-arousal space [0,1]
    SongColor - The participant's choice of best-matching color of a song in the HSV space (H - hue [0-1], S - saturation [0-1], V -value [0-1])


convert mp4 audio to delta capstrum feature vector?

input: dcf vector --> array of vectors

test/output: perceivedVA, songcolor --> sort by song_id (rows), columns will be HSV color values

In [41]:
from bs4 import BeautifulSoup
import os
import librosa
import numpy as np
from collections import defaultdict
import csv

In [42]:
def extract_features(file_path, sr=22050, duration=5.0, hop_size=2.5):
    """
    Extracts delta cepstrum features using a sliding window approach.
    :param file_path: Path to the audio file.
    :param sr: Sampling rate.
    :param duration: Fixed length of each segment (seconds).
    :param hop_size: Overlapping step size between segments (seconds).
    :return: A list of feature arrays.
    """
    y, _ = librosa.load(file_path, sr=sr)
    segment_length = int(sr * duration)
    hop_length = int(sr * hop_size)

    features = []
    
    # Iterate over the audio in sliding windows
    for start in range(0, len(y) - segment_length + 1, hop_length):
        y_segment = y[start:start + segment_length]
        
        mfcc = librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=13) # mel-frequency cepstrum features
        dcf = librosa.feature.delta(mfcc) # delta cepstrum features

        features.append(dcf)

    return np.array(features)

In [43]:
n_songs = len(os.listdir('../../../moodo/audio'))
for i, file in enumerate(os.listdir('../../../moodo/audio'), 1):
    
    song_id = file.split('.')[0]
    file_path = os.path.join('../../../moodo/audio', file)
    
    features = extract_features(file_path)
    
    os.makedirs('../../../moodo/audio_features2', exist_ok=True)
    
    np.save(f'../../../moodo/audio_features2/{song_id}.npy', features)
    print(f'Saved {i}/{n_songs} songs           ', end='\r')

Saved 200/200 songs           

In [35]:
with open('../../../moodo/dataset.xml', 'r') as file:
    soup = BeautifulSoup(file, 'xml')
    
hsv_values = defaultdict(list)
perceived_emotions = defaultdict(list)

for item in soup.find_all('item'):
    for song in item.find_all('Songs'):
        for song_item in song.find_all('item'):
            song_id = song_item.find('SongID').text
            song_color = song_item.find('SongColor')
            
            h = float(song_color.find('H').text)
            s = float(song_color.find('S').text)
            v = float(song_color.find('V').text)
            
            hsv_values[song_id].append([h, s, v])
            
            perceived_emotions[song_id] = {}

            for emotion_tag in song_item.find_all('Perceived'):
                for emotion in emotion_tag.find_all(True):  # True: all tags inside 'Perceived'
                    emotion_name = emotion.name
                    v_value = emotion.find('V')
                    a_value = emotion.find('A')

                    # Ensure we only append valid values (not None)
                    if v_value and a_value:
                        v_value = float(v_value.text)
                        a_value = float(a_value.text)

                        # Store the emotion and its V and A values
                        if emotion_name not in perceived_emotions[song_id]:
                            perceived_emotions[song_id][emotion_name] = []

                        perceived_emotions[song_id][emotion_name].append([v_value, a_value])

# calculating averages for HSV and perceived emotions
avg_hsv = {
    song_id: np.mean(values, axis=0) for song_id, values in hsv_values.items()
}
for song_id, emotions in perceived_emotions.items():
    if emotions:
        most_common_emotion = max(emotions, key=lambda x: len(emotions[x]))
        avg_emotion = np.mean(emotions[most_common_emotion], axis=0)
        perceived_emotions[song_id] = (most_common_emotion, avg_emotion)

# printing results of data extraction/processing:
for song_id, hsv in avg_hsv.items():
    print(f"SongID {song_id}: HSV {hsv}, Perceived Emotion {perceived_emotions[song_id]}")


SongID 101: HSV [0.29333333 0.65833333 0.86666667], Perceived Emotion ('Surprise', array([0.8656, 0.5836]))
SongID 153: HSV [0.38761905 0.67857143 1.        ], Perceived Emotion ('Happiness', array([ 0.6426, -0.2361]))
SongID 209: HSV [0.47208333 0.6        1.        ], Perceived Emotion ('Gloominess', array([-0.5705, -0.8   ]))
SongID 242: HSV [0.23995098 0.59558824 0.80882353], Perceived Emotion ('Anger', array([-0.8787,  0.9377]))
SongID 504: HSV [0.18568376 0.56410256 0.74358974], Perceived Emotion ('Liveliness', array([0.1836, 0.8459]))
SongID 513: HSV [0.43851351 0.62837838 0.97297297], Perceived Emotion ('Dreamy', array([ 0.2426, -0.2426]))
SongID 525: HSV [0.28416667 0.75       0.975     ], Perceived Emotion ('Joy', array([0.8066, 0.7016]))
SongID 551: HSV [0.43062016 0.56976744 0.91860465], Perceived Emotion ('Sadness', array([0.2689, 0.6951]))
SongID 607: HSV [0.17824074 0.45833333 0.64583333], Perceived Emotion ('Surprise', array([-0.8197,  0.3016]))
SongID 618: HSV [0.22785

In [None]:
# Sort dictionaries by song_id numerically
sorted_avg_hsv = dict(sorted(avg_hsv.items(), key=lambda x: int(x[0])))
sorted_perceived_emotions = dict(sorted(perceived_emotions.items(), key=lambda x: int(x[0])))

# merge dictionaries to create final dataset array, each column would be h, s, v, emotion, v, a
dataset = []
for song_id in sorted_avg_hsv:
    h, s, v = sorted_avg_hsv[song_id]
    emotion, va_array = sorted_perceived_emotions[song_id]
    val, ar = va_array
    dataset.append([song_id, h, s, v, emotion, val, ar])


with open('../../../moodo/dataset.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['SongID', 'H', 'S', 'V', 'Emotion', 'Valence', 'Arousal'])
    writer.writerows(dataset)