Songs - The participant's mood/color/emotion perception of songs
    SongID - ID (also filename) of song in our dataset
    Perceived - A list of the emotions the participant perceived in the song
        V, A - The position of the chosen perceived emotion in the valence-arousal space [0,1]
    SongColor - The participant's choice of the best-matching color for a song in the HSV space (H - hue [0-1], S - saturation [0-1], V - value [0-1])


convert mp4 audio to a delta cepstrum feature vector?

input: dcf vector --> array of vectors

test/output: perceivedVA, songcolor --> sort by song_id (rows), columns will be HSV color values

In [1]:
from bs4 import BeautifulSoup
import os
import librosa
import numpy as np
from collections import defaultdict
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
def extract_features(file_path, sr=22050, duration=5.0, hop_size=2.5):
    """
    Extracts delta cepstrum features using a sliding window approach.

    The audio is cut into fixed-length windows of ``duration`` seconds whose
    start points are ``hop_size`` seconds apart. For every window 13 MFCCs are
    computed and passed through ``librosa.feature.delta``. A trailing
    remainder that does not fill a whole window is dropped.

    A non-empty clip that is shorter than one window is zero-padded to exactly
    one window, so it still yields a single segment instead of a shapeless
    empty array.

    :param file_path: Path to the audio file.
    :param sr: Target sampling rate handed to ``librosa.load``; ``None`` keeps
        the file's native rate (the rate actually returned by the loader is
        the one used for all window arithmetic).
    :param duration: Fixed length of each segment (seconds).
    :param hop_size: Step between the starts of consecutive segments (seconds).
    :return: ``np.ndarray`` stacking one delta-MFCC matrix per segment, i.e.
        shape ``(n_segments, 13, n_frames)``. Empty audio yields an empty array.
    :raises ValueError: If ``duration`` or ``hop_size`` is shorter than one sample.
    """
    # Use the rate reported by the loader so that sr=None works as well.
    y, sr = librosa.load(file_path, sr=sr)
    segment_length = int(sr * duration)
    hop_length = int(sr * hop_size)

    if segment_length < 1 or hop_length < 1:
        raise ValueError(
            "duration and hop_size must each span at least one sample "
            f"(got duration={duration}, hop_size={hop_size}, sr={sr})"
        )

    # Short clip: pad with silence up to one full window rather than dropping it.
    if 0 < len(y) < segment_length:
        y = np.pad(y, (0, segment_length - len(y)), mode='constant')

    features = []

    # Iterate over the audio in sliding windows
    for start in range(0, len(y) - segment_length + 1, hop_length):
        y_segment = y[start:start + segment_length]

        mfcc = librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=13)  # mel-frequency cepstrum features
        features.append(librosa.feature.delta(mfcc))  # delta cepstrum features

    return np.array(features)

def average_hsv(hsv_colors, hue_max=360.0):
    """
    Averages a set of HSV colors, treating hue as a circular quantity.

    Hue is averaged on the unit circle (mean of sines / cosines followed by
    ``arctan2``) so that values on either side of the wrap-around point
    average to the wrap-around point instead of the middle of the range.
    Saturation and value are plain arithmetic means.

    :param hsv_colors: Array-like of shape (n, 3) with columns H, S, V; a
        single ``[h, s, v]`` triple is accepted as well.
    :param hue_max: Length of one full hue turn in the units of the H column.
        The default of 360 keeps the original degrees behaviour; pass ``1.0``
        for hues normalised to [0, 1].
    :return: Tuple ``(avg_h, avg_s, avg_v)`` with ``avg_h`` in ``[0, hue_max)``.
    """
    hsv = np.atleast_2d(np.asarray(hsv_colors, dtype=float))

    # Map hue onto angles in radians for the circular mean.
    to_radians = 2.0 * np.pi / hue_max
    angles = hsv[:, 0] * to_radians

    avg_h_cos = np.mean(np.cos(angles))
    avg_h_sin = np.mean(np.sin(angles))

    # Back to hue units, folded into [0, hue_max).
    avg_h = (np.arctan2(avg_h_sin, avg_h_cos) / to_radians) % hue_max
    avg_s = np.mean(hsv[:, 1])
    avg_v = np.mean(hsv[:, 2])

    return avg_h, avg_s, avg_v

In [3]:
# Compute the sliding-window delta-cepstrum features of every audio file and
# cache them as one .npy file per song, named after the song id.
audio_dir = '../../../moodo/audio'
features_out_dir = '../../../moodo/audio_features2'

# Create the output folder once, not on every loop iteration.
os.makedirs(features_out_dir, exist_ok=True)

# List the folder a single time, in sorted order so reruns process the songs
# identically, and skip hidden entries / sub-directories, which are not audio.
audio_files = sorted(
    name for name in os.listdir(audio_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(audio_dir, name))
)
n_songs = len(audio_files)

for i, file in enumerate(audio_files, 1):

    # The part of the file name before the first dot is used as the song id.
    song_id = file.split('.')[0]
    file_path = os.path.join(audio_dir, file)

    features = extract_features(file_path)

    np.save(f'{features_out_dir}/{song_id}.npy', features)
    # '\r' keeps the progress counter on a single console line.
    print(f'Saved {i}/{n_songs} songs           ', end='\r')

Saved 200/200 songs           

In [4]:
# Parse the survey XML into a flat table with one row per perceived emotion of
# each song response: the response's HSV colour, the emotion's valence/arousal
# position and the emotion name.
with open('../../../moodo/dataset.xml', 'r') as file:
    soup = BeautifulSoup(file, 'xml')

# Not populated in this cell; the names are kept defined as before.
hsv_values = defaultdict(list)
perceived_emotions = defaultdict(list)

emotions = []  # distinct emotion tag names, in order of first appearance
rows = []

for item in soup.find_all('item'):
    for songs_survey_list in item.find_all('Songs'):
        for song in songs_survey_list.find_all('item'):
            song_id = song.find('SongID').text
            song_color = song.find('SongColor')  # participant's best match to a color in HSV

            h = float(song_color.find('H').text)
            s = float(song_color.find('S').text)
            v = float(song_color.find('V').text)

            # Own name for the tag so the `perceived_emotions` dict above is
            # not overwritten by a Tag object.
            perceived_tag = song.find('Perceived')  # participant's perceived emotions
            # True: all tags inside 'Perceived' (e.g. 'Happy', 'Sad', 'Angry', etc.),
            # which also visits the nested 'V' / 'A' tags themselves.
            for emotion in perceived_tag.find_all(True):
                emotion_name = emotion.name

                valence = emotion.find('V')
                arousal = emotion.find('A')

                if emotion_name not in emotions and emotion_name not in ['V', 'A']:
                    emotions.append(emotion_name)

                # Only tags that carry both coordinates become rows; this is
                # what filters out the bare 'V' / 'A' tags visited above.
                if valence is not None and arousal is not None:
                    rows.append({
                        'SongID': song_id,
                        'H': h,
                        'S': s,
                        'V': v,
                        'Val': float(valence.text),
                        'Ar': float(arousal.text),
                        'Emotion': emotion_name
                        })

df = pd.DataFrame(rows)

# Per-song summary. sort=False keeps the songs in order of first appearance,
# and grouping once avoids re-filtering the whole frame for every song.
for song_id, song in df.groupby('SongID', sort=False):
    emotions_listed = song['Emotion'].unique()
    # NOTE(review): this is the number of rows (one per perceived emotion) for
    # the song, which can exceed the number of distinct participants.
    n_participants = len(song)

    print(f'\nSongID {song_id} | {n_participants} participants')
    print(f'Emotions: {emotions_listed}')


SongID 101 | 50 participants
Emotions: ['Liveliness' 'Anger' 'Fear' 'Anticipation' 'Surprise' 'Longing'
 'Gloominess' 'Inspiring' 'Sadness']

SongID 153 | 83 participants
Emotions: ['Sadness' 'Relaxed' 'Calmness' 'Liveliness' 'Inspiring' 'Longing'
 'Dreamy' 'Joy' 'Happiness' 'Anticipation' 'Gloominess' 'Fear' 'Surprise'
 'Anger']

SongID 209 | 36 participants
Emotions: ['Dreamy' 'Gloominess' 'Calmness' 'Relaxed' 'Sadness' 'Happiness'
 'Longing']

SongID 242 | 74 participants
Emotions: ['Surprise' 'Calmness' 'Anticipation' 'Joy' 'Longing' 'Liveliness' 'Anger'
 'Fear' 'Dreamy' 'Inspiring' 'Relaxed' 'Gloominess']

SongID 504 | 63 participants
Emotions: ['Liveliness' 'Fear' 'Inspiring' 'Gloominess' 'Anticipation' 'Sadness'
 'Longing' 'Anger' 'Dreamy' 'Joy' 'Relaxed' 'Happiness']

SongID 513 | 94 participants
Emotions: ['Gloominess' 'Happiness' 'Longing' 'Relaxed' 'Dreamy' 'Sadness' 'Joy'
 'Liveliness' 'Surprise' 'Anticipation' 'Anger' 'Fear' 'Calmness'
 'Inspiring']

SongID 525 | 95 parti

In [5]:
# Turn the emotion names into integer class ids and keep the name -> id lookup.
encoder = LabelEncoder().fit(emotions)
df["EmotionEncoded"] = encoder.transform(df["Emotion"])

class_codes = encoder.transform(encoder.classes_)
emotion_mapping = {name: code for name, code in zip(encoder.classes_, class_codes)}

# Show the mapping and confirm the extra column landed on the frame.
mapping_summary = []
for name, code in emotion_mapping.items():
    mapping_summary.append(f'{name}: {code}')
print(mapping_summary)
print(df.shape, df.columns)

['Anger: 0', 'Anticipation: 1', 'Calmness: 2', 'Dreamy: 3', 'Fear: 4', 'Gloominess: 5', 'Happiness: 6', 'Inspiring: 7', 'Joy: 8', 'Liveliness: 9', 'Longing: 10', 'Relaxed: 11', 'Sadness: 12', 'Surprise: 13']
(15212, 8) Index(['SongID', 'H', 'S', 'V', 'Val', 'Ar', 'Emotion', 'EmotionEncoded'], dtype='object')


In [None]:
feature_dir = "../../../moodo/audio_features2"

# Build one dataset row per (survey row, audio segment) pair: the flattened
# segment features followed by the survey row's values.
#
# Songs are visited in the order they first appear in `df`, not in os.listdir
# order (which is arbitrary), so the row blocks line up with the per-song
# index ranges that are derived from df['SongID'].unique() and written to
# indices.txt.
n_song_ids = df['SongID'].nunique()

update_rows = []
for i, (song_id, songs) in enumerate(df.groupby('SongID', sort=False)):
    print(f'{i}/{n_song_ids}', end='\r')

    file_path = os.path.join(feature_dir, f'{song_id}.npy')
    if not os.path.exists(file_path):
        # No cached features for this song: it contributes no rows.
        continue

    # shape: (n_segments, 13, 216)
    feature_matrix = np.load(file_path)
    n_segment = feature_matrix.shape[0]
    if n_segment == 0:
        # No segments for this song: nothing to pair the survey rows with.
        continue

    # Flatten every segment once per song instead of once per survey row.
    flat_segments = feature_matrix.reshape(n_segment, -1)

    # for every row with the same song_id, duplicate the row n_segment times and add each segment's features
    for _, song in songs.iterrows():
        targets = song.values
        for flat_segment in flat_segments:
            update_rows.append(np.hstack([flat_segment, targets]))

dataset = np.array(update_rows)
print(dataset.shape) # 13x216 = 2808   + 8 (targets) = 2816

(76034, 2816)


In [11]:
# double checking dimensions make sense
# For every song (in order of first appearance in `df`) the expected number of
# dataset rows is (#survey rows of the song) x (#audio segments of the song).
# The running offset `x` turns these counts into half-open [start, end) row
# ranges, which are printed and stored as `song_id,n_rows,start,end` lines.
total_rows = 0
x = 0
with open('../../../moodo/indices.txt', 'w') as file:
    # Group once instead of boolean-filtering the whole frame for every song.
    for song_id, song in df.groupby('SongID', sort=False):
        n_participants = len(song)  # survey rows recorded for this song

        file_path = os.path.join(feature_dir, f'{song_id}.npy')

        # shape: (n_segments, 13, 216)
        feature_matrix = np.load(file_path)
        n_segment = feature_matrix.shape[0]

        n_rows = n_participants*n_segment
        total_rows += n_rows
        print(f'SongID {song_id} | {n_rows} rows | indices: {x}:{x+n_rows}')
        file.write(f'{song_id},{n_rows},{x},{x+n_rows}\n')
        x += n_rows

print(f'Total rows: {total_rows}')

SongID 101 | 200 rows | indices: 0:200
SongID 153 | 415 rows | indices: 200:615
SongID 209 | 180 rows | indices: 615:795
SongID 242 | 592 rows | indices: 795:1387
SongID 504 | 315 rows | indices: 1387:1702
SongID 513 | 470 rows | indices: 1702:2172
SongID 525 | 475 rows | indices: 2172:2647
SongID 551 | 450 rows | indices: 2647:3097
SongID 607 | 240 rows | indices: 3097:3337
SongID 618 | 230 rows | indices: 3337:3567
SongID 104 | 440 rows | indices: 3567:4007
SongID 172 | 336 rows | indices: 4007:4343
SongID 204 | 685 rows | indices: 4343:5028
SongID 367 | 190 rows | indices: 5028:5218
SongID 503 | 365 rows | indices: 5218:5583
SongID 555 | 255 rows | indices: 5583:5838
SongID 565 | 390 rows | indices: 5838:6228
SongID 109 | 464 rows | indices: 6228:6692
SongID 155 | 296 rows | indices: 6692:6988
SongID 156 | 456 rows | indices: 6988:7444
SongID 181 | 192 rows | indices: 7444:7636
SongID 190 | 340 rows | indices: 7636:7976
SongID 196 | 350 rows | indices: 7976:8326
SongID 404 | 225 row

In [None]:
# write indices to a text file
# One `song_id n_rows` line per song, where n_rows is
# (#survey rows of the song) x (#audio segments of the song).
#
# NOTE(review): this writes to the same indices.txt as the dimension-check
# cell above, but in a different format (space-separated, without the
# start/end offsets), so running both cells in order leaves only this
# version on disk. Confirm which format downstream code expects.
with open('../../../moodo/indices.txt', 'w') as file:
    # Group once instead of boolean-filtering the whole frame for every song.
    for song_id, song in df.groupby('SongID', sort=False):
        n_participants = len(song)  # survey rows recorded for this song

        file_path = os.path.join(feature_dir, f'{song_id}.npy')

        # shape: (n_segments, 13, 216)
        feature_matrix = np.load(file_path)
        n_segment = feature_matrix.shape[0]

        n_rows = n_participants*n_segment
        file.write(f'{song_id} {n_rows}\n')

In [9]:
# Dump the assembled dataset to a header-less CSV, one dataset row per line.
# columns: 0:2808 -> features, 2808:2816 -> targets
with open('../../../moodo/dataset2.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    for record in dataset:
        csv_writer.writerow(record)