In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from python_speech_features import logfbank, mfcc

import scipy.io.wavfile
import sys
import os
import librosa


In [3]:
## Root of the dataset, relative to this notebook.
## Every path below is built by plain concatenation, so each piece keeps its trailing '/'.
dataset_path = '../dataset/'

## Annotation tables: averaged dynamic annotations are used for both arousal and valence.
annotations_path = 'annotations/'
rating_mode = 'averaged_per_song/'  # alternatives: per_each_rater / averaged_per_song
time_continuity_mode = 'dynamic/'   # alternatives: song_level / dynamic
full_data_path = ''.join([dataset_path, annotations_path, rating_mode, time_continuity_mode])

## Audio recordings.
audio_path = 'MEMD_audio/'
full_audio_path = ''.join([dataset_path, audio_path])

## Destination of the generated csv files.
csv_write_path = ''.join([dataset_path, 'emotion_by_song/'])

In [20]:
# Directory (relative to this notebook) that receives the per-song 'song_<id>.npz'
# feature files written by the extraction loop; the trailing '/' is required because
# file names are appended by plain string concatenation.
feature_path = '../features/'

In [4]:
# Arousal annotations: read the table and discard its 'sample_626500ms' column in one
# chained expression, so re-running the cell always starts again from the file on disk.
arousal_dataframe = (
    pd.read_csv(full_data_path + 'arousal' + '.csv')
      .drop(columns='sample_626500ms')
)
# Show the remaining column labels ('song_id' followed by the 'sample_<t>ms' columns).
arousal_dataframe.columns

Index(['song_id', 'sample_15000ms', 'sample_15500ms', 'sample_16000ms',
       'sample_16500ms', 'sample_17000ms', 'sample_17500ms', 'sample_18000ms',
       'sample_18500ms', 'sample_19000ms',
       ...
       'sample_621500ms', 'sample_622000ms', 'sample_622500ms',
       'sample_623000ms', 'sample_623500ms', 'sample_624000ms',
       'sample_624500ms', 'sample_625000ms', 'sample_625500ms',
       'sample_626000ms'],
      dtype='object', length=1224)

In [5]:
# Valence annotations: read the table as-is.
valence_dataframe = pd.read_csv(''.join((full_data_path, 'valence', '.csv')))
# Show the column labels ('song_id' followed by the 'sample_<t>ms' columns).
valence_dataframe.columns

Index(['song_id', 'sample_15000ms', 'sample_15500ms', 'sample_16000ms',
       'sample_16500ms', 'sample_17000ms', 'sample_17500ms', 'sample_18000ms',
       'sample_18500ms', 'sample_19000ms',
       ...
       'sample_621500ms', 'sample_622000ms', 'sample_622500ms',
       'sample_623000ms', 'sample_623500ms', 'sample_624000ms',
       'sample_624500ms', 'sample_625000ms', 'sample_625500ms',
       'sample_626000ms'],
      dtype='object', length=1224)

In [59]:
def _song_annotation_values(dataframe, song_id):
    """Return the flattened annotation values stored for ``song_id`` in ``dataframe``.

    Raises:
        ValueError: if no row of ``dataframe`` has this ``song_id``.
    """
    song_rows = dataframe.loc[dataframe['song_id'] == song_id]
    if song_rows.empty:
        raise ValueError('song_id {!r} not found in the annotation table'.format(song_id))
    return np.ravel(song_rows.drop(columns='song_id').values)


def plot_emotional_trajectory(song_id):
    """Plot the annotated arousal of a song against its annotated valence.

    The values are read from the module-level ``valence_dataframe`` and
    ``arousal_dataframe``; valence goes on the x axis and arousal on the y axis,
    consecutive annotation samples being joined by a line.

    Args:
        song_id: value looked up in the ``song_id`` column of both tables.

    Raises:
        ValueError: if ``song_id`` is missing from either table (an empty
            figure would otherwise be drawn silently).
    """
    valence_id = _song_annotation_values(valence_dataframe, song_id)
    arousal_id = _song_annotation_values(arousal_dataframe, song_id)

    fig, ax = plt.subplots()
    ax.plot(valence_id, arousal_id)
    ax.set_xlabel('valence')
    ax.set_ylabel('arousal')
    ax.set_title('Emotional trajectory of song {}'.format(song_id))
    plt.show()

def compute_mfb_coef(sig, sample_rate):
    """Return the ``logfbank`` features of ``sig`` (log Mel-filterbank energies).

    Args:
        sig: audio samples handed to ``logfbank`` as its ``signal``.
        sample_rate: sampling rate of ``sig``, passed as ``samplerate``.

    Returns:
        Whatever ``logfbank`` returns for the fixed settings below; the caller in
        this notebook stores it as a (frames, filters) matrix per chunk.
    """
    # Fixed analysis settings: window length / step (0.020 / 0.010), 40 filters,
    # 512-point FFT, full band (lowfreq=0, highfreq=None) and 0.97 pre-emphasis.
    filterbank_settings = {
        'winlen': 0.020,
        'winstep': 0.010,
        'nfilt': 40,
        'nfft': 512,
        'lowfreq': 0,
        'highfreq': None,
        'preemph': 0.97,
    }
    return logfbank(signal=sig, samplerate=sample_rate, **filterbank_settings)

def slice_signal_by_time_steps(sig, sample_rate, time_step):
    """Cut a 1-D signal into consecutive, non-overlapping chunks of equal duration.

    Args:
        sig: 1-D array of samples.
        sample_rate: number of samples per unit of time.
        time_step: chunk duration, in the time unit of ``sample_rate``
            (seconds when ``sample_rate`` is in Hz).

    Returns:
        Array of shape ``(n_chunks, samples_per_chunk)`` where
        ``samples_per_chunk = int(time_step * sample_rate)``. Trailing samples
        that do not fill a whole chunk are dropped.

    Raises:
        ValueError: if ``time_step * sample_rate`` is smaller than one sample
            (this used to surface as an opaque ZeroDivisionError or reshape error).
    """
    index_step = int(time_step*sample_rate)
    if index_step < 1:
        raise ValueError(
            'time_step * sample_rate must be at least one sample, got {}'.format(
                time_step*sample_rate))
    n_time_steps = sig.shape[0]//index_step
    # Keep only the samples that fill complete chunks before reshaping.
    return sig[:n_time_steps*index_step].reshape((n_time_steps, index_step))

def audio_from_song_id(song_id):
    """Load the recording '<full_audio_path><song_id>.mp3' with ``librosa.load``.

    No loading options are passed, so librosa's defaults apply.

    Returns:
        The value returned by ``librosa.load``; callers in this notebook unpack
        it as ``(samples, sampling_rate)``.
    """
    mp3_location = ''.join((full_audio_path, str(song_id), '.mp3'))
    return librosa.load(mp3_location)

In [None]:
# `tqdm_notebook` is deprecated; `tqdm.auto` picks the notebook widget when available.
from tqdm.auto import tqdm

# np.savez does not create missing directories, so make sure the target exists.
os.makedirs(feature_path, exist_ok=True)

# Layout of the features computed for every 500 ms chunk of audio.
n_mfb = 40       # filters per frame (compute_mfb_coef uses nfilt=40)
n_time_win = 49  # frames per chunk; assumes librosa's default sampling rate -- TODO confirm

# Arousal and valence rows are paired by position; never read past the shorter table.
n_songs = min(arousal_dataframe.shape[0], valence_dataframe.shape[0])

for i in tqdm(range(n_songs)):
    arousal_line = arousal_dataframe.iloc[i]
    valence_line = valence_dataframe.iloc[i]
    song_id_ar = int(arousal_line['song_id'])
    song_id_val = int(valence_line['song_id'])
    if song_id_ar != song_id_val:
        # The two rows describe different songs: nothing consistent to export.
        continue

    # Load the corresponding signal
    y, sampling_rate = audio_from_song_id(song_id_ar)
    # Format it so that each row of the resulting matrix corresponds to 500 ms of signal
    formatted_signal = slice_signal_by_time_steps(y, sampling_rate, 500*1e-3)
    # Remove the first 30 rows (15 s): the annotations only start at 'sample_15000ms'
    formatted_signal = formatted_signal[30:, :]
    M = formatted_signal.shape[0]

    mfb_coefficients = np.zeros((M, n_time_win, n_mfb))
    # Dedicated index name: the original reused `i` and shadowed the song index.
    for chunk_index in range(M):
        mfb_coefficients[chunk_index, :, :] = compute_mfb_coef(formatted_signal[chunk_index],
                                                               sampling_rate)

    # Per-song, per-filter mean normalisation, then one flat feature vector per chunk.
    mfb_coefficients -= np.mean(np.mean(mfb_coefficients, axis=0), axis=0)
    mfb_coefficients = mfb_coefficients.reshape(M, n_mfb*n_time_win)

    arousal_line_vector = arousal_line.drop('song_id').values
    valence_line_vector = valence_line.drop('song_id').values
    # The audio may yield more chunks than there are annotation columns (or the
    # reverse): cut everything to the common length so rows stay aligned.
    n_rows = min(M, arousal_line_vector.shape[0], valence_line_vector.shape[0])
    arousal_line_vector = arousal_line_vector[:n_rows]
    valence_line_vector = valence_line_vector[:n_rows]
    emotional_vector = np.stack([arousal_line_vector, valence_line_vector]).T.astype(float)
    mfb_coefficients = mfb_coefficients[:n_rows]

    # Drop chunks whose arousal or valence annotation is missing (NaN), so every
    # saved feature row has a usable label.
    annotated_rows = ~np.isnan(emotional_vector).any(axis=1)
    mfb_coefficients = mfb_coefficients[annotated_rows]
    emotional_vector = emotional_vector[annotated_rows]

    # Save results
    outputfile = feature_path + 'song_' + str(song_id_ar) + '.npz'
    np.savez(outputfile, mfb = mfb_coefficients, emotion = emotional_vector)
        

        

HBox(children=(IntProgress(value=0, max=1802), HTML(value='')))