# Load Dependencies

In [1]:
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import normalize
# from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import keras
import tensorflow
from tensorflow.keras.layers import LSTM, Dense
from scripts import commons

# Pre-processing Data

## Import the dataframes

In [2]:
# Assuming you're in mtg-jamendo-dataset
input_file = "./data/autotagging_moodtheme.tsv"
tracks, tags, extra = commons.read_file(input_file)

tracks

Reading: 18486 tracks, 4506 albums, 1533 artists


{948: {'artist_id': 87,
  'album_id': 149,
  'path': '48/948.mp3',
  'duration': 212.7,
  'tags': ['mood/theme---background'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'background'}},
 950: {'artist_id': 87,
  'album_id': 149,
  'path': '50/950.mp3',
  'duration': 248.0,
  'tags': ['mood/theme---background'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'background'}},
 951: {'artist_id': 87,
  'album_id': 149,
  'path': '51/951.mp3',
  'duration': 199.7,
  'tags': ['mood/theme---background'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'background'}},
 2165: {'artist_id': 326,
  'album_id': 347,
  'path': '65/2165.mp3',
  'duration': 229.0,
  'tags': ['mood/theme---film'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'film'}},
 2263: {'artist_id': 320,
  'album_id': 366,
  'path': '63/2263.mp3',
  'duration': 494.7,
  'tags': ['mood/theme---melancholic'],
  'genre': set(),
  'instrument': set(),
  'mood/theme': {'melancholic'}},

In [3]:
# # Read the TSV file into a DataFrame
# df = pd.read_csv('./data/autotagging_moodtheme.tsv', sep = '\t', on_bad_lines = 'warn') # This loses about half the dataset which contains multiple tags

# # Filter rows where "PATH" starts with "00/"
# df = df[df['PATH'].str.startswith('00/')]

# # Filter rows where "TAGS" contain only the specified mood/theme tags
# allowed_tags = {'energetic', 'relaxing', 'emotional', 'dark', 'love', 'sad'}  # Use a set for faster membership check
# df['TAGS'] = df['TAGS'].apply(lambda tags: set(tags.split('---')))
# df = df[df['TAGS'].apply(lambda tags: any(tag in allowed_tags for tag in tags))]

# # Display the filtered DataFrame
# print(df)

In [4]:
# # Define the folder containing the audio files
# audio_folder = './audio_data'

# def get_features_and_labels(data):
#     features = []  # List to save features
#     labels = []  # List to save labels
#     for track_id, track_data in data.items():
#         # Check if the path starts with "00/"
#         if track_data['path'].startswith('00/'):
#             # Check if "mood/theme" contains only the specified words
#             if (tag in track_data['mood/theme'] for tag in ['energetic', 'relaxing', 'emotional', 'dark', 'love', 'sad']):
#                 # Load the audio file
#                 filename = track_data['path']
#                 updated_filename = filename.rsplit('.mp3', 1)[0] + '.low.mp3'
#                 filepath = os.path.join(audio_folder, updated_filename)
#                 y, sr = librosa.load(filepath, sr=28000)
#                 # Extract features
#                 # You can modify this part according to your feature extraction needs
#                 # For example, you can extract MFCCs as follows:
#                 mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=128)
#                 # Append features and labels
#                 features.append(mfccs)
#                 labels.append(track_id)  # Use track_id as label
#     # Convert lists to numpy arrays
#     features_array = np.array(features)
#     labels_array = np.array(labels)
#     return features_array, labels_array

# # Get features and labels
# X, y = get_features_and_labels(tracks)

# # Now X contains the features extracted from the audio files
# # and y contains the corresponding track IDs as labels

## Extract features and labels

In [5]:
import os
import librosa
import numpy as np

audio_folder = './audio_data'

def get_features_and_labels(data, max_len = None):
    features = []
    labels = []
    max_len = 0
    # audio_files = ['00/', '01/', '02/']

    for track_id, track_data in data.items():
        if track_data['path'].startswith('00/') or track_data['path'].startswith('01/') or track_data['path'].startswith('02/'):
            # if (tag in track_data['mood/theme'] for tag in ['energetic', 'relaxing', 'emotional', 'dark', 'love', 'sad']):
            if all(tag in ['energetic', 'relaxing', 'emotional', 'dark', 'love', 'sad'] for tag in track_data['mood/theme']):
                try:
                    # Load the audio file
                    filename = track_data['path']
                    updated_filename = filename.rsplit('.mp3', 1)[0] + '.low.mp3'
                    filepath = os.path.join(audio_folder, updated_filename)
                    y, sr = librosa.load(filepath, sr = 10000)

                    # Extract MFCCs
                    mfccs = librosa.feature.mfcc(y = y, sr = sr, n_mfcc = 30)
                    # print(f"The max length now is {max_len}; the mfccs.shape is {mfccs.shape}")

                    # Update max_len if necessary
                    max_len = max(max_len, mfccs.shape[1])

                    # Append MFCCs and track_id to lists
                    # features.append(mfccs.T)  # Transpose MFCCs array
                    features.append(mfccs)
                    labels.append(track_data['mood/theme'])
                except Exception as e:
                    print(f"Error processing track {track_id}: {e}")

    # Pad or truncate MFCCs arrays to ensure they all have the same length
    padded_features = []
    for mfccs in features:
        padded_mfccs = np.pad(mfccs, ((0, 0), (0, max_len - mfccs.shape[1])), mode = 'constant')
        padded_features.append(padded_mfccs)

    # Convert lists to numpy arrays
    features_array = np.array(padded_features)

    return features_array, labels

# Assuming 'tracks' is the dictionary containing track data
X, y = get_features_and_labels(tracks)


In [6]:
print("Features shape:", X.shape)
print(X)
print("Labels:", y)
print("Labels Length:", len(y))

Features shape: (83, 30, 21229)
[[[-6.89938599e+02 -6.88272034e+02 -6.88081238e+02 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 1.31159830e+00  3.45308399e+00  3.63504934e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 1.31091261e+00  2.88282299e+00  2.83167577e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 1.15434670e+00  7.76952744e-01 -9.09457326e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 1.14317143e+00  1.30873930e+00 -1.48938209e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 1.13169348e+00  1.94296145e+00  7.55229592e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]]

 [[-3.07097534e+02 -2.49472519e+02 -2.86456879e+02 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 5.79071083e+01  6.49621429e+01  6.74940643e+01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [-6.10837326e+01 -5.58186188e+01 -4.82387085e+01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+0

In [7]:
print("Features shape:", X[0].shape)
print(X[0])
print("Labels:", y[0])

Features shape: (30, 21229)
[[-6.8993860e+02 -6.8827203e+02 -6.8808124e+02 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 1.3115983e+00  3.4530840e+00  3.6350493e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 1.3109126e+00  2.8828230e+00  2.8316758e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 ...
 [ 1.1543467e+00  7.7695274e-01 -9.0945733e-01 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 1.1431714e+00  1.3087393e+00 -1.4893821e-01 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 1.1316935e+00  1.9429615e+00  7.5522959e-01 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]]
Labels: {'dark'}


## Represent the labels numerically (encoding)

In [8]:
# # Convert the labels to numerical labels using label encoding
# label_encoder = LabelEncoder()
# encoded_labels = [label_encoder.fit_transform(list(label)) for label in y]
# print(encoded_labels)

# # Pad or truncate the encoded labels to ensure they have the same length
# max_label_length = max(len(label) for label in encoded_labels)
# padded_labels = [np.pad(label, (0, max_label_length - len(label)), mode = 'constant') for label in encoded_labels]
# print(padded_labels)

# # Convert the list of padded labels into a numpy array
# labels_array = np.array(padded_labels)

# print(labels_array)

In [9]:
# Create a MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit the MultiLabelBinarizer and transform labels into binary arrays
labels_array = mlb.fit_transform(y)

print(labels_array)

[[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 1 1 0 0 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 0 0 1 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 0 1 0 0]
 [0 1 0 1 0 0]
 [0 1 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]
 [0 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 1 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 0 0 0 1]
 [0 1 0 0 0 0]
 [0 0 0 1 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 1 0 0]
 [0 1 0 0 0 1]
 [0 0 0 0 1 0]
 [0 0 0 1 0 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 1]
 [0 0 1 0 0 1]
 [0 0 0 0 1 0]
 [1 0 0 0 0 0]
 [0 0 0 0 0 1]
 [0 0 0 1 0 0]
 [1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 1 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 0 1]
 [0 1 0 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 0 1]
 [0 0 1 0 

## To decode the labels

In [10]:
# Inverse transform binary arrays back into sets of labels
decoded_labels = mlb.inverse_transform(labels_array)

print(decoded_labels)

[('dark',), ('dark',), ('relaxing',), ('relaxing',), ('emotional', 'energetic'), ('emotional',), ('emotional',), ('relaxing',), ('relaxing',), ('emotional',), ('emotional',), ('emotional',), ('dark',), ('dark',), ('dark',), ('relaxing',), ('energetic',), ('energetic',), ('dark',), ('dark',), ('dark',), ('dark',), ('love',), ('emotional', 'love'), ('emotional',), ('energetic',), ('energetic',), ('sad',), ('sad',), ('sad',), ('energetic',), ('relaxing',), ('relaxing',), ('energetic',), ('energetic',), ('relaxing',), ('emotional',), ('dark',), ('dark',), ('sad',), ('emotional',), ('love',), ('energetic',), ('energetic',), ('love',), ('love',), ('emotional', 'sad'), ('relaxing',), ('love',), ('emotional',), ('sad',), ('energetic', 'sad'), ('relaxing',), ('dark',), ('sad',), ('love',), ('dark',), ('energetic',), ('energetic',), ('energetic',), ('emotional',), ('energetic',), ('sad',), ('emotional',), ('love',), ('sad',), ('energetic',), ('relaxing',), ('love',), ('sad',), ('dark', 'sad'), (

## Normalise the features

In [11]:
X_normalised = np.array((X-np.min(X))/(np.max(X)-np.min(X)))
X_normalised = X_normalised/np.std(X_normalised)

In [12]:
X_normalised

array([[[ 0.        ,  0.09214583,  0.1026951 , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        [38.219826  , 38.33823   , 38.348297  , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        [38.219788  , 38.3067    , 38.30387   , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        ...,
        [38.211132  , 38.190266  , 38.097023  , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        [38.210514  , 38.21967   , 38.139076  , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        [38.209885  , 38.254734  , 38.189068  , ..., 38.14731   ,
         38.14731   , 38.14731   ]],

       [[21.167616  , 24.353754  , 22.308857  , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        [41.34904   , 41.73912   , 41.879116  , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        [34.769936  , 35.06105   , 35.48015   , ..., 38.14731   ,
         38.14731   , 38.14731   ],
        ...,
        [38.333996  , 38.197433  , 37.933292  , ..., 3

## Extract training, test and validation datasets

In [13]:
# Split twice to get the validation set
X_train, X_test, y_train, y_test = train_test_split(X_normalised, y, test_size = 0.25, random_state = 123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 123)

# Print the shapes
X_train.shape, X_test.shape, X_val.shape, len(y_train), len(y_test), len(y_val)

((46, 30, 21229), (21, 30, 21229), (16, 30, 21229), 46, 21, 16)

In [14]:
print(np.__version__)

1.26.4
