In [None]:
# Install the `datasets` package, which the "Approach 2" cell imports `Dataset` from.
!pip install datasets

In [None]:
# Fetch the jamendolyrics repository; its mp3/ and annotations/lines/ folders are
# read below via mp3_path / csv_path.
# NOTE(review): those paths assume the clone lands in /content (Colab default) — confirm.
!git clone https://github.com/f90/jamendolyrics.git

In [None]:
import glob
import pandas as pd
import csv
import librosa

# ---- Configuration -----------------------------------------------------------
SR = 44100  # sample rate used when loading and slicing audio
mp3_path = '/content/jamendolyrics/mp3/'
csv_path = '/content/jamendolyrics/annotations/lines/'
file_names = sorted(glob.glob(f'{csv_path}*.csv'))  # one annotation CSV per song

gather_n = 4  # Combine n data samples into one data

# ---- Shared containers filled by later cells ---------------------------------
# The four lists below are parallel: entry k of each describes sample k.
start_time = []
end_time = []
lyrics = []
audio_types = []  # index of the source song within file_names
# Audio arrays are loaded in advance (original note: "to use less RAM").
# key: 0~78, value: np.array
audio_dict = {}

# Number of rows pandas reads from each CSV (header excluded), in file_names order.
len_files = [len(pd.read_csv(annotation_csv)) for annotation_csv in file_names]

def preprocess_dataset():
    """Parse every annotation CSV and fill the module-level sample lists.

    For each CSV in ``file_names`` the header row is skipped and the remaining
    rows (start time, end time, lyric line) are merged in consecutive groups of
    ``gather_n`` rows.  Every group contributes one entry to:

    * ``start_time``  - start time of the group's first row (float)
    * ``end_time``    - end time of the group's last row (float)
    * ``lyrics``      - the group's lyric lines joined with single spaces
    * ``audio_types`` - index of the CSV within ``file_names``

    A trailing group with fewer than ``gather_n`` rows is kept.  Groups are cut
    from the rows actually read, so this does not depend on a separately
    computed row count.

    The four output lists are emptied first, so calling this again (e.g. when
    the cell is re-run) rebuilds them instead of duplicating samples.
    """
    # Clear in place: other cells hold references to these same list objects.
    for out_list in (start_time, end_time, lyrics, audio_types):
        out_list.clear()

    for num, filename in enumerate(file_names):
        audio_name = filename.split('/')[-1][:-4] + '.mp3'
        print(f'{num}th csv file (filename: {audio_name})')

        # newline='' is the mode the csv module documents for files given to csv.reader.
        with open(filename, "r", newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row (no-op on an empty file)
            rows = [row for row in reader if row]  # ignore blank lines

        # Walk the rows in strides of gather_n; the last slice may be shorter.
        for first in range(0, len(rows), gather_n):
            group = rows[first:first + gather_n]
            start_time.append(float(group[0][0]))
            end_time.append(float(group[-1][1]))
            lyrics.append(' '.join(row[2] for row in group))
            audio_types.append(num)

In [None]:
# Fill start_time / end_time / lyrics / audio_types from the annotation CSVs.
preprocess_dataset()

In [None]:
# Decode every song once up front so that trimming later only slices arrays.
# audio_dict is keyed by the song's position in file_names.
for i, filename in enumerate(file_names):
    csv_basename = filename.rsplit('/', 1)[-1]
    audio_name = csv_basename[:-4] + '.mp3'  # swap the ".csv" suffix for ".mp3"
    waveform, _ = librosa.load(mp3_path + audio_name, sr=SR)
    audio_dict[i] = waveform

# Collects the trimmed clips produced by trim_audio in a later cell.
trimmed_audios = list()
def trim_audio(audio_name, st, et):
    """Return the slice of a preloaded song between two timestamps.

    Args:
        audio_name: key into ``audio_dict`` identifying the song.
        st: start time; multiplied by ``SR`` and truncated to a sample index.
        et: end time; multiplied by ``SR`` and truncated to a sample index.

    Returns:
        ``audio_dict[audio_name]`` sliced from the start index up to and
        including the end index.
    """
    first_sample = int(SR * st)
    last_sample = int(SR * et)
    # +1 so the sample at the end index itself is part of the clip.
    return audio_dict[audio_name][first_sample:last_sample + 1]

In [None]:
# Cut one clip per (song index, start, end) triple.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell yields the same clips instead of duplicating them.
trimmed_audios = [
    trim_audio(audio_type, st, et)
    for audio_type, st, et in zip(audio_types, start_time, end_time)
]

In [None]:
# Pytorch Dataset
from torch.utils.data import Dataset

class PairedDataset(Dataset):
    """Map-style dataset that pairs each trimmed audio clip with its lyric text.

    Item ``k`` is ``{'audio': trimmed_audios[k], 'text': lyrics[k]}``.
    """

    def __init__(self, trimmed_audios, lyrics):
        # Both sequences are stored by reference (no copy is made).
        self.trimmed_audios, self.lyrics = trimmed_audios, lyrics

    def __len__(self):
        # The number of samples is taken from the lyric list.
        n_samples = len(self.lyrics)
        return n_samples

    def __getitem__(self, index):
        clip = self.trimmed_audios[index]
        line = self.lyrics[index]
        return dict(audio=clip, text=line)

In [None]:
# Wrap the parallel clip / lyric lists in a single indexable dataset.
paired_dataset = PairedDataset(trimmed_audios=trimmed_audios, lyrics=lyrics)

### Approach 1. Use the PyTorch `Dataset` format (not yet verified whether the Hugging Face `Trainer` accepts this format)

In [None]:
### Approach 1. Use the PyTorch Dataset format (not yet verified whether the Hugging Face Trainer accepts this format)

import torch
from torch.utils.data.dataset import random_split
len_full_dataset = len(paired_dataset)

# train samples
train_p = 0.8
len_train = int(len_full_dataset * train_p)

# valid samples (valid_p is now actually used instead of a hard-coded 0.1)
valid_p = 0.1
len_valid = int(len_full_dataset * valid_p)

# Seeded generator so the train/valid/test membership is reproducible across runs.
split_generator = torch.Generator().manual_seed(42)

# First carve off the training set; everything else is held out ...
train_dataset, holdout_dataset = random_split(
    paired_dataset,
    [len_train, len_full_dataset - len_train],
    generator=split_generator,
)
# ... then divide the hold-out into validation and test (test takes the remainder).
valid_dataset, test_dataset = random_split(
    holdout_dataset,
    [len_valid, len(holdout_dataset) - len_valid],
    generator=split_generator,
)

In [None]:
# Check if the audio aligns with text well
import IPython.display as ipd
idx = 20
print(train_dataset[idx]['text'])
# Play back at SR, the rate the audio was loaded with, instead of repeating the literal.
ipd.Audio(train_dataset[idx]['audio'], rate=SR)

### Approach 2. Convert to Hugging Face's `Dataset` format (requires a lot of RAM)

In [None]:
### Approach 2. Convert to Hugging Face's Dataset format (requires a lot of RAM)
# DatasetDict must be imported as well; it is used below.
# NOTE: this rebinds the name `Dataset`, which earlier referred to
# torch.utils.data.Dataset (the base class of PairedDataset). Re-running the
# PairedDataset cell after this one would pick up the Hugging Face class instead.
from datasets import Dataset, DatasetDict
dset = Dataset.from_list(paired_dataset)

# 80% train / 20% held out
jamendo_dataset = dset.train_test_split(test_size=0.2)
# Name kept for compatibility; the training data is jamendo_dataset['train'].
# (Previously the 20% hold-out was split again here and its 'train' part was
# used as the training set, which discarded the real 80% training split.)
train_trainvalid_p = jamendo_dataset
# Split the 20% hold-out in half: 10% test, 10% valid
test_valid_p = jamendo_dataset['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
paired_ds = DatasetDict({
    'train': jamendo_dataset['train'],
    'test': test_valid_p['test'],
    'valid': test_valid_p['train']})