In [1]:
!pip install mutagen
!pip install pydub
!pip install shutil



In [1]:
import os
import json
from pydub import AudioSegment
from pydub import AudioSegment
from mutagen.mp3 import MP3
from mutagen.wave import WAVE
from mutagen.easyid3 import EasyID3
import shutil

In [2]:
def _first_id3_value(tags, frame_id, attr='text'):
    """Return the first value of a raw ID3 frame as a string, or '' if unavailable.

    Args:
        tags: Mapping from ID3 frame ids to frame objects, or None when the
            file carries no tags.
        frame_id: Four-letter ID3 frame id, e.g. 'TIT2'.
        attr: Name of the frame attribute that holds the list of values.
    """
    frame = tags.get(frame_id) if tags is not None else None
    values = getattr(frame, attr, None)
    return str(values[0]) if values else ''


def get_metadata(file_path):
    """Read the title, artist, genre and BPM tags of an audio file.

    Files with a ``.wav`` extension are opened with mutagen's ``WAVE`` reader
    and their raw ID3 frames are used; every other file is opened with the
    ``MP3`` reader through the ``EasyID3`` key names.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        A ``(title, artist, genre, bpm)`` tuple. Missing text tags are ``''``.
        ``bpm`` is an ``int`` when the tag holds a finite number and ``''``
        otherwise. If the file cannot be read at all, the error is printed and
        four empty strings are returned.
    """
    try:
        if os.path.splitext(file_path)[1].lower() == '.wav':
            # A WAV file is not an MPEG stream, so the MP3 parser can never
            # read it; the WAVE reader exposes the embedded ID3 frames.
            tags = WAVE(file_path).tags
            title = _first_id3_value(tags, 'TIT2')
            artist = _first_id3_value(tags, 'TPE1')
            # `genres` holds the decoded genre names, which is also what the
            # EasyID3 'genre' key reports for MP3 files.
            genre = _first_id3_value(tags, 'TCON', attr='genres')
            bpm = _first_id3_value(tags, 'TBPM')
        else:
            audio = MP3(file_path, ID3=EasyID3)
            title = audio.get('title', [''])[0]
            artist = audio.get('artist', [''])[0]
            genre = audio.get('genre', [''])[0]
            bpm = audio.get('bpm', [''])[0]

        try:
            # BPM tags may be written as '128' or '128.0'.
            bpm = int(float(bpm))
        except (ValueError, OverflowError):
            # Empty, non-numeric or non-finite BPM: keep the other tags and
            # report the BPM as unknown.
            bpm = ''

        return title, artist, genre, bpm
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return '', '', '', ''

In [3]:
def copy_dataset(src_dir):
    """Copy the dataset into a sibling directory named ``dataset_input``.

    The copy is created next to ``src_dir`` (in its parent directory). If the
    destination already exists nothing is copied. Copy errors are printed
    rather than raised, so the returned path may not exist after a failure.

    Args:
        src_dir: Path of the original dataset directory; a trailing path
            separator is allowed.

    Returns:
        The path of the ``dataset_input`` directory.
    """
    # Normalise first: os.path.dirname('a/b/') is 'a/b', which would place the
    # copy inside the source directory instead of next to it.
    parent_dir = os.path.dirname(os.path.normpath(src_dir))
    dest_dir = os.path.join(parent_dir, 'dataset_input')

    try:
        if not os.path.exists(dest_dir):
            shutil.copytree(src_dir, dest_dir)
            print(f"Directory copied from {src_dir} to {dest_dir}")
        else:
            print(f"Directory {dest_dir} already exists")
    except shutil.Error as e:
        print(f"Error copying directory: {e}")
    except OSError as e:
        print(f"OS error: {e}")

    return dest_dir

In [4]:
def process_audio_files(directory):
    """Index every MP3/WAV file of a dataset and write its metadata JSON.

    The dataset is first copied with ``copy_dataset``; all further work is
    done on that copy. For each ``.mp3``/``.wav`` file found in the copy, a
    JSON file with the same base name is written next to it (an existing JSON
    file is left untouched) and an entry describing the track is collected.
    The collected entries are written to ``track_data.json`` in the current
    working directory. Files that fail to load are reported and skipped.

    Args:
        directory: Path to the root directory of the original dataset.
    """
    track_data = []
    dataset_copy = copy_dataset(directory)

    # Fragments are stored next to the dataset copy, not inside it.
    parent_dir = os.path.dirname(dataset_copy)
    fragments_dir = os.path.join(parent_dir, 'dataset_fragments')

    for subdir, dirs, files in os.walk(dataset_copy):
        for file in files:
            file_stem, raw_extension = os.path.splitext(file)
            file_extension = raw_extension.lower()

            if file_extension not in ('.mp3', '.wav'):
                # Skip non-audio files before touching the tag parser;
                # otherwise every JSON file written below would be parsed
                # (and reported as an error) on the next run.
                continue

            filepath = os.path.join(subdir, file)
            title, artist, genre, bpm = get_metadata(filepath)

            try:
                audio = AudioSegment.from_file(filepath)
                data = {
                    "key": "",
                    "artist": artist,
                    "genre": genre,
                    "sample_rate": audio.frame_rate,
                    "file_extension": file_extension[1:],
                    "description": '',
                    "keywords": "",
                    "duration": int(audio.duration_seconds),
                    "bpm": bpm,
                    "title": title,
                    "name": file_stem,
                    "instrument": "",
                    "moods": [""]
                }
                original_json_path = os.path.join(subdir, file_stem + '.json')
                if not os.path.exists(original_json_path):
                    with open(original_json_path, 'w') as json_file:
                        json.dump(data, json_file, indent=4)
                    print(f'Original JSON file created for: {file}')
                else:
                    # Never overwrite: the JSON may have been edited by hand.
                    print(f'Skipping {file} as JSON file already exists.')

                # Everything create_fragments_and_json needs for this track.
                files_json = {
                    "filepath": filepath,
                    "file_extension": file_extension,
                    "fragments_dir": fragments_dir,
                    "original_json_path": original_json_path
                }
                track_data.append(files_json)

            except Exception as e:
                print(f'Failed to process {file}: {e}')

    with open('track_data.json', 'w') as f:
        json.dump(track_data, f, indent=4)

In [19]:
def create_fragments_and_json(fragment_length_ms=15000):
    """Split every indexed track into fragments with their own JSON files.

    Reads ``track_data.json`` from the current working directory. Each track
    listed there is cut into consecutive fragments of ``fragment_length_ms``
    milliseconds (the last one may be shorter). Every fragment is exported to
    the track's fragments directory in the track's own format, together with
    a JSON file copied from the track's JSON with ``name`` and ``duration``
    replaced. A fragment is skipped when its audio file or its JSON file
    already exists. A track that fails is reported and the remaining tracks
    are still processed.

    Fragment names carry the start offset in whole seconds, so fragment
    lengths that are not a multiple of 1000 ms can map several fragments to
    the same name; only the first of those is written.

    Args:
        fragment_length_ms: Length of one fragment in milliseconds.

    Raises:
        ValueError: If ``fragment_length_ms`` is not positive.
    """
    if fragment_length_ms <= 0:
        raise ValueError(
            f'fragment_length_ms must be a positive number of milliseconds, got {fragment_length_ms}'
        )

    with open('track_data.json', 'r') as f:
        track_data = json.load(f)

    for track in track_data:
        filepath = track["filepath"]
        file_extension = track["file_extension"]
        fragments_dir = track["fragments_dir"]
        original_json_path = track["original_json_path"]

        try:
            audio = AudioSegment.from_file(filepath)
            file_base_name = os.path.splitext(os.path.basename(filepath))[0]

            # Metadata of the whole track (possibly edited by hand); used as
            # the template for every fragment's JSON.
            with open(original_json_path, 'r') as json_file:
                track_metadata = json.load(json_file)

            if not os.path.exists(fragments_dir):
                os.makedirs(fragments_dir)
                print(f'Created directory for fragments: {fragments_dir}')

            # len(audio) is the track length in milliseconds.
            for i in range(0, len(audio), fragment_length_ms):
                start_seconds = i // 1000
                fragment_stem = f"{file_base_name}_fragment_{start_seconds}s"
                fragment_name = f"{fragment_stem}{file_extension}"
                fragment_path = os.path.join(fragments_dir, fragment_name)
                fragment_json_path = os.path.join(fragments_dir, fragment_stem + '.json')

                if not os.path.exists(fragment_path) and not os.path.exists(fragment_json_path):
                    fragment = audio[i:i + fragment_length_ms]
                    fragment.export(fragment_path, format=file_extension[1:])

                    fragment_data = track_metadata.copy()
                    # Track JSON stores the name without extension (the
                    # extension lives in "file_extension"); fragments follow
                    # the same convention.
                    fragment_data['name'] = fragment_stem
                    fragment_data['duration'] = int(fragment.duration_seconds)

                    with open(fragment_json_path, 'w') as json_file:
                        json.dump(fragment_data, json_file, indent=4)
                    print(f'Fragment JSON file created for: {fragment_name}')
                else:
                    print(f'Skipping existing files for: {fragment_name}')
        except Exception as e:
            # Same policy as process_audio_files: report the failing track
            # and continue with the rest of the batch.
            print(f'Failed to process {filepath}: {e}')

In [None]:
# Root directory of the original dataset; adjust this path for your machine
dataset_root = r'C:\Users\noname\Desktop\dataset_musicgen\dataset_original'
process_audio_files(dataset_root)

In [None]:
# Optional: edit the JSON files of the original tracks now, before creating fragments; each fragment's JSON is copied from its track's JSON
# ...

In [None]:
# Split every indexed track into fragments; fragment_length_ms is given in milliseconds (15000 is the default)
create_fragments_and_json(fragment_length_ms=15000)