In [1]:
!pip install mutagen
!pip install pydub
!pip install shutil



In [1]:
import os
import json
from pydub import AudioSegment
from pydub import AudioSegment
from mutagen.mp3 import MP3
from mutagen.wave import WAVE
from mutagen.easyid3 import EasyID3
import shutil

In [2]:
def get_metadata(file_path):
    """Read basic tags from an audio file through mutagen's MP3/EasyID3 reader.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A ``(title, artist, genre, bpm)`` tuple. ``genre`` is always the fixed
        string ``'Trailer'`` (the file's own genre tag is not read). ``bpm`` is
        an ``int`` when the tag parses as a number, otherwise ``''``. If
        anything goes wrong while reading, the error is printed and four empty
        strings are returned.
    """
    try:
        tags = MP3(file_path, ID3=EasyID3)

        def first_value(field):
            # EasyID3 values are lists; a missing tag falls back to ''.
            return tags.get(field, [''])[0]

        title = first_value('title')
        artist = first_value('artist')
        raw_bpm = first_value('bpm')
        genre = 'Trailer'

        # The tag may be missing or non-numeric; both cases collapse to ''.
        try:
            tempo = int(float(raw_bpm))
        except ValueError:
            tempo = ''

        return title, artist, genre, tempo
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        return '', '', '', ''

In [3]:
def copy_dataset(src_dir):
    """Copy the dataset into a sibling ``dataset_input`` directory.

    The destination is ``<parent of src_dir>/dataset_input``. If it already
    exists nothing is copied. Copy errors are printed, not raised.

    Args:
        src_dir: Root directory of the source dataset.

    Returns:
        The destination path. It is returned even when the copy was skipped or
        failed, so the directory may be missing or incomplete after an error.
    """
    # Normalise before taking the parent: for a path with a trailing separator
    # (e.g. ``data/Action_Trailer/``) os.path.dirname would return the source
    # directory itself and the copy would land inside the source tree.
    parent_dir = os.path.dirname(os.path.normpath(src_dir))
    directory_name = 'dataset'
    dest_dir = os.path.join(parent_dir, f'{directory_name}_input')

    try:
        if not os.path.exists(dest_dir):
            shutil.copytree(src_dir, dest_dir)
            print(f"Directory copied from {src_dir} to {dest_dir}")
        else:
            print(f"Directory {dest_dir} already exists")
    except shutil.Error as e:
        print(f"Error copying directory: {e}")
    except OSError as e:
        print(f"OS error: {e}")

    return dest_dir

In [4]:
def process_audio_files(directory):
    """Copy the dataset, write one JSON metadata file per track, and index the tracks.

    The dataset is first duplicated with ``copy_dataset``. Every ``.mp3`` /
    ``.wav`` file found under the copy gets a ``<name>.json`` file next to it
    (an existing JSON file is left untouched). A record for each track that
    could be decoded is collected and written to ``track_data.json`` in the
    current working directory, where ``create_fragments_and_json`` reads it.

    Args:
        directory: Root directory of the source dataset.
    """
    track_data = []
    dataset_copy = copy_dataset(directory)

    parent_dir = os.path.dirname(dataset_copy)
    fragments_dir = os.path.join(parent_dir, 'dataset_fragments')

    for subdir, _dirs, files in os.walk(dataset_copy):
        for file in files:
            file_extension = os.path.splitext(file)[1].lower()

            # Only audio files are inspected. Reading tags from every file made
            # the JSON files written below (and any other non-audio file)
            # print a spurious "Error reading ..." message.
            if file_extension not in ('.mp3', '.wav'):
                continue

            filepath = os.path.join(subdir, file)
            track_name = os.path.splitext(file)[0]

            # NOTE(review): get_metadata opens every file with the MP3 reader, so
            # .wav inputs will likely fall into its error path and come back with
            # empty tags -- confirm whether that is acceptable.
            title, artist, genre, bpm = get_metadata(filepath)

            try:
                audio = AudioSegment.from_file(filepath)
                data = {
                    "key": "",
                    "artist": artist,
                    "genre": genre,
                    "sample_rate": audio.frame_rate,
                    "file_extension": file_extension[1:],
                    "description": '',
                    "keywords": "",
                    "duration": int(audio.duration_seconds),
                    "bpm": bpm,
                    "title": title,
                    "name": track_name,
                    "instrument": "",
                    "moods": [""]
                }
                original_json_path = os.path.join(subdir, track_name + '.json')
                if not os.path.exists(original_json_path):
                    with open(original_json_path, 'w') as json_file:
                        json.dump(data, json_file, indent=4)
                    print(f'Original JSON file created for: {file}')
                else:
                    # Keep the existing file so manual edits survive a re-run.
                    print(f'Skipping {file} as JSON file already exists.')

                # The track is indexed even when its JSON file already existed.
                files_json = {
                    "filepath": filepath,
                    "file_extension": file_extension,
                    "fragments_dir": fragments_dir,
                    "original_json_path": original_json_path
                    }
                track_data.append(files_json)

            except Exception as e:
                print(f'Failed to process {file}: {e}')

    with open('track_data.json', 'w') as f:
        json.dump(track_data, f, indent=4)

In [5]:
import os
import json
from pydub import AudioSegment
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_fragment(data, i, fragment_length_ms):
    """Export one slice of a track together with a JSON metadata file for it.

    Args:
        data: Track record with the keys ``"filepath"``, ``"file_extension"``
            (including the leading dot), ``"fragments_dir"`` and
            ``"original_json_path"``.
        i: Start offset of the slice within the track, in milliseconds.
        fragment_length_ms: Requested slice length in milliseconds.

    Returns:
        A status message saying whether the fragment was created or skipped.
    """
    filepath = data["filepath"]
    file_extension = data["file_extension"]
    fragments_dir = data["fragments_dir"]
    original_json_path = data["original_json_path"]

    file_base_name = os.path.splitext(os.path.basename(filepath))[0]
    start_seconds = i // 1000
    fragment_name = f"{file_base_name}_fragment_{start_seconds}s{file_extension}"
    fragment_path = os.path.join(fragments_dir, fragment_name)
    fragment_json_path = os.path.join(fragments_dir, os.path.splitext(fragment_name)[0] + '.json')

    # Check for existing output before decoding anything: decoding the whole
    # track is the expensive step and is pointless for a finished fragment.
    # The fragment is skipped only when BOTH files exist, so a pair left
    # half-written by an interrupted run is regenerated rather than skipped forever.
    if os.path.exists(fragment_path) and os.path.exists(fragment_json_path):
        return f'Skipping existing files for: {fragment_name}'

    audio = AudioSegment.from_file(filepath)
    with open(original_json_path, 'r') as json_file:
        track_metadata = json.load(json_file)

    fragment = audio[i:i + fragment_length_ms]
    fragment.export(fragment_path, format=file_extension[1:])

    # The fragment inherits the track's metadata; only name and duration differ.
    # NOTE(review): 'name' here includes the file extension, while the track-level
    # JSON stores the name without it -- confirm which form consumers expect.
    fragment_data = track_metadata.copy()
    fragment_data['name'] = fragment_name
    fragment_data['duration'] = int(fragment.duration_seconds)

    with open(fragment_json_path, 'w') as json_file:
        json.dump(fragment_data, json_file, indent=4)

    return f'Fragment JSON file created for: {fragment_name}'

def create_fragments_and_json(fragment_length_ms=15000):
    """Split every indexed track into fragments using a thread pool.

    Reads the track index from ``track_data.json`` in the current working
    directory, creates each track's fragments directory if needed, and submits
    one ``process_fragment`` task per ``fragment_length_ms`` step of the track.
    The status message of each task is printed as it completes.

    A track that cannot be loaded, or a fragment whose task raises, is
    reported and skipped so that one bad file does not abort the whole run.

    Args:
        fragment_length_ms: Fragment length in milliseconds (default 15000).

    Raises:
        ValueError: If ``fragment_length_ms`` is not positive.
    """
    if fragment_length_ms <= 0:
        raise ValueError("fragment_length_ms must be a positive number of milliseconds")

    with open('track_data.json', 'r') as f:
        track_data = json.load(f)

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Maps each future to (track path, start offset in ms) for error reports.
        future_to_fragment = {}
        for data in track_data:
            if not os.path.exists(data["fragments_dir"]):
                os.makedirs(data["fragments_dir"], exist_ok=True)
                print(f'Created directory for fragments: {data["fragments_dir"]}')

            # The track is loaded here only to learn its length in milliseconds.
            try:
                audio = AudioSegment.from_file(data["filepath"])
            except Exception as e:
                print(f'Failed to load {data["filepath"]}: {e}')
                continue

            for i in range(0, len(audio), fragment_length_ms):
                future = executor.submit(process_fragment, data, i, fragment_length_ms)
                future_to_fragment[future] = (data["filepath"], i)

        for future in as_completed(future_to_fragment):
            filepath, offset_ms = future_to_fragment[future]
            try:
                # result() re-raises whatever the worker raised.
                print(future.result())
            except Exception as e:
                print(f'Failed to create fragment at {offset_ms // 1000}s of {filepath}: {e}')

In [6]:
# Specify the path to the root directory of the dataset.
# The directory is copied to a sibling `dataset_input` folder, a JSON metadata file is
# written next to every .mp3/.wav in that copy, and `track_data.json` is written to the
# current working directory.
process_audio_files(r'D:\audiocraft_training\Action_Trailer')

Directory copied from D:\audiocraft_training\Action_Trailer to D:\audiocraft_training\dataset_input
Original JSON file created for: 001_ROCK022009_HIGHSPEED_RACE_A_SONOTON.mp3
Original JSON file created for: 001_SCDV099804_SINISTER_AGGRESSION_A_SONOTON.mp3
Original JSON file created for: 001_SCDV123801_CEREMONY_BY_FIRE_A_SONOTON.mp3
Original JSON file created for: 002_SCDV101425_METAL_POWER_A_SONOTON.mp3
Original JSON file created for: 002_SCDV102401_GET_OUT_OF_THE_BANK_A_SONOTON.mp3
Original JSON file created for: 002_STT007301_LET_THEM_KNOW_A_SONOTON.mp3
Original JSON file created for: 003_SCDV097504_COMPUTER_ANIMATION_GROOVE_SONOTON.mp3
Original JSON file created for: 003_SCDV102417_REVENGE_OF_THE_MACHINES_A_SONOTON.mp3
Original JSON file created for: 003_STT007501_DAY_OF_JUSTICE_A_SONOTON.mp3
Original JSON file created for: 004_ROCK020501_NIGHTMARE_CALL_A_SONOTON.mp3
Original JSON file created for: 004_SCDV100437_TWENTY_FOURS_A_SONOTON.mp3
Original JSON file created for: 004_STT007

In [None]:
# Optional: edit the JSON files of the original tracks here, before creating fragments
# ...

In [6]:
# Split each track into fragments. Default: fragment_length_ms=15000 (milliseconds)
create_fragments_and_json()

Created directory for fragments: D:\audiocraft_training\dataset_fragments
Fragment JSON file created for: ATMOS_259_18_Party_In_The_Jungle_Thomas_442770_fragment_0s.mp3
Fragment JSON file created for: ATMOS_85_18_Blunted_Beats_Everitt_3546_fragment_120s.mp3
Fragment JSON file created for: NSPS209_10_In Your Face_Full Mix_fragment_120s.mp3
Fragment JSON file created for: UPM_KOK2364_22_Everyday_s_Battle_Underscore_Underscore_Le_Quer_Prodhomme_782895_fragment_15s.mp3
Fragment JSON file created for: CHAP_321_5_Hunting_For_Victory_Pemberton_308466_fragment_45s.mp3
Fragment JSON file created for: UPM_KOK2413_7_Eternal_Honor_Instrumental_Davoli_936724_fragment_30s.mp3
Fragment JSON file created for: UPM_BR361_BRQ6_14_Ruined_Temple_Main_Track_Campbell_192582_fragment_120s.mp3
Fragment JSON file created for: ATMOS_6_93_The_Flying_Dutchman_-_Overture_Wagner_278016_fragment_0s.mp3
Fragment JSON file created for: ATMOS_258_3_Pump_To_The_Bass_Groves_Marett_427741_fragment_75s.mp3
Fragment JSON fil

In [1]:
import os
import shutil

def _move_batch(source_folder, batch_files, batch_number):
    """Move ``batch_files`` from ``source_folder`` into its ``batch_<n>`` subfolder."""
    target_folder = os.path.join(source_folder, f'batch_{batch_number}')
    os.makedirs(target_folder, exist_ok=True)
    for name in batch_files:
        shutil.move(os.path.join(source_folder, name), os.path.join(target_folder, name))


def organize_files_by_size(source_folder, max_size=1e9):
    """
    Distribute the files of a folder into ``batch_<n>`` subfolders by total size.

    Files directly inside ``source_folder`` (subfolders are ignored) are sorted
    by size, smallest first, and packed in that order; a new batch is started
    when adding the next file would push the current batch over ``max_size``.
    A single file larger than ``max_size`` gets a batch of its own.

    Files are batched individually: files that share a base name (for example
    an audio file and a ``.json`` next to it) are not kept together.

    :param source_folder: Path to the folder whose files are organized.
    :param max_size: Maximum total size of the files in one batch, in bytes
        (default 1 GB).
    """
    # Collect (size, name) once so each file is stat'ed a single time.
    sized_files = []
    for name in os.listdir(source_folder):
        path = os.path.join(source_folder, name)
        if os.path.isfile(path):
            sized_files.append((os.path.getsize(path), name))
    sized_files.sort(key=lambda item: item[0])

    current_batch = []
    current_size = 0
    batch_number = 1

    for file_size, name in sized_files:
        # Flush only a non-empty batch: when the very first file is already
        # larger than max_size there is nothing to move yet, and flushing
        # would create an empty batch folder and shift the numbering.
        if current_batch and current_size + file_size > max_size:
            _move_batch(source_folder, current_batch, batch_number)
            # Reset the state for the next batch.
            current_batch = []
            current_size = 0
            batch_number += 1

        # Add the file to the current batch.
        current_batch.append(name)
        current_size += file_size

    # Move the remaining files of the last batch.
    if current_batch:
        _move_batch(source_folder, current_batch, batch_number)

if __name__ == "__main__":
    source_folder = r'D:\audiocraft_training\dataset_fragments'  # Replace with the path to your folder
    organize_files_by_size(source_folder)
