In [None]:
# read upgrade_dataset.json
# take each url and download it with the pytube library, save it as a .wav music file in data/genres_original/{genre}/{genre}.upgrade.{index}.wav, and shorten it to a multiple of 30 seconds
# my json is like this {"blues": [{"title": "The Thrill Is Gone - B.B. King", "url": "https://www.youtube.com/watch?v=4fk2prKnYnI"}, ... ], ...}

In [None]:
%pip install pytube pydub
%cd /tf/notebooks/notebooks

In [None]:
import json
import os
from pytube import YouTube
from pydub import AudioSegment

# Load the genre -> song-list mapping from the JSON file.
# JSON text is UTF-8; passing the encoding explicitly keeps non-ASCII song
# titles from being mis-decoded under a platform-specific default encoding.
with open('upgrade_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Define a function to download and process the audio
def download_and_process_audio(genre, title, url, index):
    """Download one song's audio track and store it as a trimmed WAV file.

    The result is written to
    ``data/genres_original/{genre}/{genre}.upgrade.{index}.wav``. The audio is
    cut down to the largest multiple of 30 seconds that fits in the recording.
    If the target file already exists the song is skipped without any network
    access, so the notebook cell can be re-run cheaply.

    Args:
        genre: Genre name; used for the output directory and file name.
        title: Human-readable song title; only used in progress messages.
        url: URL handed to ``pytube.YouTube``.
        index: Position of the song within its genre; part of the file name.

    Returns:
        None. Progress and errors are reported with ``print``; failures are
        deliberately not raised so one bad entry does not stop a batch.
    """
    chunk_ms = 30 * 1000  # pydub measures audio length in milliseconds
    filename = f"data/genres_original/{genre}/{genre}.upgrade.{index}.wav"
    temp_path = f"data/temp/{genre}_{index}.mp4"

    # Check for the finished WAV first so a re-run never contacts YouTube
    # for songs that were already processed.
    if os.path.exists(filename):
        print(f"Already downloaded: {title}")
        return

    try:
        # Neither directory is guaranteed to exist when this is called.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        os.makedirs(os.path.dirname(temp_path), exist_ok=True)

        yt = YouTube(url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            print(f"Error processing {title}: no audio-only stream available")
            return

        # Download the audio track into the temp directory
        stream.download(filename=temp_path)

        # Decode the downloaded file so it can be re-exported as WAV
        audio = AudioSegment.from_file(temp_path, format="mp4")

        # Round down to the nearest multiple of 30 seconds
        duration = len(audio)
        duration -= duration % chunk_ms
        if duration == 0:
            # Exporting would create an empty WAV that later runs would
            # treat as "already downloaded".
            print(f"Error processing {title}: audio is shorter than 30 seconds")
            return

        # Export the trimmed audio as a WAV file
        audio[:duration].export(filename, format="wav")

        print(f"Downloaded and processed: {title}")

    except Exception as e:
        # The target did not exist when we started, so anything there now is
        # a partial export; remove it so the next run retries this song.
        if os.path.exists(filename):
            os.remove(filename)
        print(f"Error processing {title}: {e}")

    finally:
        # The intermediate download is not needed once decoding is done.
        if os.path.exists(temp_path):
            os.remove(temp_path)

# Process each entry in the JSON data: one output directory per genre,
# one download per song.
for genre, songs in data.items():
    # exist_ok=True replaces the separate existence check: it is a no-op when
    # the directory is already there and avoids the check-then-create race.
    os.makedirs(f"data/genres_original/{genre}/", exist_ok=True)

    # A song's position in its genre list becomes the index in its file name.
    for index, song in enumerate(songs):
        download_and_process_audio(genre, song['title'], song['url'], index)
