In [None]:
# Audioset downloader
# Use youtube-dl to download, keep track of ones downloaded

import csv
import subprocess
import os
import datetime

# Input CSV file with video details
input_file = 'audioset/balanced_train_segments.csv'
# Output CSV file for logging failures
failures_file = 'audioset/balanced_train_segments_FAILED.csv'

# Create a directory to store audio files
audio_directory = 'audioset/data/download-test'
os.makedirs(audio_directory, exist_ok=True)

# Download one WAV clip per row of the input CSV.
# Columns read: row[0] = YouTube id, row[1] = clip start (seconds),
# row[2] = clip end (seconds). Any further columns are ignored here.
# The failure log is opened in 'w' mode, so it is rewritten on every run.
# A missing youtube-dl or ffmpeg executable raises FileNotFoundError and
# stops the run instead of logging every row as a failure.
with open(input_file, 'r', newline='') as csvfile, open(failures_file, 'w', newline='') as failurefile:
    reader = csv.reader(csvfile)
    failure_writer = csv.writer(failurefile)

    # Write the header for the failure file
    failure_writer.writerow(['YTID', 'error'])

    # Skip the header rows
    next(reader, None)
    next(reader, None)
    next(reader, None)

    for row in reader:
        # Blank or truncated lines cannot be processed; skip them rather
        # than abort the whole run with an IndexError.
        if len(row) < 3:
            continue

        # Fields are stripped because they are now passed to the tools as
        # separate arguments, where stray padding would no longer be
        # swallowed by a shell.
        ytid = row[0].strip()
        start_seconds = row[1].strip()
        try:
            duration = float(row[2]) - float(start_seconds)
        except ValueError:
            failure_writer.writerow([ytid, 'Malformed start/end time'])
            print(f'Failed to parse times for video {ytid}')
            continue

        video_url = f'https://www.youtube.com/watch?v={ytid}'
        output_audio_file = os.path.join(audio_directory, f'{ytid}.wav')

        # Get the URL for the best audio stream.
        # stdout and the exit status are read separately: getoutput() mixes
        # stderr into its result, so an error message from a failed lookup
        # would look like a non-empty "URL".
        url_lookup = subprocess.run(
            ['youtube-dl', '-g', '-f', 'bestaudio', video_url],
            capture_output=True,
            text=True,
        )
        audio_url = url_lookup.stdout.strip()

        if url_lookup.returncode != 0 or not audio_url:
            failure_writer.writerow([ytid, 'URL retrieval failure'])
            print(f'Failed to retrieve URL for video {ytid}')
            continue

        # Download and convert the audio segment using ffmpeg.
        # An argument list (no shell) keeps characters in the URL from being
        # interpreted; -nostdin/-y stop ffmpeg from waiting on an interactive
        # overwrite prompt when the output file already exists.
        download_result = subprocess.run([
            'ffmpeg', '-nostdin', '-y',
            '-ss', start_seconds,
            '-i', audio_url,
            '-t', str(duration),
            '-acodec', 'pcm_s16le',
            '-ar', '44100',
            output_audio_file,
        ]).returncode

        if download_result != 0:
            failure_writer.writerow([ytid, 'Download or conversion failure'])
            print(f'Failed to process video {ytid}')
            continue

        print(f'Successfully processed video {ytid}')

In [7]:
import csv
import subprocess
import os
import pandas as pd

# Input CSV file with video details
input_file = 'audiocaps/dataset/train.csv'
# Output CSV file for metadata
success_file = 'audiocaps/dataset/train_download_success.csv'
# Output CSV file for logging failures
failures_file = 'audiocaps/dataset/train_download_fail.csv'

# Create a directory to store audio files
audio_directory = 'audiocaps/dataset/train'
os.makedirs(audio_directory, exist_ok=True)


def _logged_ytids(log_path):
    """Return the set of ytids (column 1) recorded in a success/failure log.

    A missing file yields an empty set. The first row is treated as a header
    only when its first field is not a plain integer, so logs written without
    a header do not lose their first entry.
    NOTE(review): this assumes audiocap_id values are integers - confirm.
    """
    ytids = set()
    if not os.path.exists(log_path):
        return ytids
    with open(log_path, 'r', newline='') as logfile:
        for index, row in enumerate(csv.reader(logfile)):
            if len(row) < 2:
                continue  # blank or truncated line
            if index == 0 and not row[0].strip().isdigit():
                continue  # header row
            ytids.add(row[1])
    return ytids


def _needs_header(log_path):
    """Return True when the log file is missing or empty."""
    return not os.path.exists(log_path) or os.path.getsize(log_path) == 0


# Previously processed ytids, read from the success and failure logs.
# Videos that failed before are not retried.
successful_ytids = _logged_ytids(success_file)
processed_ytids = successful_ytids | _logged_ytids(failures_file)

# only download 10%
total_rows = len(pd.read_csv(input_file))
target = total_rows // 10
# Resume the counter from the success log instead of a hand-maintained number.
n_downloaded = len(successful_ytids)

# Decide before opening in append mode, which creates the files.
write_success_header = _needs_header(success_file)
write_failure_header = _needs_header(failures_file)

# Open the input file for reading and both logs for appending.
# A missing youtube-dl or ffmpeg executable raises FileNotFoundError and
# stops the run instead of logging every row as a failure.
with open(input_file, 'r', newline='') as csvfile, open(failures_file, 'a', newline='') as failurefile, open(success_file, 'a', newline='') as successfile:
    reader = csv.reader(csvfile)
    failure_writer = csv.writer(failurefile)
    success_writer = csv.writer(successfile)

    # New logs get a header so the readers above have one to skip.
    if write_failure_header:
        failure_writer.writerow(['audiocap_id', 'youtube_id', 'error'])
    if write_success_header:
        success_writer.writerow(['audiocap_id', 'youtube_id', 'caption'])

    # Skip the header rows
    next(reader, None)

    for row in reader:
        # Stop once the quota is met. Using >= also covers a run that starts
        # at or beyond the target, which an == test would never catch.
        if n_downloaded >= target:
            print(f'{n_downloaded} files downloaded')
            break

        # Columns read: id, ytid, start time, caption.
        if len(row) < 4:
            continue  # blank or truncated line

        audiocap_id = row[0]
        ytid = row[1]

        # If this ytid has already been processed, skip to the next row
        if ytid in processed_ytids:
            print(f'Skipping already processed video {ytid}')
            continue

        start_seconds = row[2]
        duration = "00:00:10"
        caption = row[3]

        video_url = f'https://www.youtube.com/watch?v={ytid}'
        output_audio_file = os.path.join(audio_directory, f'{ytid}.wav')

        # Get the URL for the best audio stream.
        # stdout and the exit status are read separately: getoutput() mixes
        # stderr into its result, so an error message from a failed lookup
        # would look like a non-empty "URL".
        url_lookup = subprocess.run(
            ['youtube-dl', '-g', '-f', 'bestaudio', video_url],
            capture_output=True,
            text=True,
        )
        audio_url = url_lookup.stdout.strip()

        if url_lookup.returncode != 0 or not audio_url:
            failure_writer.writerow([audiocap_id, ytid, 'URL retrieval failure'])
            failurefile.flush()  # keep the resume state on disk if the kernel dies
            print(f'Failed to retrieve URL for video {ytid}')
            continue

        # Download and convert the audio segment using ffmpeg.
        # An argument list (no shell) keeps characters in the URL from being
        # interpreted; -nostdin/-y stop ffmpeg from waiting on an interactive
        # overwrite prompt when a partial file is left from an earlier run.
        download_result = subprocess.run([
            'ffmpeg', '-nostdin', '-y',
            '-ss', start_seconds.strip(),
            '-i', audio_url,
            '-t', duration,
            '-acodec', 'pcm_s16le',
            '-ar', '44100',
            output_audio_file,
        ]).returncode

        if download_result != 0:
            failure_writer.writerow([audiocap_id, ytid, 'Download or conversion failure'])
            failurefile.flush()
            print(f'Failed to process video {ytid}')
            continue

        print(f'Successfully processed video {ytid}')
        success_writer.writerow([audiocap_id, ytid, caption])
        successfile.flush()

        n_downloaded += 1


4875
Duplicates in column: 0       103549
1       103548
2       103541
3       103540
4       103542
         ...  
4870    103094
4871    103093
4872    103092
4873    103091
4874    103090
Name: audiocap_id, Length: 4875, dtype: int64
Series([], Name: count, dtype: int64)


In [None]:
import os

# Report how many regular files (sub-directories excluded) sit directly
# inside each AudioCaps split folder.
for split in ('train', 'test', 'val'):
    path = f'audiocaps/dataset/{split}/'
    files = []
    for entry in os.listdir(path):
        if os.path.isfile(os.path.join(path, entry)):
            files.append(entry)
    number_of_files = len(files)

    print(f"There are {number_of_files} files in the {split} folder.")


: 