In [None]:
import os
import zipfile
import tarfile
import torch
from tqdm import tqdm
from torchaudio.io import StreamReader
import torchaudio
import time
from IPython.display import Audio, display

In [None]:
def process_audio_chunks(
    file_name,
    file_stream,
    chunk_size,
    target_sample_rate
):
    """Decode an audio stream in fixed-length chunks and count the chunks.

    Args:
        file_name: Name of the audio file; used only in the mono-check
            error message.
        file_stream: Source handed straight to ``StreamReader`` (the caller
            in this notebook passes the file object returned by
            ``TarFile.extractfile``).
        chunk_size: Chunk length in seconds; multiplied by
            ``target_sample_rate`` to get the frames requested per chunk.
        target_sample_rate: Sample rate requested from the decoder.

    Returns:
        The number of chunks produced by the stream, ``0`` if the stream
        yields nothing.

    Raises:
        AssertionError: If a decoded chunk's last dimension (channels) is
            not 1.
    """
    streamer = StreamReader(file_stream)

    streamer.add_basic_audio_stream(
        frames_per_chunk=int(chunk_size*target_sample_rate),
        sample_rate=target_sample_rate,
        decoder_option={"threads": "0"}
    )

    # Start at zero so an empty stream returns 0 instead of hitting an
    # unbound loop variable after the loop.
    total_chunks = 0
    for (chunk,) in streamer.stream():
        assert chunk.shape[-1] == 1, f"Audio needs to be mono, provided {chunk.shape[-1]} channels for {file_name}"
        total_chunks += 1

    return total_chunks


def iterate_tar(x, model_sample_rate: int, chunk_size: int = 30):
    """Run ``process_audio_chunks`` over every regular file in a tar archive.

    Args:
        x: Archive to read; passed unchanged to ``tarfile.open(x, 'r')``.
        model_sample_rate: Sample rate forwarded to ``process_audio_chunks``
            as ``target_sample_rate``.
        chunk_size: Chunk length forwarded to ``process_audio_chunks``.
            Defaults to 30, the value that used to be hard-coded here.

    Returns:
        The sum of the values returned by ``process_audio_chunks`` across
        all processed members.
    """
    total_chunks = 0

    # Both context managers guarantee the archive and the progress bar are
    # closed even if decoding a member raises.
    with tarfile.open(x, 'r') as tar, tqdm(position=0, leave=True) as pbar:
        # Iterating the TarFile yields members as their headers are read,
        # so work starts immediately instead of after getmembers() has
        # scanned the whole archive.
        for member in tar:
            if not member.isfile():
                continue

            file_content = tar.extractfile(member)
            if file_content is None:
                continue

            # Close each extracted member handle once it has been decoded.
            with file_content:
                member_chunks = process_audio_chunks(
                    file_name=member.name,
                    file_stream=file_content,
                    target_sample_rate=model_sample_rate,
                    chunk_size=chunk_size
                )

            pbar.update(member_chunks)
            total_chunks += member_chunks

    return total_chunks

In [None]:
# Archive handed to iterate_tar in the next cell.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
tar_file = '/home/romit/Downloads/audio/youtube/P0001.tar'

In [None]:
# iterate_tar is a plain function (it contains no `yield`): it walks the whole
# archive itself and drives its own progress bar, so it is called directly.
# Looping over its return value raised TypeError.
total_data = []  # nothing is collected yet; kept so later cells can reference it

iterate_tar(tar_file, 16000)

In [None]:
# total_data is only filled if an earlier cell appends decoded audio to it;
# guard the lookup so an empty list does not raise IndexError.
if total_data:
    display(Audio(total_data[0], rate=16000))
else:
    print("total_data is empty - nothing to play")

In [None]:
%%time
with tarfile.open('/home/romit/Downloads/audio/large_test/P0011.tar', 'r:') as tar:
    t1 = time.perf_counter(), time.process_time()
    all_names = tar.getmembers()
    t2 = time.perf_counter(), time.process_time()

In [None]:
%%time
with tarfile.open('/home/romit/Downloads/audio/P0144.tgz', 'r:gz') as tar:
    t1 = time.perf_counter(), time.process_time()
    all_names = tar.getmembers()
    t2 = time.perf_counter(), time.process_time()

In [None]:
# t1 / t2 hold (wall-clock, CPU) snapshots from whichever timing cell ran last.
real_elapsed = t2[0] - t1[0]
cpu_elapsed = t2[1] - t1[1]
print(f" Real time: {real_elapsed:.2f} seconds")
print(f" CPU time: {cpu_elapsed:.2f} seconds")

In [None]:
import os
import tarfile
import shutil
from tqdm import tqdm

def convert_tgz_to_tar(tgz_file_path, output_dir):
    """Re-pack a gzip-compressed tar archive as an uncompressed ``.tar``.

    The archive is extracted into a private scratch directory under
    ``output_dir`` and re-archived as ``<output_dir>/<base_name>.tar`` with
    all contents nested under a top-level ``<base_name>/`` directory.

    Args:
        tgz_file_path: Path to the ``.tgz`` / ``.tar.gz`` archive.
        output_dir: Directory that receives the ``.tar`` file; created if
            it does not exist.

    Returns:
        The path of the written ``.tar`` file, or ``None`` if the
        conversion failed (the error is printed, not raised).
    """
    # Local import: tempfile is not among the notebook's existing imports.
    import tempfile

    os.makedirs(output_dir, exist_ok=True)

    file_name = os.path.basename(tgz_file_path)
    # Strip the whole compressed-tar suffix so "x.tar.gz" becomes "x"
    # rather than "x.tar" (which produced "x.tar.tar").
    for suffix in ('.tar.gz', '.tgz'):
        if file_name.lower().endswith(suffix):
            base_name = file_name[:-len(suffix)]
            break
    else:
        base_name = file_name.rsplit('.', 1)[0]

    tar_file_path = os.path.join(output_dir, base_name + '.tar')
    # Written first and renamed on success, so a failed run never leaves a
    # truncated archive at the final path.
    partial_path = tar_file_path + '.partial'

    # Use a unique scratch directory: an existing <output_dir>/<base_name>
    # must neither be mixed into the archive nor deleted during cleanup.
    scratch_root = tempfile.mkdtemp(prefix=base_name + '_', dir=output_dir)
    temp_dir = os.path.join(scratch_root, base_name)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Uncompress tgz
        with tarfile.open(tgz_file_path, 'r:gz') as tar:
            if hasattr(tarfile, 'data_filter'):
                # Use the 'data' extraction filter where this Python has it;
                # it rejects members that would land outside temp_dir.
                tar.extractall(path=temp_dir, filter='data')
            else:
                tar.extractall(path=temp_dir)

        # Compress to tar
        with tarfile.open(partial_path, 'w') as tar:
            tar.add(temp_dir, arcname=base_name)
        os.replace(partial_path, tar_file_path)

        print(f"Converted {tgz_file_path} to {tar_file_path}")
        return tar_file_path

    except Exception as err:
        # Best-effort by design: report and continue so a batch loop can
        # move on to the next archive.
        print(f'Error: {err} at {tgz_file_path}')
        return None

    finally:
        # Clean up the scratch directory and any unfinished output.
        shutil.rmtree(scratch_root)
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [None]:
# .tgz archives that the conversion loop in the next cell turns into .tar files.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
tgz = [
    '/home/romit/Downloads/audio/large_test/P0011.tgz',
    '/home/romit/Downloads/audio/large_test/P0012.tgz'
]

In [None]:
# Convert every listed archive, writing each .tar into the large_test directory.
for tgz_path in tqdm(tgz):
    convert_tgz_to_tar(tgz_path, output_dir='/home/romit/Downloads/audio/large_test')

In [None]:
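# Scratch note left in a code cell (kept as comments so the cell no longer raises a SyntaxError):
# 5 s
# 50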