In [1]:
import torch
import torchaudio
import numpy as np
import pandas as pd
import os
import librosa
import torchaudio.transforms as T

# Example recording that the demo cell below passes to get_samples.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
SAMPLE_AUDIO = '/work/dpandya/LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0007.wav'
# Sampling rate the notebook works at; get_samples resamples to it by default.
SAMPLE_RATE = 16000

In [2]:
def get_samples(path, resample=SAMPLE_RATE):
    '''
    Load an audio file and hand it back at the requested sampling rate.

    Args:
    path: location of the audio file, passed straight to torchaudio.load
    resample: target sampling rate; defaults to SAMPLE_RATE

    Returns:
    (waveform, rate): the waveform produced by torchaudio.load, run through
    torchaudio.transforms.Resample when the file's own rate differs from
    `resample`, together with the sampling rate of the returned waveform
    '''
    waveform, native_rate = torchaudio.load(path)

    # Only pay for resampling when the file is not already at the target rate.
    if native_rate != resample:
        to_target = T.Resample(native_rate, resample, dtype=waveform.dtype)
        return to_target(waveform), resample

    return waveform, native_rate


In [3]:
# Audio renders an inline audio player in the notebook output.
from IPython.display import Audio
# Load the sample recording; get_samples resamples it to SAMPLE_RATE if needed.
aud, sr = get_samples(SAMPLE_AUDIO)
print(aud.shape, sr)
# Write the loaded waveform to test.wav (relative path, so it lands in the working directory).
torchaudio.save('test.wav', aud, sr)
# squeeze() drops size-1 dimensions before the waveform is handed to the player as a NumPy array.
display(Audio(aud.squeeze().numpy(), rate=sr))

torch.Size([1, 141120]) 16000


In [8]:
def get_audio_length(audio, sample_rate=None):
    '''
    Returns the length of an audio in secs.

    Args:
    audio: sequence of channels; the length is measured on the first channel, audio[0]
        (e.g. a torch.Tensor of shape [1, n_samples])
    sample_rate: sampling rate of `audio`. When left as None it falls back to
        SAMPLE_RATE, which matches the previous hard-coded behaviour

    Returns:
    Duration in seconds as a float: number of samples in audio[0] / sample_rate
    '''
    # Resolve the fallback at call time so audio loaded at a different rate
    # (get_samples accepts any `resample` value) can still be measured correctly.
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    return len(audio[0]) / sample_rate

In [5]:
def make_audio_chunks(audio, chunk_size=1, sample_rate=None):
    '''
    This function splits audio in chunks of n seconds.
    You can adjust the chunk size by using the chunk_size param.

    Args:
    audio: torch.Tensor of shape [1,n_samples]; only the first channel, audio[0], is split
    chunk_size: desired number of seconds in each chunk (may be fractional, e.g. 0.5)
    sample_rate: sampling rate of `audio`. When left as None it falls back to SAMPLE_RATE

    Returns:
    audio_chunks: a list of consecutive slices of audio[0], each holding
    round(chunk_size * sample_rate) samples; the last chunk is shorter when the
    audio length is not an exact multiple of the chunk length

    Raises:
    ValueError: if chunk_size * sample_rate rounds to less than one sample
    '''
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    # range() only accepts an integer step, so a fractional chunk_size has to be
    # converted to a whole number of samples before it is used for slicing.
    samples_per_chunk = int(round(chunk_size * sample_rate))
    if samples_per_chunk < 1:
        # A zero step would make range() fail and a negative one would silently
        # yield no chunks; report the bad argument explicitly instead.
        raise ValueError(
            'chunk_size must span at least one sample, got chunk_size=%r at sample_rate=%r'
            % (chunk_size, sample_rate)
        )

    channel = audio[0]
    return [
        channel[start:start + samples_per_chunk]
        for start in range(0, len(channel), samples_per_chunk)
    ]

In [7]:
# Split the sample recording with the default chunk_size (1) and list the shape of every chunk.
ll = make_audio_chunks(aud)
print([i.shape for i in ll])

[torch.Size([16000]), torch.Size([16000]), torch.Size([16000]), torch.Size([16000]), torch.Size([16000]), torch.Size([16000]), torch.Size([16000]), torch.Size([16000]), torch.Size([13120])]
