In [1]:
import torch
import torchaudio
import numpy as np
import pandas as pd
import os
import librosa
import torchaudio.transforms as T

# Audio file used by the demo cells of this notebook.
# NOTE(review): absolute, machine-specific path -- adjust it when running elsewhere.
SAMPLE_AUDIO = '/work/dpandya/LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0007.wav'
# Sampling rate (samples per second) that the helper functions below use by default.
SAMPLE_RATE = 16000

In [2]:
def get_samples(path, resample=SAMPLE_RATE):
    '''
    Load an audio file and return its waveform at the requested sampling rate.

    Args:
    path: location of the audio file, handed to torchaudio.load
    resample: target sampling rate; the waveform is resampled only when the
        file's own rate differs from this value

    Returns:
    A (waveform, sampling_rate) tuple. The rate is the file's own rate when it
    already equals `resample`, otherwise it is `resample`.
    '''
    waveform, native_rate = torchaudio.load(path)

    # Only pay for resampling when the file is not already at the target rate.
    if native_rate != resample:
        to_target_rate = T.Resample(native_rate, resample, dtype=waveform.dtype)
        return to_target_rate(waveform), resample

    return waveform, native_rate


In [3]:
def get_audio_length(audio, sample_rate=None):
    '''
    Returns the length of an audio in secs.

    Args:
    audio: indexable waveform whose first element holds the samples of one
        channel (e.g. a torch.Tensor of shape [1, n_samples])
    sample_rate: sampling rate of `audio`. When omitted, the notebook-wide
        SAMPLE_RATE is used, so existing one-argument calls behave as before.
        Pass it explicitly for audio that was loaded at a different rate.

    Returns:
    The duration in seconds: samples in the first channel / sample_rate
    '''
    if sample_rate is None:
        # Resolved at call time so the function picks up the current constant.
        sample_rate = SAMPLE_RATE
    return len(audio[0]) / sample_rate

In [4]:
def make_audio_chunks(audio, chunk_size=1, sample_rate=None):
    '''
    This function splits audio in chunks of n seconds.
    You can adjust the chunk sizes by using the chunk_size param.

    Only the first channel (audio[0]) is chunked. The last chunk is shorter
    than the others when the audio length is not a multiple of the chunk length.

    Args:
    audio: torch.Tensor of shape [1,n_samples]
    chunk_size: desired number of seconds in each chunk; may be fractional
        (e.g. 0.5)
    sample_rate: sampling rate of `audio`. When omitted, the notebook-wide
        SAMPLE_RATE is used, so existing calls behave as before.

    Returns:
    audio_chunks: returns a list of audio chunks of the predecided chunk length,
        each of shape [1, n_chunk_samples]

    Raises:
    ValueError: if chunk_size corresponds to less than one sample
        (zero, negative, or too small for the sampling rate)
    '''
    if sample_rate is None:
        # Resolved at call time so the function picks up the current constant.
        sample_rate = SAMPLE_RATE

    # range() only accepts integer steps; a fractional chunk_size would
    # otherwise produce a float here and fail with a TypeError.
    samples_per_chunk = int(round(chunk_size * sample_rate))
    if samples_per_chunk < 1:
        raise ValueError(
            f"chunk_size={chunk_size!r} gives {samples_per_chunk} samples per chunk; "
            "it must cover at least one sample"
        )

    first_channel = audio[0]
    # unsqueeze(0) restores the leading channel dimension on every chunk.
    return [
        first_channel[start:start + samples_per_chunk].unsqueeze(0)
        for start in range(0, len(first_channel), samples_per_chunk)
    ]

In [5]:
from pyannote.audio import Pipeline

# SECURITY: the Hugging Face access token must never be committed with the
# notebook. It is read from the HF_TOKEN environment variable instead; the
# token that used to be hardcoded here should be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before loading the pyannote pipeline."
    )

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=hf_token)


# apply the pipeline to an audio file
diarization = pipeline(SAMPLE_AUDIO)

# dump the diarization output to disk using RTTM format
with open("audio.rttm", "w") as rttm:
    diarization.write_rttm(rttm)


ModuleNotFoundError: No module named 'pyannote'

In [None]:
# Load the sample recording at the notebook's default sampling rate.
aud, sr = get_samples(SAMPLE_AUDIO)

# Cut the waveform into 2-second pieces, then show the rate and the pieces.
ll = make_audio_chunks(aud, 2)
print(sr)
print(ll)

16000
[tensor([[-1.1451e-04,  5.2144e-05, -1.0690e-04,  ...,  1.8235e-02,
          1.9396e-02,  2.0040e-02]]), tensor([[ 0.0197,  0.0181,  0.0145,  ...,  0.0005, -0.0012, -0.0034]]), tensor([[ 0.0031,  0.0037, -0.0005,  ..., -0.0229,  0.0059,  0.0080]]), tensor([[-0.0174,  0.0030,  0.0041,  ..., -0.0371, -0.0422, -0.0455]]), tensor([[-0.0477, -0.0403, -0.0214,  ...,  0.0002,  0.0008,  0.0002]])]
