In [24]:
#| eval: false

from glob import glob
import os
from joblib import Parallel, delayed
from pathlib import Path
from typing import List
from pydub import AudioSegment
import numpy as np

def chunk_audio(audio: AudioSegment, increment_mills: int = 300, windows: int = 1000) -> List[AudioSegment]:
    sample_rate = audio.frame_count(ms=1000)
    seconds = audio.frame_count()/sample_rate
    def get_segment(e, window):
        return audio[e-windows:e]
    audio_segments = Parallel(n_jobs=4)(delayed(get_segment)(e, windows) for e in np.arange(windows, seconds*1000, increment_mills))
    return audio_segments

def export_audio(segment: AudioSegment, path: str):
    return segment.export(f'{path}.wav', format="wav")

def chunk_selected_audios(audios: List[str], output_path: str) -> None:
    Path(output_path).mkdir(parents=True, exist_ok=True)
    for file in audios:
        audio = AudioSegment.from_wav(file)
        for ix, segment in enumerate(chunk_audio(audio)):
            path = os.path.join(output_path, f"{Path(file).stem}_{ix}.wav")
            export_audio(segment, path)

In [42]:
import pandas as pd

test = pd.read_csv('data/panda/dataset/test.csv', index_col=0)

for group_name, df_group in test.groupby('label'):
    output_path = os.path.join('chunks', group_name)
    audios = df_group.path.values
    chunk_selected_audios(audios, output_path)
    break



14346.99s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
14347.00s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
14347.01s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
14347.02s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [30]:
#| eval: false

from typing import List
from collections import Counter
from transformers import AutoFeatureExtractor
from transformers import Wav2Vec2ForXVector, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
import pandas as pd
from pathlib import Path
import torch
from numpy.typing import NDArray
from pydantic import BaseModel
from io import BytesIO

class AudioArray(BaseModel):
    array: List[float]

class XvectorInput(BaseModel):
    model_input: List[List[float]]

class XvectorModel(object):

    def __init__(self, model_checkpoint: str) -> None:
        self.model_checkpoint = model_checkpoint
        self.model = Wav2Vec2ForXVector.from_pretrained(self.model_checkpoint)
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_checkpoint)
    
    def preprocess_function(self, audio_arrays: List[AudioArray], max_duration: float = 1.0)\
         -> List[List[float]]:
        arrays = [a.array for a in audio_arrays]
        inputs = self.feature_extractor(
            arrays,
            sampling_rate=self.feature_extractor.sampling_rate, 
            max_length=int(self.feature_extractor.sampling_rate * max_duration),
            truncation=True
            )
        return inputs
    
    def parse_audio(self, raw_data: bytes, sample_width: int, channels: int, frame_rate: int) -> List[AudioArray]:
        audio = AudioSegment.from_raw(
            BytesIO(raw_data), 
            sample_width=sample_width,
            channels=channels,
            frame_rate=frame_rate
        )
        result = []
        for ix, segment in enumerate(chunk_audio(audio)):
            result.append(segment)
        return result

    def encode_data(self, audio_arrays: List[AudioArray]) -> XvectorInput:
        result = {'input_values': self.preprocess_function(audio_arrays)}
        return XvectorInput.parse_obj(result)
    
    def get_logits(self, inputs: XvectorInput):
        with torch.no_grad():
            result = self.model(**inputs.dict())
        return result
    
    def get_predicted_labels(self, logits):
        proj = self.model.objective._parameters['weight'].cpu().detach().numpy()
        return np.argmax(np.dot(logits, proj), axis=1)

model_checkpoint = 'data/panda/wav2vec2-base-finetuned-xvector/best_checkpoint/'
# xvector_model = XvectorModel(model_checkpoint)

file = '/home/jovyan/.cache/panda/audios/1655690608-SIP-A90CCE12F2CF-000041c0-chunk4.wav_3.wav'
with open(file, "rb") as f:
    audio_bytes = bytearray()
    while (byte := f.read(1)):
        audio_bytes.append(byte)
    xvector_model.parse_audio(audio_bytes, 2, 1, 16000)
    

SyntaxError: invalid syntax (3449389385.py, line 69)

In [None]:
Union[Dict]

In [44]:
df_group.shape

(336, 3)

In [29]:
#| eval: false

audios_path = f'{Path.home()}/.cache/panda/audios'
audios = glob(f'{audios_path}/*.wav')
chunk_selected_audios(audios[:5], 'test/')

'sadadsa/sdad/'