In [1]:
import torch
from IPython.display import Audio

from src.data.lid import infer_lid_distribution


# waveform: torch.Tensor (1D or 2D), sample_rate: int
def play_audio(waveform, sample_rate):
    """Return an inline IPython audio player for ``waveform``.

    Args:
        waveform: ``torch.Tensor`` of audio samples (1D or 2D).
        sample_rate: Sampling rate in Hz, forwarded to ``Audio`` as ``rate``.

    Returns:
        An ``IPython.display.Audio`` object; when it is the last expression
        of a cell the notebook renders it as a player widget.
    """
    # A bare ``.numpy()`` raises for tensors that require grad or live on a
    # non-CPU device. detach() + cpu() are no-ops for plain CPU tensors, so
    # existing callers see the same result.
    samples = waveform.detach().cpu().numpy()
    return Audio(samples, rate=sample_rate)



In [2]:
import os
import platform
import subprocess

def open_file_with_default_app(file_path):
    """Open ``file_path`` with the operating system's default application.

    Windows uses ``os.startfile``; macOS and Linux run the ``open`` and
    ``xdg-open`` commands respectively.

    Raises:
        OSError: if ``platform.system()`` reports any other platform.
    """
    os_name = platform.system()

    # Windows has a dedicated API call instead of a launcher command.
    if os_name == 'Windows':
        os.startfile(file_path)
        return

    # Launcher command for each remaining supported platform.
    launchers = {
        'Darwin': 'open',      # macOS
        'Linux': 'xdg-open',
    }
    launcher = launchers.get(os_name)
    if launcher is None:
        raise OSError(f"Unsupported operating system: {os_name}")

    subprocess.run([launcher, file_path])

# VoxPopuli


In [3]:
import os, glob, random, torchaudio

# Root folder scanned for VoxPopuli .ogg recordings. Set the VOXPOPULI_ROOT
# environment variable to use a different mount without editing this cell;
# the original hard-coded location remains the default.
root_path = os.environ.get("VOXPOPULI_ROOT", "/Volumes/LKieuData/VoxPopuli/unlabelled_data/en/")
paths = glob.glob(os.path.join(root_path, '**', '*.ogg'), recursive=True)
# Fail here with a readable message (e.g. drive not mounted) instead of
# letting a later sampling step fail on an empty list.
assert paths, f"No .ogg files found under {root_path!r}"

In [4]:
# Pick one recording at random, report which one it is, and hand it to the
# operating system's default application for a manual listen.
sampled_file = random.choice(paths)
print("Sampled:", sampled_file)
open_file_with_default_app(sampled_file)

Sampled: /Volumes/LKieuData/VoxPopuli/raw_audios/en/2013/20131125-1600-COMMITTEE-FEMM_en.ogg


In [3]:
from src.data.vad import find_speech_and_trim, find_overlapped_and_trim
import torchaudio
# Load the test clip, pass it through find_speech_and_trim, and listen to
# the result. The loaded signal keeps its own name so the trimmed version
# does not overwrite it.
clip_path = "testVADandLID.m4a"
loaded_waveform, sr = torchaudio.load(clip_path)
waveform = find_speech_and_trim(loaded_waveform, sr)
play_audio(waveform, sr)

# AudioSet

In [29]:
import json
import os
import glob
import random
import torchaudio

# Locations on the external drive: processed clips, the extracted reference
# clips, and the JSON stats file stored alongside the processed clips.
# os.path.join avoids the doubled "//" that string concatenation produced
# (drive already ends with "/").
drive = "/media/larry/55b84e27-f8a5-4823-9b85-197fc1c6075f/"
processed = os.path.join(drive, 'AudioSet', '100processed4')
ref = os.path.join(drive, 'AudioSet', 'extracted')

# Keep the path and the parsed content under separate names; `stat` is only
# ever the loaded dictionary.
stat_path = os.path.join(processed, 'stat.json')
with open(stat_path, "r") as f:
    stat = json.load(f)

# Every processed .flac file (searched recursively) and its bare file name.
paths = glob.glob(os.path.join(processed, '**', '*.flac'), recursive=True)
processed_names = [os.path.basename(path) for path in paths]


In [40]:
# Draw one clip name at random from the entries stored under 'not_english'
# in the stats file.
sampled_file = random.choice(stat['not_english'])
# sampled_path = os.path.join(processed, sampled_file)
# Location of that clip inside the reference (extracted) directory.
ref_path = os.path.join(ref, sampled_file)
print("Sampled:", sampled_file)


Sampled: 21toNK8Mv3w.flac


In [55]:
# Build the processed-clip path in this cell: no live assignment of
# `sampled_path` exists elsewhere in the notebook, so on a fresh kernel the
# load below would raise NameError.
# NOTE(review): `sampled_file` is drawn from stat['not_english'] — confirm
# such clips are actually present in the processed directory.
sampled_path = os.path.join(processed, sampled_file)
sampled_wave, ssr = torchaudio.load(sampled_path)
play_audio(sampled_wave, ssr)

In [41]:
# Load the sampled clip from the reference directory and play it inline.
ref_wave, rsr = torchaudio.load(ref_path)
play_audio(ref_wave, rsr)

# LID Test

In [44]:
def resample(waveform, sr, target_sr):
    """Resample ``waveform`` from sampling rate ``sr`` to ``target_sr``.

    Builds a ``torchaudio.transforms.Resample`` transform for the given pair
    of rates and applies it once to ``waveform``.

    Returns:
        The output of the transform applied to ``waveform``.
    """
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
    return resampler(waveform)

In [82]:
import torchaudio

# Alternative input kept for reference: a processed AudioSet clip.
# test_flac = '/Users/lkieu/Desktop/Audioset/processed/-5-vmt2iKT0.flac'
test_flac = 'testVADandLID.m4a'
waveform, sr = torchaudio.load(test_flac)
# Bring the clip to 16 kHz; the following cells pass 16000 as the sample rate.
# Note that `sr` still holds the file's original rate after this line.
waveform = resample(waveform, sr, 16_000)
# Optional: drop the first 3 seconds (16000 samples per second * 3).
# waveform = waveform[:, 16000 * 3:]

In [56]:
from src.data.vad import find_speech_and_trim
# Run the 16 kHz clip through find_speech_and_trim.
# NOTE(review): this replaces `waveform` with the function's output, so
# re-running the cell feeds the already-processed signal back in; re-run the
# loading cell first when a clean input is needed.
waveform = find_speech_and_trim(waveform, 16000)

In [57]:
# Listen to the current `waveform` at 16 kHz.
play_audio(waveform, 16000)

In [68]:
# First 34000 samples along the second axis (2.125 s at 16 kHz).
# Presumably the Vietnamese part of the clip — TODO confirm the split point.
vie = waveform[:, :34000]
play_audio(vie, 16000)

In [71]:
import torch
# Everything from sample 34000 onward; presumably the English part — TODO confirm.
eng = waveform[:, 34000:]
# Concatenate along the last axis with the segments swapped: `eng` first, then `vie`.
# NOTE(review): the name `reversed` shadows Python's built-in reversed() for
# the rest of the kernel session.
reversed = torch.cat((eng, vie), dim=-1)
play_audio(reversed, 16000)

In [75]:
from src.data.lid import infer_lid_distribution
# Original order (vie then eng): print the value stored under 'eng' in the LID output.
print(infer_lid_distribution(waveform.squeeze(), 16000)['eng'])
# Swapped order (eng then vie): same lookup on the reordered signal.
print(infer_lid_distribution(reversed.squeeze(), 16000)['eng'])

0.03704975172877312
0.4961805045604706


In [87]:
from src.data.lid import trim_non_target_language
# Call trim_non_target_language on the squeezed 16 kHz signal with 'eng'
# (presumably the target language) plus window_sec=2 and chunk_threshold=0.3.
# NOTE(review): the exact meaning of window_sec / chunk_threshold is defined
# in src.data.lid — confirm there before tuning.
lang_trimmed = trim_non_target_language(waveform.squeeze(), 16000, 'eng', window_sec=2, chunk_threshold=0.3)

In [88]:
# Play the first element of the value returned by trim_non_target_language.
play_audio(lang_trimmed[0], 16000)