In [1]:
import numpy as np 
import torch
import torchaudio
from hw_tts.model.baseline_model import BaselineModel
from hw_tts.loss.SDRLoss import SpExPlussLoss
from pathlib import Path
from torch.utils.data import ConcatDataset, DataLoader

from torch.optim import Adam
from hw_tts.datasets import CustomDirAudioDataset

SEED = 123
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)

kenlm python bindings are not installed. Most likely you want to install it using: pip install https://github.com/kpu/kenlm/archive/master.zip
kenlm python bindings are not installed. Most likely you want to install it using: pip install https://github.com/kpu/kenlm/archive/master.zip


In [2]:
class CustomDirAudioDataset:
    def __init__(self, dir, *args, **kwargs):
        self.speaker_lst = []
        data = []
        for audio_file in Path(dir).rglob("*-target.[mwflac4]*"):
            base_key = audio_file.stem[:-len("-target")]
            suffix = audio_file.suffix
            mixed_file = audio_file.parent / f"{base_key}-mixed{suffix}"
            ref_file = audio_file.parent / f"{base_key}-ref{suffix}"
            if mixed_file.exists() and ref_file.exists():
                speaker_target, speaker_noise, *_ = base_key.split("_")
                data.append({
                    "speaker_target": speaker_target,
                    "speaker_noise": speaker_noise,
                    "target_path": str(audio_file),
                    "mix_path": str(mixed_file),
                    "reference_path": str(ref_file)
                })
        self.data = data
        self._index = self.data

    def __getitem__(self, ind):
        data_dict = self._index[ind]
        target_audio_wave, _ = torchaudio.load(data_dict["target_path"])
        mix_audio_wave, _ = torchaudio.load(data_dict["mix_path"])
        reference_audio_wave, _ = torchaudio.load(data_dict["reference_path"])
        if data_dict['speaker_target'] not in self.speaker_lst:
            self.speaker_lst.append(data_dict['speaker_target'])
        speaker_target = self.speaker_lst.index(data_dict['speaker_target'])
        return {
            "speaker_target": speaker_target,
            "target_audio": target_audio_wave,
            "reference_audio": reference_audio_wave,
            "mix_audio": mix_audio_wave
        }
    
    def __len__(self):
        return len(self._index)

In [5]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(dataset_items: list[dict]) -> dict:
    result_batch = dict()
    for key in dataset_items[0].keys():
        match key:
            case "target_audio" | "mix_audio" | "reference_audio":
                result_batch[f"{key}_len"] = torch.tensor(
                    data=[item.get(key).shape[-1] for item in dataset_items]
                ).cuda()
                result_batch[key] = pad_sequence(
                    sequences=[
                        torch.squeeze(item.get(key), dim=0).t()
                        for item in dataset_items
                    ],
                    batch_first=True,
                ).cuda()
            case _:
                result_batch[key] = [item.get(key) for item in dataset_items]
    result_batch['speaker_target'] = torch.tensor(result_batch['speaker_target']).cuda()
    return result_batch

In [6]:
dataset = CustomDirAudioDataset('/home/vladimir/PycharmProjects/TTS/temp_datasets/train_easy')

In [7]:
dataloader = DataLoader(dataset, 4, collate_fn=collate_fn)

In [8]:
model = BaselineModel(num_spks=4).cuda()
criterion = SpExPlussLoss()
optimizer = Adam(model.parameters(), lr=0.01)


In [9]:
model = torch.load('model_best.pt')

In [8]:
optimizer = Adam(model.parameters(), lr=0.001)
criterion = SpExPlussLoss()

In [15]:
from tqdm.auto import tqdm
pbar = tqdm(total = 100 * len(dataloader))
for epoch in range(1):
    loss = 0 
    for batch in dataloader: 
        optimizer.zero_grad()
        update = model(**batch)
        batch.update(update)
        res = criterion(**batch)
        res['loss'].backward()
        optimizer.step()
        pbar.update(1)
        loss += res['loss'].detach().item()
    pbar.set_description(desc=f"Loss: {loss}")
    torch.save(model, 'best_model.pt')

  0%|          | 0/18100 [00:00<?, ?it/s]

In [16]:
batchez = dataloader.__iter__()
batch = batchez.__next__()

In [17]:
batch = batchez.__next__()

In [18]:
batch.update(model(**batch))

In [26]:
import IPython

IPython.display.Audio(batch['ests'][2].detach().cpu(), rate=16_000)

In [27]:
IPython.display.Audio(batch['target_audio'][2].detach().cpu(), rate=16_000)

In [29]:
IPython.display.Audio(batch['mix_audio'][2].detach().cpu(), rate=16_000)

In [59]:
from encoder.inference import preprocess_wav
audio_input, sample_rate = sf.read("/home/vladimir/what_are_you_doint.wav")

In [57]:
import torch
from encoder.model import SpeakerEncoder
embedder = SpeakerEncoder(torch.device("cpu"),  torch.device("cpu"))
embedder.load_state_dict(torch.load('/home/vladimir/PycharmProjects/TTS/pretrained/encoder/saved_models/pretrained.pt')['model_state'], strict=False)

<All keys matched successfully>

In [67]:
embedder.forward(preprocess_wav("/home/vladimir/what_are_you_doint.wav", source_sr=16_000))

TypeError: resample() takes 1 positional argument but 3 were given

In [38]:
from speechbrain.pretrained import WaveformEncoder

ssl_model = WaveformEncoder.from_hparams(source="speechbrain/ssl-wav2vec2-base-librispeech", savedir="speechbrain/ssl-wav2vec2-base-librispeech")


FileNotFoundError: [Errno 2] No such file or directory: '/home/vladimir/PycharmProjects/TTS/speechbrain/ssl-wav2vec2-base-librispeech/hyperparams.yaml'

In [46]:
from transformers import Wav2Vec2Config, Wav2Vec2Model

# Initializing a Wav2Vec2 facebook/wav2vec2-base-960h style configuration

configuration = Wav2Vec2Config()

# Initializing a model (with random weights) from the facebook/wav2vec2-base-960h style configuration

model = Wav2Vec2Model(configuration)

# Accessing the model configuration

configuration = model.config

In [48]:
model(torch.randn(16, 32))

RuntimeError: Calculated padded input size per channel: (2). Kernel size: (3). Kernel size can't be greater than actual input size

In [52]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf

# Загрузите процессор и модель
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Загрузите ваш аудиофайл
audio_input, sample_rate = sf.read("/home/vladimir/what_are_you_doint.wav")

# Обработайте аудиосигнал и извлеките эмбеддинги
input_values = processor(audio_input, return_tensors="pt").input_values
hidden_states = model(input_values).last_hidden_state

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RuntimeError: Calculated padded input size per channel: (2). Kernel size: (10). Kernel size can't be greater than actual input size