In [1]:
import torch, torchaudio

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cuda = torch.cuda.is_available()

print(device)
print(cuda)

cuda
True


# Extract Embeddings

In [3]:
import torch, torchaudio

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cuda = torch.cuda.is_available()


In [5]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model


In [6]:
model_name = "facebook/wav2vec2-xls-r-300m"
model = Wav2Vec2Model.from_pretrained(model_name)

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2Model: ['project_q.weight', 'quantizer.weight_proj.weight', 'quantizer.codevectors', 'project_q.bias', 'project_hid.bias', 'project_hid.weight', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
model = model.to(device)

In [15]:
filepath = "/home/fred/Projetos/DATASETS/MOS/BRSPEECH_MOS_DATASET/data/ground_truth/dataset_oficial_copel-0592.wav"

audio_data, sr = torchaudio.load(filepath)
audio_data = audio_data.to(device)

In [16]:
output = model(audio_data).last_hidden_state

In [17]:
output.shape

torch.Size([1, 392, 1024])

In [8]:
from tqdm import tqdm
import os

metadata_filepath = '/home/fred/Projetos/DATASETS/MOS/VCC2018_MOS_preprocessed/mos_list.txt'
wavs_filepath = '/home/fred/Projetos/DATASETS/MOS/VCC2018_MOS_preprocessed/wav'
with open(metadata_filepath, encoding="utf-8") as f:
  content_file = f.readlines()

output_dir = "/home/fred/Projetos/MOS/custom_mos_prediction/wav2vec_raw_embeddings/wav2vec2-xls-r/VCC2018_embeddings_wav2vec2-xls-r-300m"
os.makedirs(output_dir, exist_ok=True)

for line in tqdm(content_file):
    #filepath, mos, condition, database = line.split(',')
    filepath, mos = line.split(',')
    filename = os.path.basename(filepath)
    filepath = os.path.join(wavs_filepath, filepath)
    if not os.path.exists(filepath):
      continue
    audio_data, sr = torchaudio.load(filepath)    
    audio_data = audio_data.to(device)

    #audio_data = processor(audio_data, return_tensors="pt", padding=True, sampling_rate=16000).input_values

    # Extract Embedding    
    file_embedding = model(audio_data).last_hidden_state
    
    # Saving embedding
    output_filename = filename.split(".")[0] + ".pt"
    output_filepath = os.path.join(output_dir, output_filename)

    torch.save(file_embedding, output_filepath)  

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20580/20580 [10:59<00:00, 31.21it/s]


In [14]:
model = model.to(device)

# Extract Embeddings from BRSpeech

In [16]:
metadata_filepath = '/home/fred/Projetos/DATASETS/MOS/BRSPEECH_MOS_DATASET/metadata.csv'
wavs_filepath = "/home/fred/Projetos/DATASETS/MOS/BRSPEECH_MOS_DATASET/"
with open(metadata_filepath, encoding="utf-8") as f:
  content_file = f.readlines()

output_dir = "./brspeech_mos_wav2vec_embeddings"

for line in tqdm(content_file):
    filepath, score, condition, database = line.split(",")
    filename = os.path.basename(filepath)
    complete_filepath = os.path.join(wavs_filepath, filepath)
    if not os.path.exists(complete_filepath):
      continue
    audio_data, sr = torchaudio.load(complete_filepath)
    audio_data = audio_data.to(device)
    # Extract Embedding    
    file_embedding = model(audio_data)#['last_hidden_state']
    
    # Saving embedding
    output_filename = filename.split(".")[0] + ".pt"
    output_filepath = os.path.join(output_dir, os.path.dirname(filepath), output_filename)

    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    torch.save(file_embedding, output_filepath)  

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2428/2428 [00:23<00:00, 104.29it/s]


# DataLoader Tests

In [17]:
from torch.utils.data import Dataset, DataLoader

class EmbeddingsDataset(Dataset):
    def __init__(self, filepaths: list, scores: list):
        self.filepaths = filepaths
        self.scores = scores

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        filename = self.filepaths[idx]
        embedding = torch.load(filename).transpose(2,1)
        embedding = embedding.mean(axis=-1)
        score = self.scores[idx]

        return {"data": embedding, "score": score}

In [33]:
import torch
torch.cuda.is_available()

True

In [38]:
import torch
def embedding_collate_fn(data):
    """
       data: is a list of tuples with (example, label, length)
             where 'example' is a tensor of arbitrary shape
             and label/length are scalars
    """
    features = [torch.tensor(d['data']) for d in data] #(3)
    scores = torch.tensor([d['score']  for d in data])
    new_features = pad_sequence([f.T for f in features], batch_first=True).squeeze()

    return  {
        'data': new_features,
        'score': scores
    }

In [53]:
import pandas as pd
from os.path import join

class EmbeddingsDataloader(DataLoader):
    def __init__(self, data_dir, metadata_file, emb_dir, batch_size, shuffle=False, validation_split=0.1, num_workers=1, training=True):
        self.batch_size = batch_size
        self.shuffle = shuffle
        train_data = pd.read_csv(join(data_dir, metadata_file))
        train_data['score'] = train_data['score'] / train_data['score'].max()
        scores = train_data['score'].to_list()
        train_data['filepath'] = str(data_dir + "/" + emb_dir + "/") + train_data['filepath'] + ".pt"
        filepaths = train_data['filepath'].to_list()

        self.dataset = EmbeddingsDataset(filepaths, scores)
        #super().__init__(dataset=self.dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=embedding_collate_fn)
        super().__init__(dataset=self.dataset, batch_size=batch_size)


In [40]:
data_dir="/home/fred/Projetos/MOS/custom_mos_prediction/wav2vec_raw_embeddings/wav2vec2-xls-r/VCC2018_embeddings_wav2vec2-xls-r-300m/"
metadata_file="train.csv"
emb_dir="data"
batch_size=10

In [43]:
train_data = pd.read_csv(join(data_dir, metadata_file))
train_data['score'] = train_data['score'] / train_data['score'].max()
scores = train_data['score'].to_list()
train_data['filepath'] = str(data_dir + "/" + emb_dir + "/") + train_data['filepath'] + ".pt"
filepaths = train_data['filepath'].to_list()

In [45]:
len(filepaths)

18522

In [46]:
len(scores)

18522

In [47]:
train_dataset = EmbeddingsDataset(filepaths, scores)

In [48]:
train_dataloader = DataLoader(train_dataset, batch_size)

In [52]:
for data in train_dataloader:
    print(data["data"])
    print(data["score"])
    break

tensor([[[ 5.2956e-02,  2.0203e-02,  6.1016e-02,  ..., -8.9338e-02,
           5.8017e-02,  5.1611e-01]],

        [[ 7.9277e-02,  2.8913e-02,  5.9120e-02,  ..., -1.0158e-01,
           4.0887e-02,  5.4508e-01]],

        [[ 7.2826e-02,  1.1293e-04,  1.0283e-01,  ..., -9.9419e-02,
           3.6471e-02,  5.5208e-01]],

        ...,

        [[ 9.2709e-02,  3.7461e-02,  6.8724e-02,  ..., -8.0536e-02,
           6.9790e-02,  5.2259e-01]],

        [[-6.1411e-03, -3.0505e-02,  5.4099e-02,  ..., -8.5849e-02,
           4.5544e-02,  5.1269e-01]],

        [[ 4.7661e-02,  3.9256e-03,  9.4667e-02,  ..., -1.0732e-01,
           5.7054e-02,  5.6737e-01]]], device='cuda:0',
       grad_fn=<StackBackward0>)
tensor([0.1000, 0.7000, 0.1500, 0.4500, 0.6500, 0.4500, 0.3500, 0.3500, 0.9500,
        0.6500], dtype=torch.float64)


In [54]:
train_dataloader = EmbeddingsDataloader(data_dir, metadata_file, emb_dir, batch_size)

In [55]:
for data in train_dataloader:
    print(data)
    break

{'data': tensor([[[ 5.2956e-02,  2.0203e-02,  6.1016e-02,  ..., -8.9338e-02,
           5.8017e-02,  5.1611e-01]],

        [[ 7.9277e-02,  2.8913e-02,  5.9120e-02,  ..., -1.0158e-01,
           4.0887e-02,  5.4508e-01]],

        [[ 7.2826e-02,  1.1293e-04,  1.0283e-01,  ..., -9.9419e-02,
           3.6471e-02,  5.5208e-01]],

        ...,

        [[ 9.2709e-02,  3.7461e-02,  6.8724e-02,  ..., -8.0536e-02,
           6.9790e-02,  5.2259e-01]],

        [[-6.1411e-03, -3.0505e-02,  5.4099e-02,  ..., -8.5849e-02,
           4.5544e-02,  5.1269e-01]],

        [[ 4.7661e-02,  3.9256e-03,  9.4667e-02,  ..., -1.0732e-01,
           5.7054e-02,  5.6737e-01]]], device='cuda:0',
       grad_fn=<StackBackward0>), 'score': tensor([0.1000, 0.7000, 0.1500, 0.4500, 0.6500, 0.4500, 0.3500, 0.3500, 0.9500,
        0.6500], dtype=torch.float64)}


In [1]:
from utils.config_parser import ConfigParser

In [2]:
from data_loader import data_loaders as module_data

In [3]:
from utils.util import read_json

In [11]:
config_data = read_json("./config.json")

In [12]:
config = ConfigParser(config_data)

In [13]:
config["name"]

'DenseModel_with_VCC2018_wav2vec2-xls-r-300m'

In [14]:
data_loader = config.init_obj('data_loader', module_data)

In [16]:
for data in iter(data_loader):
    print(data)
    break

  features = [torch.tensor(d['data']) for d in data] #(3)
  features = [torch.tensor(d['data']) for d in data] #(3)


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.