In [13]:
# Get audio files

import os
import requests
import tarfile

url = "https://huggingface.co/datasets/MLCommons/peoples_speech/resolve/main/train/clean/clean_000000.tar"

# The archive is stored and unpacked in a "data" folder located in the parent
# of the current working directory.
download_folder = os.path.join(os.path.dirname(os.getcwd()), "data")
os.makedirs(download_folder, exist_ok=True)
tar_path = os.path.join(download_folder, "clean_000000.tar")

# Step 1: download. The body is streamed to disk in chunks so the whole archive
# is never held in memory, and network failures are reported instead of raising.
downloaded = False
try:
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code == 200:
            with open(tar_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            downloaded = True
            print(f"Downloaded tar file to {tar_path}")
        else:
            print(f"Failed to download file: {response.status_code}")
except requests.RequestException as e:
    print(f"Failed to download file: {e}")

# Step 2: extract, but only when the download completed; otherwise there is
# either no archive or a truncated one.
if downloaded:
    try:
        with tarfile.open(tar_path, "r") as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: where the interpreter
                # supports extraction filters, refuse members that would
                # escape download_folder.
                tar.extractall(path=download_folder, filter="data")
            else:
                tar.extractall(path=download_folder)
        print(f"Extracted tar file to {download_folder}")
    except (tarfile.TarError, OSError) as e:
        # OSError covers filesystem problems (missing file, permissions, disk full).
        print(f"Error extracting tar file: {e}")

# Step 3: clean up the archive (also removes a partial file left by a failed download).
if os.path.exists(tar_path):
    try:
        os.remove(tar_path)
        print(f"Removed tar file: {tar_path}")
    except OSError as e:
        print(f"Error removing tar file: {e}")

Downloaded tar file to c:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\data\clean_000000.tar
Extracted tar file to c:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\data
Removed tar file: c:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\data\clean_000000.tar


In [14]:
from datasets import load_dataset
import os

# Load the JSON manifest (clean.json) through the `datasets` JSON loader.
dataset = load_dataset("json", data_files="clean.json")

# Flat input lists, filled from every entry under 'training_data'.
# NOTE(review): assumes each entry's 'label' and 'name' values are iterables
# of items (they are appended element by element) — confirm against clean.json.
textInput = []
audioPathInput = []

for data in dataset['train']['training_data']:
    textInput += data['label']
    audioPathInput += data['name']

# Debug switch: when True, dump both lists one item per line.
printInputArrays = False

if printInputArrays:
    for text in textInput:
        print(text)

    for audio_path in audioPathInput:
        print(audio_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [16]:
from transformers import VitsModel, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from scipy.io.wavfile import read
import soundfile as sf
from tqdm import tqdm

class CustomDataset(Dataset):
    """Pairs transcript texts with audio files for text-to-speech training.

    Args:
        texts: sequence of transcript strings.
        audio_dir: directory that audio file names are resolved against.
        tokenizer: callable used as ``tokenizer(text, return_tensors="pt")``;
            its result must expose ``input_ids``.
        sampling_rate: sample rate every audio file is required to have.
        audio_files: optional sequence of file names (or paths) aligned
            one-to-one with ``texts``. When omitted, the sorted listing of
            ``audio_dir`` is used, which only pairs correctly if that listing
            happens to line up with ``texts``; pass the manifest's file names
            here to make the pairing explicit.

    Raises:
        AssertionError: if the number of texts and audio files differ.
    """

    def __init__(self, texts, audio_dir, tokenizer, sampling_rate=16000, audio_files=None):
        self.texts = texts
        self.audio_dir = audio_dir
        if audio_files is None:
            # Fallback: every entry of the directory, in sorted name order.
            self.audio_files = sorted(os.listdir(audio_dir))
        else:
            self.audio_files = list(audio_files)
        self.tokenizer = tokenizer
        self.sampling_rate = sampling_rate

        # Ensure the number of texts matches the number of audio files
        assert len(self.texts) == len(self.audio_files), f"Mismatch between number of texts ({len(self.texts)}) and audio files ({len(self.audio_files)})"

    def __len__(self):
        """Number of (text, audio) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, audio)`` for sample ``idx``.

        ``input_ids`` is the tokenizer output with its leading batch dimension
        removed; ``audio`` is the decoded file as a float32 tensor.

        Raises:
            ValueError: if the file's sample rate differs from ``sampling_rate``.
        """
        text = self.texts[idx]
        audio_path = os.path.join(self.audio_dir, self.audio_files[idx])

        # Read the audio; no resampling is performed, the rate is only validated.
        audio, sample_rate = sf.read(audio_path)
        if sample_rate != self.sampling_rate:
            raise ValueError(f"Sample rate mismatch: {sample_rate} != {self.sampling_rate}")

        inputs = self.tokenizer(text, return_tensors="pt")

        # squeeze(0) drops only the batch dimension, so a single-token text
        # still yields a 1-D tensor instead of collapsing to a scalar.
        return inputs.input_ids.squeeze(0), torch.as_tensor(audio, dtype=torch.float32)

def train_model(model, tokenizer, dataset, device, epochs=10, batch_size=4):
    """Train ``model`` with an MSE loss between its waveform output and recorded audio.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``waveform`` attribute.
        tokenizer: only consulted for ``pad_token_id`` (0 is used if absent).
        dataset: map-style dataset yielding ``(input_ids, audio)`` tensor pairs.
        device: device the batches are moved to; ``model`` is expected to be
            on it already.
        epochs: number of passes over ``dataset``.
        batch_size: samples per batch.

    Prints the mean batch loss after every epoch; returns nothing.
    """
    model.train()

    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    def collate(batch):
        # Samples differ in length, so the default collate (which stacks
        # tensors) cannot batch them; pad to the longest item instead.
        token_seqs = [ids.reshape(-1) for ids, _ in batch]  # tolerate 0-d ids
        audio_seqs = [audio for _, audio in batch]
        lengths = torch.tensor([seq.numel() for seq in token_seqs])
        input_ids = torch.nn.utils.rnn.pad_sequence(
            token_seqs, batch_first=True, padding_value=pad_id
        )
        # 1 for real tokens, 0 for padding.
        positions = torch.arange(input_ids.shape[1])
        attention_mask = (positions[None, :] < lengths[:, None]).long()
        # Shorter recordings are padded with zeros (silence).
        # NOTE(review): assumes each audio tensor is 1-D (mono) — confirm.
        audio = torch.nn.utils.rnn.pad_sequence(
            audio_seqs, batch_first=True, padding_value=0.0
        )
        return input_ids, attention_mask, audio

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = torch.nn.MSELoss()

    for epoch in range(epochs):
        total_loss = 0.0
        for text_inputs, attention_mask, audio_targets in dataloader:
            text_inputs = text_inputs.to(device)
            attention_mask = attention_mask.to(device)
            audio_targets = audio_targets.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=text_inputs, attention_mask=attention_mask).waveform
            # Keep the batch dimension even for a final batch of one sample.
            outputs = outputs.reshape(audio_targets.shape[0], -1)
            # Generated and recorded audio generally differ in length; compare
            # the overlapping prefix only. This is a crude alignment, not a
            # time-warped one.
            common = min(outputs.shape[-1], audio_targets.shape[-1])
            loss = loss_fn(outputs[:, :common], audio_targets[:, :common])
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")

# Transcripts collected from the manifest in the earlier cell
texts = textInput

# Audio directory: the same "<parent of cwd>/data" folder the download cell
# extracts into, computed rather than hard-coded to one machine's drive.
audio_dir = os.path.join(os.path.dirname(os.getcwd()), "data")

# Initialize the tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
dataset = CustomDataset(texts, audio_dir, tokenizer)

# Load the model and move it to the GPU when one is available
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Train the model
train_model(model, tokenizer, dataset, device, epochs=10, batch_size=4)


397
61


AssertionError: Mismatch between number of texts (397) and audio files (337)