In [35]:
import fairseq
import torch
import torch.nn as nn


class AvesClassifier(nn.Module):
    def __init__(self, model_path, num_classes, embeddings_dim=768, multi_label=False):

        super().__init__()

        models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_path])
        self.model = models[0]
        self.model.feature_extractor.requires_grad_(False)
        self.head = nn.Linear(in_features=embeddings_dim, out_features=num_classes)

        if multi_label:
            self.loss_func = nn.BCEWithLogitsLoss()
        else:
            self.loss_func = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        out = self.model.extract_features(x)[0]
        out = out.mean(dim=1)             # mean pooling
        logits = self.head(out)

        loss = None
        if y is not None:
            loss = self.loss_func(logits, y)

        return loss, logits


In [36]:
# Initialize an AVES classifier with 10 target classes
model = AvesClassifier(
    model_path='/data0/home/h21/luas6629/aves-base-bio.pt',
    num_classes=10)

# Create a 1-second random sound
waveform = torch.rand((16_000))
x = waveform.unsqueeze(0)
y = torch.tensor([0])

# Run the forward pass
loss, logits = model(x, y)

2023-03-11 17:33:05 | INFO | fairseq.tasks.hubert_pretraining | current directory is /data0/home/h21/luas6629/Thesis
2023-03-11 17:33:05 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/mnt/dev/hubert/data/faav150k/tsv', 'fine_tuning': False, 'labels': ['km'], 'label_dir': '/mnt/dev/hubert/data/faav150k/hblab.c200', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 0, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-03-11 17:33:05 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdr

In [9]:
loss, logits = model(x, y)

In [10]:
loss

tensor(2.2761, grad_fn=<NllLossBackward0>)

In [11]:
logits

tensor([[ 0.3224,  0.1328, -0.6873,  0.5657, -0.6042, -0.0712,  0.9417,  0.7135,
         -0.2378,  0.5802]], grad_fn=<AddmmBackward0>)

In [27]:
import torchaudio

audio, sample_rate = torchaudio.load("../Thesis/data/raw/fruitbat/zip_contents/files102/120614004250711708.WAV")



In [None]:
ls ../Thesis/data/raw/fruitbat/zip_contents/files102/

In [23]:
waveform

tensor([[-3.0518e-05, -3.9673e-04,  1.5259e-04,  ...,  2.1362e-04,
         -3.0518e-05,  4.5776e-04]])

In [24]:
sample_rate

250000

In [25]:
from torchaudio.transforms import Resample

resampler = Resample(orig_freq=sample_rate, new_freq=16000)
waveform = resampler(waveform)

frame_length = 2.56 * 16000  # 2.56 seconds at 16 kHz
frames = waveform.unfold(1, frame_length, frame_length).transpose(0, 1)

# Normalize the waveform
frames = frames / frames.abs().max()

RuntimeError: maximum size for tensor at dimension 1 is 1480 but size is 40960

In [28]:
# Convert the audio waveform to a spectrogram
specgram = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)(audio)




In [29]:
# Resize the spectrogram to have a fixed shape
specgram = torch.nn.functional.interpolate(specgram.unsqueeze(0), size=(128, 128))


In [31]:
# Load the labels for the audio file
labels = torch.tensor([0,2]) # Replace with the actual label


In [47]:
x = audio
y = torch.tensor([0,2])

In [48]:
# Make a prediction using the model
loss, logits = model(x, y)

ValueError: Expected input batch_size (1) to match target batch_size (2).

In [42]:
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import Dataset, DataLoader

# Define a dataset class for the bat vocalization data
class BatVocalizationDataset(Dataset):
    def __init__(self, audio_files, labels):
        self.audio_files = audio_files
        self.labels = labels

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        audio, sample_rate = torchaudio.load(self.audio_files[idx])
        specgram = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)(audio)
        specgram = torch.nn.functional.interpolate(specgram.unsqueeze(0), size=(128, 128))
        label = self.labels[idx]
        return specgram, label

In [45]:
# Define a modified version of the HuBERT model
class ModifiedHuBERT(nn.Module):
    def __init__(self, num_classes):
        super(ModifiedHuBERT, self).__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=128, kernel_size=80, stride=4),
            nn.BatchNorm1d(num_features=128),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=4, stride=2),
            nn.BatchNorm1d(num_features=256),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=512, kernel_size=4, stride=2),
            nn.BatchNorm1d(num_features=512),
            nn.ReLU(),
            nn.Conv1d(in_channels=512, out_channels=1024, kernel_size=4, stride=2),
            nn.BatchNorm1d(num_features=1024),
            nn.ReLU(),
            nn.Conv1d(in_channels=1024, out_channels=2048, kernel_size=4, stride=2),
            nn.BatchNorm1d(num_features=2048),
            nn.ReLU(),
            nn.Conv1d(in_channels=2048, out_channels=4096, kernel_size=4, stride=2),
            nn.BatchNorm1d(num_features=4096),
            nn.ReLU(),
        )
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(in_features=4096, out_features=2048)
        self.fc2 = nn.Linear(in_features=2048, out_features=num_classes)

    def forward(self, x):
        x = self.feature_extractor(x)
        x = self.pooling(x)
        x = x.view(-1, 4096)
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        return x

# Load the pre-trained HuBERT model
hubert = torch.hub.load('pytorch/fairseq', 'hubert_base')

# Initialize the modified model with the pre-trained weights
model = ModifiedHuBERT(num_classes=10)
model.feature_extractor.load_state_dict(hubert['model'].state_dict())

# Freeze the pre-trained layers
for param in model.feature_extractor.parameters():
    param.requires_grad = False

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Define the training and testing datasets and data loaders
train_dataset = BatVocalizationDataset(train_audio_files, train_labels)
test_dataset = BatVocalizationDataset(test_audio_files, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Train the model
for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(train_loader)))

# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy: %.2f%%' % (100 * correct / total))

Using cache found in /home/h21/luas6629/.cache/torch/hub/pytorch_fairseq_main


RuntimeError: Cannot find callable hubert_base in hubconf

In [46]:
loss.backward()

In [54]:
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [None]:
train_data = [
    ('audio_files/', [0, 1, 1]),
    ('audio_files/2.wav', [1, 0, 1]),
    ('audio_files/3.wav', [1, 1, 0]),
    ('audio_files/4.wav', [0, 1, 0]),
    ('audio_files/5.wav', [1, 0, 0])
]

In [50]:
train_data =  [
("../Thesis/data/raw/fruitbat/zip_contents/files102/120614004250711708.WAV", [0, 1, 2]),
("../Thesis/data/raw/fruitbat/zip_contents/files102/120620055417558444.WAV", [3, 2, 0]),
("../Thesis/data/raw/fruitbat/zip_contents/files102/120628041134879996.WAV", [4, 1, 0]),
("../Thesis/data/raw/fruitbat/zip_contents/files102/120614004417421711.WAV", [3, 5, 7]),
("../Thesis/data/raw/fruitbat/zip_contents/files102/120620055724268355.WAV", [2, 4, 9])
]

In [52]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

In [59]:
x, y = train_loader

ValueError: not enough values to unpack (expected 2, got 1)

In [55]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [57]:
for epoch in range(2):
    model.train()
    for x, y in train_loader:
        optimizer.zero_grad()
        loss, logits = model(x, y)
        loss.backward()
        optimizer.step()

    # 6. Evaluate the model
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = 0, 0
        for x, y in val_loader:
            loss, logits = model(x, y)
            val_loss += loss.item() * x.shape[0]
            val_acc += (logits.argmax(dim=-1) == y).sum().item()
        val_loss /= len(val_data)
        val_acc /= len(val_data)

    print(f"Epoch {epoch+1}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")


AttributeError: 'tuple' object has no attribute 'unsqueeze'

In [63]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


In [98]:
import torch
import torchaudio
import torch.nn as nn
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import HubertForCTC

# Load the pre-trained Hubert model and tokenizer
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

# Freeze all the model layers except for the last layer
for param in model.parameters():
    param.requires_grad = False
model.lm_head = nn.Linear(1024, 100)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = nn.CTCLoss(blank=processor.tokenizer.pad_token_id)

# Define a function to preprocess the audio files and labels
def preprocess(audio_files, labels):
    input_values = []
    attention_masks = []
    labels_encoded = []
    for i in range(len(audio_files)):
        input, sr = torchaudio.load(audio_files[i])
        
        # resample
        transform = torchaudio.transforms.Resample(sr, 16000)
        input = transform(input)
        sr = 16000
        
        input_values.append(processor(input, sampling_rate=sr, return_tensors="pt").input_values)
        attention_masks.append(processor(input, sampling_rate=sr, return_tensors="pt").attention_mask)
        labels_encoded.append(processor.tokenizer(labels[i], padding=True, truncation=True, return_tensors="pt").input_ids)
    return input_values, attention_masks, labels_encoded

# Define the training loop
def train(model, optimizer, loss_fn, input_values, attention_masks, labels_encoded):
    model.train()
    total_loss = 0
    for i in range(len(input_values)):
        
        # unsqueeze to keep dimensions

        outputs = model(input_values[i].unsqueeze(0), attention_mask=attention_masks[i], labels=labels_encoded[i])
        loss = loss_fn(outputs.logits.permute(1, 0, 2), labels_encoded[i].squeeze(0), input_lengths=outputs.logits.shape[0]*torch.ones(outputs.logits.shape[1], dtype=torch.int32))
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return total_loss


In [87]:
audio_files = [librosa.load(audio_file, sr=250000) for audio_file in audio_files]

In [89]:
signals = [s for s, sr in audio_files]

In [90]:
resampled_signals = [librosa.resample(signal, orig_sr=250000, target_sr=16000) for signal in signals]

In [99]:
# Fine-tune the model on the 5 audio files with variable numbers of labels
audio_files = ['../Thesis/data/raw/fruitbat/zip_contents/files102/120614004250711708.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120620055417558444.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120628041134879996.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120614004417421711.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120620055724268355.WAV']

labels = [["label1", "label2"], ["label1", "label3", "label4"], ["label2", "label4"], ["label1", "label3", "label5"], ["label2"]]
input_values, attention_masks, labels_encoded = preprocess(audio_files, labels)
for epoch in range(10):
    loss = train(model, optimizer, loss_fn, input_values, attention_masks, labels_encoded)
    print("Epoch", epoch+1, "loss:", loss)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 1, 1, 23123]

In [None]:
# Classify the labels of a new audio file
new_audio_file = "new_audio.wav"
new_labels = ["label1", "label5"]
input_values, attention_masks, _ = preprocess([new_audio_file], [new_labels])
outputs = model(input_values[0], attention_mask=attention_masks[0])
predicted_ids = torch.argmax(outputs.logits, dim=-1)
predicted_labels = tokenizer.batch_decode(predicted_ids)[0]
print("Predicted labels:", predicted_labels)






In [80]:
audio_files = [u for u, v in train_data]
labels =  [v for u, v in train_data]

In [81]:
audio_files

['../Thesis/data/raw/fruitbat/zip_contents/files102/120614004250711708.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120620055417558444.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120628041134879996.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120614004417421711.WAV',
 '../Thesis/data/raw/fruitbat/zip_contents/files102/120620055724268355.WAV']

In [67]:
from transformers import HubertModel, HubertConfig

In [None]:
import torch
from transformers import Wav2Vec2Processor, HubertForCTC
from datasets import load_dataset
import soundfile as sf

processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")

def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)

transcription = processor.decode(predicted_ids[0])

# compute loss
target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"

# wrap processor as target processor to encode labels
with processor.as_target_processor():
    labels = processor(target_transcription, return_tensors="pt").input_ids

loss = model(input_values, labels=labels).loss

In [85]:
import librosa
from transformers import Wav2Vec2FeatureExtractor

# Load the WAV file
audio_file = "example.wav"
signal, sample_rate = librosa.load(audio_file, sr=None)

# Initialize the feature extractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=False)

# Resample the audio signal to 16kHz
resampled_signal = librosa.resample(signal, orig_sr=sample_rate, target_sr=16000)

# Extract features from the resampled signal
features = feature_extractor(resampled_signal, sampling_rate=16000, return_tensors="pt")

# Print the shape of the extracted features tensor
print(features["input_values"].shape)


  signal, sample_rate = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'example.wav'