In [1]:
import sys
sys.path.append("..")
import torch
from torch.utils.data import DataLoader
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from settings import ALL_CLASSES

In [4]:
dataset, labels = torch.load("C:/Users/Filip/Desktop/PW/2 semestr/Deep Learning/pro2/MINI_DL_RNN/src/artifacts/speech-waveform-v0/validation.pt")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
dataset.shape

device(type='cuda')

In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast

class SpeechClassifier:
    def __init__(self, model_name='facebook/wav2vec2-base-960h', device=None, lr=0.001):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to(self.device)
        self.loss_function = torch.nn.CrossEntropyLoss()  # Using CrossEntropyLoss
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.scaler = GradScaler()

    def load_data(self, train_path, val_path, batch_size=4):
        train_dataset, train_labels = torch.load(train_path)
        val_dataset, val_labels = torch.load(val_path)
        self.train_loader = DataLoader(TensorDataset(train_dataset, train_labels.squeeze(1)), batch_size=batch_size, shuffle=True)
        self.val_loader = DataLoader(TensorDataset(val_dataset, val_labels.squeeze(1)), batch_size=batch_size, shuffle=False)

    def train_and_validate(self, epochs, accumulation_steps=4):
        for epoch in range(epochs):
            self.model.train()
            total_train_loss = 0
            self.optimizer.zero_grad()
            with tqdm(total=len(self.train_loader), desc=f"Epoch {epoch+1}/{epochs}", unit="batch") as pbar:
                for step, (inputs, labels) in enumerate(self.train_loader):
                    processed = self.processor(inputs.numpy(), return_tensors="pt", sampling_rate=16000)
                    inputs = processed.input_values.to(self.device)
                    labels = labels.to(self.device)

                    with autocast():
                        outputs = self.model(inputs).logits
                        # Taking the mean across the time dimension to collapse the logits to expected format for CrossEntropy
                        outputs = outputs.mean(dim=1)
                        loss = self.loss_function(outputs, labels) / accumulation_steps

                    self.scaler.scale(loss).backward()
                    if (step + 1) % accumulation_steps == 0:
                        self.scaler.step(self.optimizer)
                        self.scaler.update()
                        self.optimizer.zero_grad()

                    total_train_loss += loss.item() * accumulation_steps
                    pbar.update(1)

            print(f"Average Training Loss: {total_train_loss / len(self.train_loader)}")
            self.validate()

    def validate(self):
        self.model.eval()
        total_val_loss = 0
        all_preds, all_labels = [], []
        with tqdm(total=len(self.val_loader), desc="Validating", unit="batch") as pbar:
            for inputs, labels in self.val_loader:
                processed = self.processor(inputs.numpy(), return_tensors="pt", sampling_rate=16000)
                inputs = processed.input_values.to(self.device)
                labels = labels.to(self.device)

                with torch.no_grad(), autocast():
                    outputs = self.model(inputs).logits
                    # Collapsing the logits to match the label format
                    outputs = outputs.mean(dim=1)
                    loss = self.loss_function(outputs, labels)
                    total_val_loss += loss.item()

                    preds = torch.argmax(outputs, dim=1)
                    all_preds.extend(preds.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                pbar.update(1)

        print(f"Validation Loss: {total_val_loss / len(self.val_loader)}")
        self.evaluate_performance(all_labels, all_preds)

    def evaluate_performance(self, labels, preds):
        print("Confusion Matrix:")
        print(confusion_matrix(labels, preds))
        print("\nClassification Report:")
        print(classification_report(labels, preds))

# Example usage:
train_path = "C:/Users/Filip/Desktop/PW/2 semestr/Deep Learning/pro2/MINI_DL_RNN/src/artifacts/speech-waveform-v0/train.pt"
val_path = "C:/Users/Filip/Desktop/PW/2 semestr/Deep Learning/pro2/MINI_DL_RNN/src/artifacts/speech-waveform-v0/validation.pt"


In [12]:
classifier = SpeechClassifier()
classifier.load_data(train_path, val_path)
classifier.train_and_validate(epochs=10)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Average Training Loss: 1.6420255718600114


Validating: 100%|██████████| 1676/1676 [00:29<00:00, 56.01batch/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.5976120395705922
Confusion Matrix:
[[   0    0    0    0    0    0    0    0    0    0    0  261]
 [   0    0    0    0    0    0    0    0    0    0    0  245]
 [   0    0    0    0    0    0    0    0    0    0    0  260]
 [   0    0    0    0    0    0    0    0    0    0    0  264]
 [   0    0    0    0    0    0    0    0    0    0    0  247]
 [   0    0    0    0    0    0    0    0    0    0    0  256]
 [   0    0    0    0    0    0    0    0    0    0    0  257]
 [   0    0    0    0    0    0    0    0    0    0    0  256]
 [   0    0    0    0    0    0    0    0    0    0    0  246]
 [   0    0    0    0    0    0    0    0    0    0    0  260]
 [   0    0    0    0    0    0    0    0    0    0    0   39]
 [   0    0    0    0    0    0    0    0    0    0    0 4113]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       261
           1       0.00      0.00      0.00       245
  

Epoch 2/10: 100%|██████████| 12570/12570 [13:07<00:00, 15.96batch/s]


Average Training Loss: 1.560359749532258


Validating: 100%|██████████| 1676/1676 [00:30<00:00, 54.55batch/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 1.5803056958182615
Confusion Matrix:
[[   0    0    0    0    0    0    0    0    0    0    0  261]
 [   0    0    0    0    0    0    0    0    0    0    0  245]
 [   0    0    0    0    0    0    0    0    0    0    0  260]
 [   0    0    0    0    0    0    0    0    0    0    0  264]
 [   0    0    0    0    0    0    0    0    0    0    0  247]
 [   0    0    0    0    0    0    0    0    0    0    0  256]
 [   0    0    0    0    0    0    0    0    0    0    0  257]
 [   0    0    0    0    0    0    0    0    0    0    0  256]
 [   0    0    0    0    0    0    0    0    0    0    0  246]
 [   0    0    0    0    0    0    0    0    0    0    0  260]
 [   0    0    0    0    0    0    0    0    0    0    0   39]
 [   0    0    0    0    0    0    0    0    0    0    0 4113]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       261
           1       0.00      0.00      0.00       245
  

Epoch 3/10:   4%|▍         | 542/12570 [00:33<12:29, 16.06batch/s]


KeyboardInterrupt: 

In [19]:
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast

class SpeechClassifier:
    def __init__(self, model_name='facebook/wav2vec2-base-960h', device=None, lr=0.001, num_labels=12):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        
        # Load configuration for Wav2Vec2
        config = Wav2Vec2Config.from_pretrained(model_name)
        config.num_labels = num_labels  # Setting the number of classes
        
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name, config=config)
        self.model.to(self.device)
        self.loss_function = torch.nn.CrossEntropyLoss()  # Using CrossEntropyLoss
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.scaler = GradScaler()

    def load_data(self, train_path, val_path, batch_size=4):
        train_dataset, train_labels = torch.load(train_path)
        val_dataset, val_labels = torch.load(val_path)
        self.train_loader = DataLoader(TensorDataset(train_dataset, train_labels.squeeze(1)), batch_size=batch_size, shuffle=True)
        self.val_loader = DataLoader(TensorDataset(val_dataset, val_labels.squeeze(1)), batch_size=batch_size, shuffle=False)

    def train_and_validate(self, epochs, accumulation_steps=4):
        for epoch in range(epochs):
            self.model.train()
            total_train_loss = 0
            train_correct = 0
            total_train = 0
            with tqdm(total=len(self.train_loader), desc=f"Epoch {epoch+1}/{epochs}", unit="batch") as pbar:
                for step, (inputs, labels) in enumerate(self.train_loader):
                    processed = self.processor(inputs.numpy(), return_tensors="pt", sampling_rate=16000)
                    inputs = processed.input_values.to(self.device)
                    labels = labels.to(self.device)

                    with autocast():
                        outputs = self.model(inputs).logits[:, -1, :]
                        loss = self.loss_function(outputs, labels) / accumulation_steps

                    self.scaler.scale(loss).backward()
                    if (step + 1) % accumulation_steps == 0:
                        self.scaler.step(self.optimizer)
                        self.scaler.update()
                        self.optimizer.zero_grad()

                    total_train_loss += loss.item() * accumulation_steps
                    _, preds = torch.max(outputs, 1)
                    train_correct += torch.sum(preds == labels).item()
                    total_train += labels.size(0)
                    pbar.update(1)

            train_accuracy = train_correct / total_train
            print(f"Average Training Loss: {total_train_loss / len(self.train_loader)}")
            print(f"Training Accuracy: {train_accuracy}")
            val_loss, val_accuracy = self.validate()
            print(f"Validation Loss: {val_loss}")
            print(f"Validation Accuracy: {val_accuracy}")

    def validate(self):
        self.model.eval()
        total_val_loss = 0
        val_correct = 0
        total_val = 0
        with tqdm(total=len(self.val_loader), desc="Validating", unit="batch") as pbar:
            for inputs, labels in self.val_loader:
                processed = self.processor(inputs.numpy(), return_tensors="pt", sampling_rate=16000)
                inputs = processed.input_values.to(self.device)
                labels = labels.to(self.device)

                with torch.no_grad(), autocast():
                    outputs = self.model(inputs).logits[:, -1, :]
                    loss = self.loss_function(outputs, labels)
                    total_val_loss += loss.item()

                    _, preds = torch.max(outputs, 1)
                    val_correct += torch.sum(preds == labels).item()
                    total_val += labels.size(0)

                pbar.update(1)

        val_accuracy = val_correct / total_val
        return total_val_loss / len(self.val_loader), val_accuracy

    def evaluate_performance(self, total_val, val_correct):
        print("Confusion Matrix:")
        print(confusion_matrix(total_val.cpu().numpy(), val_correct.cpu().numpy()))
        print("\nClassification Report:")
        print(classification_report(total_val.cpu().numpy(), val_correct.cpu().numpy()))

# Example usage remains the same


In [20]:
classifier = SpeechClassifier()
classifier.load_data(train_path, val_path)
classifier.train_and_validate(epochs=10)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Average Training Loss: 1.6845521748753511
Training Accuracy: 0.6200922868849198


Validating: 100%|██████████| 1676/1676 [00:29<00:00, 56.49batch/s]


Validation Loss: 1.597636002060337
Validation Accuracy: 0.6135143198090692


Epoch 2/10: 100%|██████████| 12570/12570 [12:40<00:00, 16.54batch/s]


Average Training Loss: 1.5708410222092222
Training Accuracy: 0.6276303751143641


Validating: 100%|██████████| 1676/1676 [00:29<00:00, 56.09batch/s]


Validation Loss: 1.580131184798721
Validation Accuracy: 0.6135143198090692


Epoch 3/10: 100%|██████████| 12570/12570 [12:40<00:00, 16.53batch/s]


Average Training Loss: 1.5567031465832355
Training Accuracy: 0.6283463940490871


Validating: 100%|██████████| 1676/1676 [00:26<00:00, 64.04batch/s]


Validation Loss: 1.5897494659788
Validation Accuracy: 0.6135143198090692


Epoch 4/10: 100%|██████████| 12570/12570 [11:55<00:00, 17.58batch/s]


Average Training Loss: 1.5500179591213035
Training Accuracy: 0.6287640717610088


Validating: 100%|██████████| 1676/1676 [00:27<00:00, 61.11batch/s]


Validation Loss: 1.5923604305148977
Validation Accuracy: 0.6135143198090692


Epoch 5/10: 100%|██████████| 12570/12570 [11:51<00:00, 17.66batch/s]


Average Training Loss: 1.544301914315235
Training Accuracy: 0.6289231870798361


Validating: 100%|██████████| 1676/1676 [00:26<00:00, 62.07batch/s]


Validation Loss: 1.5801552565399388
Validation Accuracy: 0.6135143198090692


Epoch 6/10: 100%|██████████| 12570/12570 [11:49<00:00, 17.71batch/s]


Average Training Loss: 1.543590589279017
Training Accuracy: 0.6290027447392498


Validating: 100%|██████████| 1676/1676 [00:26<00:00, 62.66batch/s]


Validation Loss: 1.5789320042139021
Validation Accuracy: 0.6135143198090692


Epoch 7/10: 100%|██████████| 12570/12570 [11:53<00:00, 17.62batch/s]


Average Training Loss: 1.5421758232936085
Training Accuracy: 0.6287640717610088


Validating: 100%|██████████| 1676/1676 [00:27<00:00, 61.27batch/s]


Validation Loss: 1.5827754710340842
Validation Accuracy: 0.6135143198090692


Epoch 8/10: 100%|██████████| 12570/12570 [11:59<00:00, 17.46batch/s]


Average Training Loss: 1.5400415189131538
Training Accuracy: 0.6290027447392498


Validating: 100%|██████████| 1676/1676 [00:32<00:00, 52.33batch/s]


Validation Loss: 1.578565611190614
Validation Accuracy: 0.6135143198090692


Epoch 9/10: 100%|██████████| 12570/12570 [13:45<00:00, 15.23batch/s]


Average Training Loss: 1.5393005249703224
Training Accuracy: 0.6289828553243964


Validating: 100%|██████████| 1676/1676 [00:30<00:00, 54.80batch/s]


Validation Loss: 1.575167853963119
Validation Accuracy: 0.6135143198090692


Epoch 10/10: 100%|██████████| 12570/12570 [12:56<00:00, 16.20batch/s]


Average Training Loss: 1.5389844823092247
Training Accuracy: 0.6290425235689566


Validating: 100%|██████████| 1676/1676 [00:29<00:00, 56.74batch/s]

Validation Loss: 1.577851325060132
Validation Accuracy: 0.6135143198090692



