In [2]:
import pathlib

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import transformers
from sklearn.metrics import accuracy_score

from emonet import RATINGS, DATA_DIR
from emonet.data_loader import RandomSegment, SBAugment, get_datasets


In [121]:
tsfmr = nn.Sequential(
        RandomSegment(seconds=29, sample_rate=16000),
    )

data = get_datasets('Michelle Lyn', DATA_DIR, transform=tsfmr)
train_dl = DataLoader(data['train'], 4)
valid_dl = DataLoader(data['valid'], 4)

In [130]:
class EmotionClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # todo make feature model an argument
        self.wav2vec = transformers.Wav2Vec2Model.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim')
        self.n_ratings = len(RATINGS)
        self.nodes_fc1 = 128  # todo make this and any other nodes args
        self.loss_func = nn.CrossEntropyLoss()

        self.train_pipe = nn.Sequential(
            RandomSegment(seconds=5), # todo parameterize
            # SBAugment(perturb_prob=0.2, drop_freq_prob=0.2, drop_chunk_prob=0.2, speeds=[90, 110])
        )

        self.valid_pipe = nn.Sequential(
            RandomSegment(seconds=5),
        )

        # todo experiment with dropout, etc.
        self.anger = self.anger = nn.Sequential(  # todo maybe add apdativeAvgPooling to get consistent size
            nn.LazyLinear(out_features=self.nodes_fc1),
            nn.ReLU(),
            nn.Linear(in_features=self.nodes_fc1, out_features=self.n_ratings)
        )

        self.fear = nn.Sequential(
            nn.LazyLinear(out_features=self.nodes_fc1),
            nn.ReLU(),
            nn.Linear(in_features=self.nodes_fc1, out_features=self.n_ratings)
        )

        self.sadness = nn.Sequential(
            nn.LazyLinear(out_features=self.nodes_fc1),
            nn.ReLU(),
            nn.Linear(in_features=self.nodes_fc1, out_features=self.n_ratings)
        )

    def multi_loss(self, pred, actual):
        loss = torch.zeros(1)
        for key in actual:
            loss += self.loss_func(pred[key], actual[key])  # todo please fix me
        return loss

    def pred_probas(self, outputs):
        return {k: F.softmax(v, dim=1) for k, v in outputs.items()}

    def pred_labels(self, outputs):
        return {k: v.argmax(1) for k, v in self.pred_probas(outputs).items()}

    def training_step(self, batch, idx):
        x, labels = batch
        bs = x.shape[0]
        x = self.train_pipe(x)
        feat = self.wav2vec(x)
        x = feat.extract_features  # todo note the feature output changes based on input length
        x.reshape(bs, -1)
        outputs = {
            'anger': self.anger(x),
            'fear': self.fear(x),
            'sadness': self.sadness(x)
        }
        loss = self.multi_loss(outputs, labels)
        preds = self.pred_labels(outputs)
        acc = accuracy_score(list(labels.values()), list(preds.values()))
        # logs metrics for each training_step,
        # and the average across the epoch, to the progress bar and logger
        self.log_dict({'train_loss': loss, 'train_acc': acc}, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, idx):
        x, labels = batch
        bs = x.shape[0]
        x = self.valid_pipe(x)  # todo right now this is only scoring a single, 5-second sample. change to score multiple and averge preds
        feat = self.wav2vec(x)
        x = feat.extract_features  # todo note the feature output changes based on input length
        x.reshape(bs, -1)
        outputs = {
            'anger': self.anger(x),
            'fear': self.fear(x),
            'sadness': self.sadness(x)
        }
        loss = self.multi_loss(outputs, labels)
        preds = self.pred_labels(outputs)
        acc = accuracy_score(list(labels.values()), list(preds.values()))
        # logs metrics for each training_step,
        # and the average across the epoch, to the progress bar and logger
        self.log_dict({'val_loss': loss, 'val_acc': acc}, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, idx):
        x, labels = batch
        bs = x.shape[0]
        x = self.valid_pipe(x)
        feat = self.wav2vec(x)
        x = feat.extract_features  # todo note the feature output changes based on input length
        x.reshape(bs, -1)
        outputs = {
            'anger': self.anger(x),
            'fear': self.fear(x),
            'sadness': self.sadness(x)
        }
        loss = self.multi_loss(outputs, labels)
        preds = self.pred_labels(outputs)
        acc = accuracy_score(list(labels.values()), list(preds.values()))
        # logs metrics for each training_step,
        # and the average across the epoch, to the progress bar and logger
        self.log_dict({'test_loss': loss, 'test_acc': acc}, on_step=True, on_epoch=True,
                      prog_bar=True, logger=True)
        return loss

    def forward(self, x):
        raise NotImplementedError
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)

In [131]:
model = EmotionClassifier()

Some weights of the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim were not used when initializing Wav2Vec2Model: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [132]:
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model, train_dl, valid_dl)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


RuntimeError: Can't access the shape of an uninitialized parameter or buffer. This error usually happens in `load_state_dict` when trying to load an uninitialized parameter into an initialized one. Call `forward` to initialize the parameters before accessing their attributes.

In [97]:
from speechbrain.lobes.augment import TimeDomainSpecAugment

import random
from emonet import EMOTIONS, RATINGS, SAMPLE_RATE


def get_random_segment(wav: torch.Tensor, seconds=7, sample_rate=SAMPLE_RATE):
    buffer = seconds * sample_rate
    end = wav.shape[-1] - buffer  # should pull timesteps if dims=1 or dims=2, provided following (batch, timestep, channel) format
    start = random.randint(0, end)
    if wav.ndim > 1:
        return wav[:, start:start+buffer]  # assumes (timestamp, channel)
    return wav[start:start+buffer]


class RandomSegment(nn.Module):
    def __init__(self, seconds: int, sample_rate: int = SAMPLE_RATE):
        self.seconds = seconds
        self.sample_rate = sample_rate
        super().__init__()

    def forward(self, x):
        return get_random_segment(x, seconds=self.seconds, sample_rate=self.sample_rate)


class SBAugment(TimeDomainSpecAugment):

    def forward(self, x):
        x = x[:, :, None]  # speechbrain expects tensor shape (batch, timesteps, channels)
        lengths = torch.ones(x.shape[0])
        x = self.speed_perturb(x)
        x = self.drop_freq(x)
        x = self.drop_chunk(x, lengths)
        return x.squeeze(-1)  # drop last dim


In [96]:
sb = SBAugment(perturb_prob=0.2, drop_freq_prob=0.2, drop_chunk_prob=0.2, speeds=[90, 110])
sb(trans).shape

torch.Size([4, 80000])

In [6]:
it = iter(valid_dl)
batch = next(it)

In [7]:
x, labels = batch

In [49]:
x

tensor([[-0.0897, -0.0768, -0.0551,  ...,  0.0060,  0.0073,  0.0025],
        [-0.0139, -0.0171, -0.0268,  ...,  0.0067,  0.0010, -0.0114],
        [ 0.0179,  0.0188,  0.0190,  ...,  0.1192,  0.1345,  0.1336],
        [-0.0044,  0.0084,  0.0142,  ...,  0.0002,  0.0008, -0.0170]])

In [50]:
x.shape

torch.Size([4, 464000])

In [55]:
trans.shape

torch.Size([4, 80000])

In [98]:
train_pipe = nn.Sequential(
            RandomSegment(seconds=5), # todo parameterize
            SBAugment(perturb_prob=0.2, drop_freq_prob=0.2, drop_chunk_prob=0.2, speeds=[90, 110])
        )

In [99]:
trans = train_pipe(x)

In [100]:
trans.shape

torch.Size([4, 88000])

In [101]:
wav2vec = transformers.Wav2Vec2Model.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim')

Some weights of the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim were not used when initializing Wav2Vec2Model: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [102]:
feat = wav2vec(trans)

In [103]:
xf = feat.extract_features

In [104]:
xf.shape

torch.Size([4, 274, 512])

In [108]:
xf = xf.reshape(4, -1)

In [126]:
xf.shape

torch.Size([4, 140288])

In [107]:
anger = nn.Sequential(  # todo maybe add apdativeAvgPooling to get consistent size
            nn.LazyLinear(out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=4)
        )



In [109]:
outputs = {
            'anger': anger(xf),
            'fear': anger(xf),
            'sadness': anger(xf),
        }

In [110]:
outputs

{'anger': tensor([[ 0.2479, -0.1088,  0.1016, -0.1237],
         [-0.1646,  0.8251,  0.4551, -0.1429],
         [ 0.0192,  0.2802,  0.1375,  0.0393],
         [ 0.1703,  0.4405,  0.6012,  0.0890]], grad_fn=<AddmmBackward0>),
 'fear': tensor([[ 0.2479, -0.1088,  0.1016, -0.1237],
         [-0.1646,  0.8251,  0.4551, -0.1429],
         [ 0.0192,  0.2802,  0.1375,  0.0393],
         [ 0.1703,  0.4405,  0.6012,  0.0890]], grad_fn=<AddmmBackward0>),
 'sadness': tensor([[ 0.2479, -0.1088,  0.1016, -0.1237],
         [-0.1646,  0.8251,  0.4551, -0.1429],
         [ 0.0192,  0.2802,  0.1375,  0.0393],
         [ 0.1703,  0.4405,  0.6012,  0.0890]], grad_fn=<AddmmBackward0>)}

In [114]:
ce = nn.CrossEntropyLoss()

def multi_loss(pred, actual):
        loss = torch.zeros(1)
        for key in actual:
            loss += ce(pred[key], actual[key])  # todo please fix me
        return loss

In [115]:
labels

{'anger': tensor([0, 0, 0, 0]),
 'fear': tensor([0, 1, 2, 0]),
 'sadness': tensor([1, 0, 1, 1])}

In [116]:
multi_loss(outputs, labels)

tensor([4.2667], grad_fn=<AddBackward0>)