In [1]:
import pandas as pd
import json
import os

import torch
import torch.nn.functional as F
from torchcodec.decoders import AudioDecoder
from torchinfo import summary

from vox_profile_release.src.model.emotion.whisper_emotion import WhisperWrapper
from utils import slice_range, to_nested_json

In [2]:
# Pick the compute device and the matching floating-point precision in one go:
# first CUDA GPU with half precision when available, otherwise CPU with float32.
device, torch_dtype = (
    ('cuda:0', torch.float16)
    if torch.cuda.is_available()
    else ('cpu', torch.float32)
)
device

'cuda:0'

In [3]:
# Audio configuration shared by the cells below.
SAMPLING_RATE = 16_000  # sample rate requested from the audio decoder
CHANNELS = 1            # number of channels requested from the audio decoder
MAX_DURATION = 15       # longest clip duration (presumably seconds - confirm)

# Longest clip expressed in samples (rate x duration).
MAX_LENGTH = MAX_DURATION * SAMPLING_RATE

In [4]:
# Segment table produced earlier in the pipeline; loaded first so that a
# missing file surfaces before anything else is set up.
seg_path = os.path.join('outputs', 'segments.parquet')
segments = pd.read_parquet(seg_path)

# Source recording the segments are cut from.
rec_path = os.path.join('inputs', 'rec_pre.mp3')

# Extension-less stem for the result files; the suffix is appended on save.
out_path = os.path.join('outputs', 'segments_emotion')

In [5]:
# Fetch the pretrained emotion checkpoint, then move its weights to the
# selected device as a separate step.
model = WhisperWrapper.from_pretrained('tiantiaf/whisper-large-v3-msp-podcast-emotion')
model = model.to(device)

Some weights of WhisperModel were not initialized from the model checkpoint at openai/whisper-large-v3 and are newly initialized because the shapes did not match:
- model.encoder.embed_positions.weight: found shape torch.Size([1500, 1280]) in the checkpoint and torch.Size([750, 1280]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Switch the model to evaluation mode before running inference. As the last
# expression of the cell, the returned module is also displayed, which prints
# the architecture summary below.
model.eval()

WhisperWrapper(
  (backbone_model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(750, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
     

In [7]:
# Decode every segment straight from the recording; the decoder is asked to
# deliver audio at SAMPLING_RATE with CHANNELS channel(s).
decoder = AudioDecoder(rec_path, sample_rate=SAMPLING_RATE, num_channels=CHANNELS)

# The first two columns of `segments` are used positionally as the start/end
# bounds handed to the decoder. Slicing them (instead of unpacking exactly
# three columns) keeps this cell re-runnable after later cells have appended
# prediction columns to `segments`.
#
# Each clip is capped at MAX_LENGTH samples along its last axis before the
# channel axis is squeezed away: the upstream usage notes for this model ask
# for audio of at most 15 s, so anything beyond that is dropped here.
audio = [
    decoder.get_samples_played_in_range(start, end).data[..., :MAX_LENGTH].squeeze(0)
    for start, end in segments.iloc[:, :2].itertuples(index=False, name=None)
]

# Per-clip sample counts (after capping), passed to the model alongside the audio.
length = torch.tensor([clip.size(0) for clip in audio], dtype=torch.int32)

In [8]:
# Number of segments sent through the model per forward pass.
BATCH_SIZE = 10

# Per-batch outputs, kept on the CPU and concatenated in a later cell.
logits = []
arousal = []
valence = []
dominance = []

# Inference only: run under no_grad so autograd does not record a graph for
# every forward pass (detaching the outputs afterwards does not prevent the
# graph from being built, it only drops the reference to it).
with torch.no_grad():
    # slice_range presumably yields (start, stop) index pairs covering all
    # segments in steps of BATCH_SIZE - verify against utils.slice_range.
    for i, j in slice_range(len(segments.index), BATCH_SIZE):
        # The second returned value is not used by this notebook.
        batch_logits, _, batch_arousal, batch_valence, batch_dominance = model(
            audio[i:j], length=length[i:j], return_feature=False
        )

        logits.append(batch_logits.cpu())
        arousal.append(batch_arousal.cpu())
        valence.append(batch_valence.cpu())
        dominance.append(batch_dominance.cpu())

In [9]:
# Target columns for the categorical head, in the order of the logit classes.
emotion_columns = (
    'emotions.anger',
    'emotions.contempt',
    'emotions.disgust',
    'emotions.fear',
    'emotions.happiness',
    'emotions.neutral',
    'emotions.sadness',
    'emotions.surprise',
    'emotions.other',
)

# Stack all batches, normalise across the class axis, then transpose so that
# each row holds one class's probabilities for every segment.
class_probabilities = F.softmax(torch.cat(logits, dim=0), dim=1).numpy().T
if len(class_probabilities) != len(emotion_columns):
    # Same failure mode as unpacking into a fixed number of targets.
    raise ValueError(
        f'expected {len(emotion_columns)} emotion classes, got {len(class_probabilities)}'
    )
for column_name, column_values in zip(emotion_columns, class_probabilities):
    segments[column_name] = column_values

# Dimensional outputs: one value per segment once the batches are stacked and
# singleton axes are squeezed out.
for column_name, batches in (
    ('dimensions.arousal', arousal),
    ('dimensions.valence', valence),
    ('dimensions.dominance', dominance),
):
    segments[column_name] = torch.cat(batches, dim=0).squeeze().numpy()

# Pop and re-insert 'text' so it ends up as the last column.
segments['text'] = segments.pop('text')

In [10]:
# Persist the enriched segment table twice under the same stem: a flat parquet
# file and a JSON file written by the project's to_nested_json helper.
parquet_file = f'{out_path}.parquet'
json_file = f'{out_path}.json'

segments.to_parquet(parquet_file)
to_nested_json(segments, json_file)