In [2]:
from transformers import Wav2Vec2Model
import torch.nn as nn

class EmotionClassifier(nn.Module):
    """Speech-emotion classifier: wav2vec 2.0 encoder -> BiLSTM -> MLP head.

    The encoder turns a raw waveform into a sequence of 768-dimensional frame
    embeddings, a bidirectional LSTM models how those frames evolve over time,
    the LSTM outputs are averaged over time, and a small feed-forward head
    maps the pooled vector to 6 emotion logits.
    """

    def __init__(self):
        super().__init__()
        # Pretrained wav2vec 2.0 encoder.
        self.wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        # BiLSTM for temporal dependencies. The encoder output is batch-major
        # ([batch, frames, 768]), so batch_first=True is required; with the
        # default layout the LSTM would recur over the batch axis and treat
        # every frame as an independent length-1 sequence. The flag does not
        # change parameter names or shapes, so existing checkpoints still load.
        self.lstm = nn.LSTM(768, 128, bidirectional=True, batch_first=True)
        # Classification head: 256 = 2 directions x 128 hidden units.
        self.classifier = nn.Sequential(
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 6))

    def forward(self, x):
        """Return emotion logits for a batch of waveforms.

        Args:
            x: Float tensor of raw audio, shape [batch, samples].

        Returns:
            Tensor of shape [batch, 6] with unnormalised class scores.
        """
        x = self.wav2vec(x).last_hidden_state  # [batch, frames, 768]
        x, _ = self.lstm(x)                    # [batch, frames, 256]
        x = x.mean(dim=1)                      # average over time -> [batch, 256]
        return self.classifier(x)

In [4]:
import librosa
import torch
import torchaudio
import numpy as np

# Load the audio clip (16 kHz mono).
audio, sr = librosa.load("/content/eca1ca8908f8ea3.mp3", sr=16000)

# Normalise to zero mean / unit variance, which is the input scaling the
# wav2vec 2.0 checkpoint expects; the epsilon keeps a silent clip from
# dividing by zero. The raw `audio` array is left untouched.
audio_normalized = (audio - audio.mean()) / np.sqrt(audio.var() + 1e-7)

# Convert to a float32 tensor with a leading batch dimension: [1, samples].
audio_tensor = torch.from_numpy(audio_normalized).float().unsqueeze(0)

# Example target (index of the emotion class).
emotion_label = 2  # e.g. 2 = sadness

In [5]:
model = EmotionClassifier()

# Predict the emotion.
# NOTE: the LSTM and classifier head are freshly initialised at this point,
# so this prediction only checks that the pipeline runs end to end.
# Inference must run in eval mode, otherwise dropout (and any train-time
# masking inside the encoder) makes the result nondeterministic. The previous
# mode is restored afterwards so later cells see the model unchanged.
was_training = model.training
model.eval()
with torch.no_grad():
    logits = model(audio_tensor)
    # Arg-max over the class axis of the first (only) sample in the batch.
    pred = logits.argmax(dim=-1)[0].item()
model.train(was_training)

emotion_map = ["нейтрально", "радость", "грусть", "гнев", "страх", "отвращение"]
print(f"Предсказанная эмоция: {emotion_map[pred]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Предсказанная эмоция: гнев


In [6]:
from torch.optim import Adam

# Multi-class loss over the emotion logits.
criterion = nn.CrossEntropyLoss()
# Adam over every parameter of the model (encoder, LSTM and head).
optimizer = Adam(params=model.parameters(), lr=1e-3)

# Example of a single training step.
def train_step(audio, label):
    """Run one optimisation step and return the loss as a Python float.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        audio: Input batch passed straight to ``model``.
        label: Target tensor passed to ``criterion`` alongside the model output.

    Returns:
        The scalar loss value for this step.
    """
    # Make sure train-time behaviour (e.g. dropout) is active even if an
    # earlier cell left the model in eval mode.
    model.train()
    optimizer.zero_grad()
    output = model(audio)
    loss = criterion(output, label)
    loss.backward()
    optimizer.step()
    return loss.item()

# Training loop: repeat the single-sample step ten times and log the loss.
label_tensor = torch.tensor([emotion_label])  # same target every iteration
for epoch in range(10):
    loss = train_step(audio_tensor, label_tensor)
    print(f"Epoch {epoch}, Loss: {loss:.4f}")

Epoch 0, Loss: 1.8256
Epoch 1, Loss: 1.6971
Epoch 2, Loss: 1.5453
Epoch 3, Loss: 1.2929
Epoch 4, Loss: 1.1135
Epoch 5, Loss: 0.9076
Epoch 6, Loss: 0.7095
Epoch 7, Loss: 0.5329
Epoch 8, Loss: 0.3766
Epoch 9, Loss: 0.2569
