In [1]:
from sentence_transformers import SentenceTransformer
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
import numpy as np

  from tqdm.autonotebook import tqdm, trange


# Создаем класс для работы с данными

In [2]:
class DatasetProcessor:
    def __init__(self, transcription_file, annotation_file=None):
        self.transcription_file = transcription_file
        self.annotation_file = annotation_file
        self.transcriptions = self.load_transcriptions()
        self.annotations = self.load_annotations() if annotation_file else None
        self.embeddings_dict = None
        self.embeddings_tensor = None
        self.targets_tensor = None

    def load_transcriptions(self):
        if isinstance(self.transcription_file, str) and self.transcription_file.endswith('.pkl'):
            with open(self.transcription_file, 'rb') as f:
                transcriptions = pickle.load(f, encoding='latin1')
            return dict(transcriptions.items())
        else:
            return self.transcription_file

    def load_annotations(self):
        with open(self.annotation_file, 'rb') as f:
            annotations = pickle.load(f, encoding='latin1')

        formatted_annotations = {}
        for trait, data in annotations.items():
            for file_name, value in data.items():
                if file_name not in formatted_annotations:
                    formatted_annotations[file_name] = {}
                formatted_annotations[file_name][trait] = value

        return formatted_annotations

    def get_embeddings(self, embedding_class):
        embeddings = {}
        embedder = embedding_class
        for file_name, text in self.transcriptions.items():
            embedding = embedder.get_embeddings(text)
            embeddings[file_name] = embedding
        return embeddings

    def get_annotations(self):
        return self.annotations

    def create_dataset(self, embedding_class):
        """
        Создает датасет с парами (embeddings, targets) для обучения модели.

        Параметры:
        - annotations: словарь, содержащий метрики Big Five для каждого видео.
        - embeddings: словарь, содержащий тензоры эмбеддингов для каждого видео.

        Возвращает:
        - embeddings_tensor: тензор с эмбеддингами текста размерности [batch_size, embedding_dim].
        - targets_tensor: тензор с метриками Big Five размерности [batch_size, 5].
        """
        embeddings = self.get_embeddings(embedding_class)
        self.embeddings_dict = embeddings

        common_ids = set(self.annotations.keys()).intersection(embeddings.keys())

        embedding_list = [embeddings[video_id] for video_id in common_ids]
        target_list = [
            [
                self.annotations[video_id]['extraversion'],
                self.annotations[video_id]['neuroticism'],
                self.annotations[video_id]['agreeableness'],
                self.annotations[video_id]['conscientiousness'],
                self.annotations[video_id]['openness']
            ]
            for video_id in common_ids
        ]

        embeddings_tensor = torch.stack(embedding_list)
        targets_tensor = torch.tensor(target_list, dtype=torch.float32)
        self.embeddings_tensor = embeddings_tensor
        self.targets_tensor = targets_tensor
        return TensorDataset(embeddings_tensor, targets_tensor)

    def check_dataset_integrity(self):
      """
      Проверяет целостность датасета, убеждаясь, что эмбеддинги и метрики Big Five правильно совпадают.
      Также выводит соответствующие данные из target_tensor.
      """
      common_ids = set(self.annotations.keys()).intersection(self.embeddings_dict.keys())

      targets_list = self.targets_tensor.tolist()
      embeddings_list = self.embeddings_tensor.tolist()

      for idx, video_id in enumerate(list(common_ids)[:5]):
          embedding = self.embeddings_dict[video_id]
          metrics = self.annotations[video_id]

          print(f"Video ID: {video_id}")
          print(f"Embedding: {embedding}")
          print(f"Metrics: {metrics}")

          target_data = targets_list[idx]
          print(f"Target data (Big Five): {target_data}")

          print('-' * 40)

# Класс для получения эмбендингов

In [3]:
class TextEmbedder:
    def __init__(self, model_name="all-MiniLM-L6-v2", use_gpu=True):
        self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
        self.model = SentenceTransformer(model_name)
        self.model.to(self.device)

    def get_embeddings(self, texts):
        """
        Получение эмбеддингов для одного или нескольких текстов.

        :param texts: Список строк (или одна строка) для обработки.
        :return: Тензор с эмбеддингами размером [num_texts, embedding_dim].
        """
        if isinstance(texts, str):
            texts = [texts]

        embeddings = self.model.encode(texts, convert_to_tensor=True)
        return embeddings.to(self.device)

# Создаем обучающий датасет и проверяем его

In [None]:
process = DatasetProcessor("/content/transcription_training.pkl", "/content/annotation_training.pkl")
dataset = process.create_dataset(TextEmbedder())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
print("dataset", dataset)
process.check_dataset_integrity()

dataset <torch.utils.data.dataset.TensorDataset object at 0x7f40682138b0>
Video ID: 12Ezy1y1cWY.005.mp4
Embedding: tensor([[-3.4414e-02, -1.5790e-02,  1.7753e-02,  3.7227e-02, -1.7211e-02,
         -2.4341e-02, -9.0782e-03,  1.2193e-02,  1.1620e-02,  4.6894e-02,
         -5.2141e-02, -1.3555e-01, -9.4343e-02, -6.6389e-03,  2.5995e-02,
         -3.7868e-03,  6.2280e-02, -9.0940e-02, -2.0512e-02, -1.0435e-02,
          9.0710e-03, -9.1454e-03,  8.6066e-02, -8.4191e-02, -4.9740e-03,
          1.0507e-02, -1.4395e-02,  5.5126e-02,  5.5426e-03, -5.4114e-03,
         -4.3217e-03,  3.0782e-02,  6.7841e-02,  9.7615e-03, -8.2468e-02,
          7.9971e-02,  4.5090e-02, -6.2821e-02,  6.0713e-02,  3.0717e-02,
          6.2216e-02,  5.8248e-02, -3.4622e-02, -8.8114e-03, -2.5191e-02,
         -8.3548e-02, -2.6715e-02, -8.9284e-02, -3.2959e-03, -2.0505e-02,
          2.8708e-03, -6.4731e-03, -1.2318e-02,  6.1234e-02, -4.1374e-02,
          5.6563e-02, -1.2434e-01, -3.7623e-03,  6.4436e-02, -3.7618e-0

# Создаем класс прецептрона

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error

class BigFiveModel(nn.Module):
    def __init__(self, embedding_dim=384, hidden_dim=768, output_dim=5, dropout_rate=0.5):
        """
        Инициализация модели BigFiveModel
        embedding_dim (int): Размерность входных векторных представлений
        hidden_dim (int): Размер скрытых слоев
        output_dim (int): Количество классов (Big Five, т.е. 5 характеристик)
        dropout_rate (float): Уровень dropout для регуляризации
        """
        super(BigFiveModel, self).__init__()

        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.layer_norm(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

class BigFiveTrainer:
    def __init__(self, model, learning_rate=0.001, lr_scheduler=None):
        """
        Инициализация тренера
        model (nn.Module): модель для обучения
        learning_rate (float): начальная скорость обучения
        lr_scheduler (torch.optim.lr_scheduler): планировщик для изменения скорости обучения
        """
        self.model = model
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.lr_scheduler = lr_scheduler or optim.lr_scheduler.StepLR(self.optimizer, step_size=20, gamma=0.1)

    def train(self, train_loader, num_epochs=20, device='cpu'):
        """
        Обучение модели
        train_loader (DataLoader): загрузчик данных для обучения
        num_epochs (int): количество эпох для обучения
        device (str): устройство для вычислений ('cpu' или 'cuda')
        """
        self.model.to(device)
        self.model.train()

        for epoch in range(num_epochs):
            running_loss = 0.0
            for embeddings, targets in train_loader:
                embeddings, targets = embeddings.to(device), targets.to(device)

                self.optimizer.zero_grad()
                outputs = self.model(embeddings)
                loss = self.criterion(outputs, targets)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()

            self.lr_scheduler.step()
            print(f"Эпоха [{epoch+1}/{num_epochs}], Потери: {running_loss / len(train_loader):.4f}")
            print(f"Текущий learning rate: {self.optimizer.param_groups[0]['lr']:.6f}")

    def predict(self, embedding, device='cpu'):
        """
        Предсказание для новых данных
        embedding (Tensor): Вектор представления для предсказания
        device (str): Устройство для вычислений ('cpu' или 'cuda')
        """
        self.model.to(device)
        self.model.eval()

        with torch.no_grad():
            embedding = embedding.to(device)
            output = self.model(embedding)

        return output.cpu().numpy()

    def evaluate_mse(self, val_loader, device='cpu'):
        """
        Оценка модели на валидационном датасете с использованием MSE
        val_loader (DataLoader): загрузчик данных для валидации
        device (str): устройство для вычислений ('cpu' или 'cuda')
        """
        self.model.to(device)
        self.model.eval()

        all_targets = []
        all_preds = []

        with torch.no_grad():
            for embeddings, targets in val_loader:
                preds = self.predict(embeddings, device=device)

                preds = preds.squeeze()
                all_preds.extend(preds.tolist())

                all_targets.extend(targets.cpu().numpy().tolist())

        all_targets = np.array(all_targets)
        all_preds = np.array(all_preds)

        if all_targets.shape != all_preds.shape:
            print(f"Несоответствие размерности: {all_targets.shape} vs {all_preds.shape}")
            return None

        mse = mean_squared_error(all_targets, all_preds)
        print(f"MSE на валидационном датасете: {mse:.4f}")
        return mse

    def evaluate_mae(self, val_loader, device='cpu'):
        """
        Оценка модели на валидационном датасете с использованием MAE
        val_loader (DataLoader): загрузчик данных для валидации
        device (str): устройство для вычислений ('cpu' или 'cuda')
        """
        self.model.to(device)
        self.model.eval()

        all_targets = []
        all_preds = []

        with torch.no_grad():
            for embeddings, targets in val_loader:
                preds = self.predict(embeddings, device=device)

                preds = preds.squeeze()

                all_preds.extend(preds.tolist())
                all_targets.extend(targets.cpu().numpy().tolist())

        all_targets = np.array(all_targets)
        all_preds = np.array(all_preds)

        if all_targets.shape != all_preds.shape:
            print(f"Несоответствие размерности: {all_targets.shape} vs {all_preds.shape}")
            return None

        mae = mean_absolute_error(all_targets, all_preds)
        print(f"MAE на валидационном датасете: {mae:.4f}")
        return mae

    def evaluate_mf1(self, val_loader, device='cpu'):
        """
        Оценка модели на валидационном датасете с использованием mF1
        val_loader (DataLoader): загрузчик данных для валидации
        device (str): устройство для вычислений ('cpu' или 'cuda')
        """
        self.model.to(device)
        self.model.eval()

        all_targets = []
        all_preds = []

        with torch.no_grad():
            for embeddings, targets in val_loader:
                preds = self.predict(embeddings, device=device)

                preds_binary = [1 if p >= 0.5 else 0 for p in preds.flatten().tolist()]

                targets_binary = [1 if t >= 0.5 else 0 for t in targets.cpu().numpy().tolist()]

                all_preds.extend(preds_binary)
                all_targets.extend(targets_binary)

        all_targets = np.array(all_targets)
        all_preds = np.array(all_preds)

        if all_targets.shape != all_preds.shape:
            print(f"Несоответствие размерности: {all_targets.shape} vs {all_preds.shape}")
            return None

        mf1 = f1_score(all_targets, all_preds)
        print(f"mF1 на валидационном датасете: {mf1:.4f}")
        return mf1

    def save_weights(self, path="big_five_model.pth"):
        """
        Сохранение весов модели
        path (str): Путь для сохранения весов
        """
        torch.save(self.model.state_dict(), path)
        print(f"Модель сохранена в {path}")

    def load_weights(self, path="big_five_model.pth", device='cpu'):
        """
        Загрузка весов модели
        path (str): Путь для загрузки весов
        device (str): Устройство для загрузки ('cpu' или 'cuda')
        """
        self.model.load_state_dict(torch.load(path, map_location=device))
        print(f"Модель загружена из {path}")



In [None]:

model = BigFiveModel(embedding_dim=384)
trainer = BigFiveTrainer(model)

trainer.train(train_loader=dataset, device="cuda", num_epochs=3)


  return F.mse_loss(input, target, reduction=self.reduction)


Эпоха [1/3], Потери: 0.0304
Текущий learning rate: 0.001000
Эпоха [2/3], Потери: 0.0232
Текущий learning rate: 0.001000
Эпоха [3/3], Потери: 0.0221
Текущий learning rate: 0.001000


In [5]:
model = BigFiveModel(embedding_dim=384)
trainer = BigFiveTrainer(model)
trainer.load_weights("/content/big_five_model_03.pth")

Модель загружена из /content/big_five_model_03.pth


  self.model.load_state_dict(torch.load(path, map_location=device))


# Формируем валидационный датасет

In [6]:
val_process = DatasetProcessor("/content/transcription_validation.pkl", "/content/annotation_validation.pkl")
val_ds = val_process.create_dataset(TextEmbedder())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
trainer.evaluate_mse(val_ds, device="cuda")

MSE на валидационном датасете: 0.0247


0.02469477968685012

In [8]:
trainer.evaluate_mf1(val_ds, device="cuda")

mF1 на валидационном датасете: 0.7406


0.7405832724373879

In [None]:
trainer.save_weights("/content/big_five_model_03.pth")

Модель сохранена в /content/big_five_model_03.pth


In [9]:
trainer.evaluate_mae(val_ds, device="cuda")

MAE на валидационном датасете: 0.1246


0.12463626093566418