In [1]:
# 1. Подключение библиотек
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# 2. Load the train / validation / test splits
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

# Pull the dialog column (model input) and the emotion column (target) out of
# every split as arrays.
X_train, y_train = train_df['dialog'].values, train_df['emotion'].values
X_valid, y_valid = valid_df['dialog'].values, valid_df['emotion'].values
X_test, y_test = test_df['dialog'].values, test_df['emotion'].values

# Reduce each emotion entry to the first whitespace-separated token of its
# string form.
# NOTE(review): if the column stores a bracketed list such as "[0 4 4]", this
# token keeps the leading "[" -- confirm against the CSV format.
y_train, y_valid, y_test = (
    [str(raw).split()[0] for raw in split_labels]
    for split_labels in (y_train, y_valid, y_test)
)

# Map the label tokens to integer class ids. The encoder is fitted on the
# training labels only and then reused for validation and test.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_valid, y_test = label_encoder.transform(y_valid), label_encoder.transform(y_test)


# 4. Создание Dataset для PyTorch
class ChatDataset(Dataset):
    """Dataset that pairs each dialog with its label and tokenizes on access.

    Args:
        dialogs: Indexable collection of dialog inputs passed to ``tokenizer``.
        labels: Indexable collection of labels, aligned with ``dialogs``.
        tokenizer: Callable invoked as ``tokenizer(dialog, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            whose result is indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dialogs, labels, tokenizer, max_len):
        self.dialogs, self.labels = dialogs, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of dialogs."""
        return len(self.dialogs)

    def __getitem__(self, item):
        """Tokenize the dialog at index ``item`` and return it with its label.

        Returns:
            A dict with the flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and the label as a ``torch.long`` tensor under ``'label'``.
        """
        text, target = self.dialogs[item], self.labels[item]

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# 5. Build the tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 6. Wrap every split in a Dataset and a DataLoader
max_len = 128  # value passed to the tokenizer as max_length

train_dataset = ChatDataset(dialogs=X_train, labels=y_train, tokenizer=tokenizer, max_len=max_len)
valid_dataset = ChatDataset(dialogs=X_valid, labels=y_valid, tokenizer=tokenizer, max_len=max_len)
test_dataset = ChatDataset(dialogs=X_test, labels=y_test, tokenizer=tokenizer, max_len=max_len)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

# 7. Создание модели
class ChatModel(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier.

    Args:
        n_classes: Number of output units of the final linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Imported at construction time, so transformers is only required
        # when a model is actually built.
        from transformers import BertModel

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=n_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the linear layer.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

# Number of classes (emotions) known to the fitted label encoder
n_classes = len(label_encoder.classes_)

# Build the model and move it to the GPU when one is available
model = ChatModel(n_classes)
if torch.cuda.is_available():
    model = model.cuda()

# 8. Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# 9. Функция для тренировки модели
def train_model(model, train_loader, valid_loader, criterion, optimizer, n_epochs=7):
    """Train ``model`` and print validation metrics after every epoch.

    Each epoch runs one optimisation step per batch of ``train_loader`` and
    then evaluates, with gradients disabled, on every batch of
    ``valid_loader``. The mean training loss, mean validation loss and
    validation accuracy are printed per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``. Its
            output is passed to ``criterion`` and arg-maxed over dimension 1
            to obtain predictions.
        train_loader: Iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        valid_loader: Iterable of batches in the same format.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called
            once per training batch.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Metrics are printed. A loader that yields no batches is
        reported as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    # Move batches to the device the model actually lives on. Moving them to
    # CUDA merely because CUDA exists breaks a model that was left on the CPU
    # (or placed on a non-default GPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        # Parameter-free model: keep the historical "GPU if available" choice.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    nan = float('nan')

    for epoch in range(n_epochs):
        model.train()
        total_train_loss = 0.0
        train_batches = 0
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_train_loss += loss.item()
            train_batches += 1

            loss.backward()
            optimizer.step()

        # Batches are counted while iterating, so the loader needs no __len__.
        mean_train_loss = total_train_loss / train_batches if train_batches else nan
        print(f'Epoch {epoch + 1}, Train Loss: {mean_train_loss}')

        # Evaluate on the validation data
        model.eval()
        total_valid_loss = 0.0
        valid_batches = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                total_valid_loss += criterion(outputs, labels).item()
                valid_batches += 1

                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        mean_valid_loss = total_valid_loss / valid_batches if valid_batches else nan
        accuracy = correct / total if total else nan
        print(f'Epoch {epoch + 1}, Validation Loss: {mean_valid_loss}')
        print(f'Epoch {epoch + 1}, Validation Accuracy: {accuracy}')

# 10. Run the training loop
train_model(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    n_epochs=7,
)

# 11. Тестирование модели
def test_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print its accuracy.

    The model is switched to eval mode and run with gradients disabled.
    Predictions are the arg-max of the model output over dimension 1.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        None. ``Test Accuracy: <value>`` is printed; the value is ``nan``
        when the loader yields no samples instead of raising
        ``ZeroDivisionError``.
    """
    # Move batches to the device the model actually lives on. Moving them to
    # CUDA merely because CUDA exists breaks a model that was left on the CPU
    # (or placed on a non-default GPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        # Parameter-free model: keep the historical "GPU if available" choice.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            predicted = model(input_ids, attention_mask).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total if total else float('nan')
    print(f'Test Accuracy: {accuracy}')


# The ``test_`` prefix would make pytest collect this evaluation helper as a
# test case (and fail on its unknown "fixtures") wherever it is imported.
test_model.__test__ = False

# 12. Evaluate the model on the test split
test_model(model=model, test_loader=test_loader)

# 13. Save the model weights (state dict) to disk
torch.save(model.state_dict(), f='chat_model.pth')


  from .autonotebook import tqdm as notebook_tqdm


Epoch 1, Train Loss: 0.38764778542057643
Epoch 1, Validation Loss: 0.2262211542665249
Epoch 1, Validation Accuracy: 0.932
Epoch 2, Train Loss: 0.23686790784365302
Epoch 2, Validation Loss: 0.23470048251606168
Epoch 2, Validation Accuracy: 0.921
Epoch 3, Train Loss: 0.16323970554718117
Epoch 3, Validation Loss: 0.24755202553841094
Epoch 3, Validation Accuracy: 0.906
Epoch 4, Train Loss: 0.11652130150929656
Epoch 4, Validation Loss: 0.292826375894485
Epoch 4, Validation Accuracy: 0.915
Epoch 5, Train Loss: 0.08780664074043709
Epoch 5, Validation Loss: 0.27222338402163354
Epoch 5, Validation Accuracy: 0.932
Epoch 6, Train Loss: 0.07137592402069072
Epoch 6, Validation Loss: 0.31697332576316384
Epoch 6, Validation Accuracy: 0.936
Epoch 7, Train Loss: 0.06094767963438355
Epoch 7, Validation Loss: 0.28514823123502236
Epoch 7, Validation Accuracy: 0.921
Test Accuracy: 0.886
