In [11]:
import torch
import torchvision
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

In [None]:
# Hub identifier shared by the tokenizer and the model so the two stay in sync.
MODEL_CHECKPOINT = "google-bert/bert-base-uncased"

# NOTE(review): this loads a masked-language-modelling head, while a later cell
# binarizes multi-label targets — confirm that a masked-LM head is what is intended.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Load the "simplified" configuration of the GoEmotions dataset.
ds = load_dataset(
    path="google-research-datasets/go_emotions",
    name="simplified",
)
# Bare last expression: show the dataset summary as the cell output.
ds

In [None]:
# Text tokenization helper
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the notebook-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length=128``.

    Args:
        examples: Mapping-like object exposing a ``'text'`` entry
            (presumably a batch of rows from the dataset — confirm against the caller).

    Returns:
        Whatever the ``tokenizer`` call returns for that text.
    """
    return tokenizer(
        examples["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Run the tokenization helper over the loaded dataset in batched mode.
tokenized_datasets = ds.map(
    function=tokenize_function,
    batched=True,
)

In [16]:
# Turn each example's list of label ids into a multi-hot indicator row.
# The binarizer is fitted on the training split only and then reused as-is
# for the other two splits.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(tokenized_datasets["train"]["labels"])
test_labels, val_labels = (
    mlb.transform(tokenized_datasets[split_name]["labels"])
    for split_name in ("test", "validation")
)

In [17]:
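# # Split the data into training and validation sets
# # NOTE(review): the dataset's own 'validation' split is already used when the labels
# # are binarized, so this manual split appears to be unnecessary — confirm before re-enabling.
# train_dataset, val_dataset = train_test_split(tokenized_datasets['train'], test_size=0.1, random_state=42)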

In [None]:
# Training arguments
# `eval_strategy` is the current name of this option; the older spelling
# `evaluation_strategy` is deprecated and is rejected by newer transformers releases.
training_args = TrainingArguments(
    output_dir='./results',          # where the Trainer writes its outputs
    eval_strategy="epoch",           # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # fp16=True
)

In [None]:
# Use the %pip magic rather than a `!pip` shell escape so the package is installed
# into the environment of the running kernel, not whichever `pip` is first on PATH.
# NOTE(review): presumably needed by the Trainer / TrainingArguments cell above — if so,
# run this cell first (a kernel restart may be required after installing).
%pip install 'accelerate>=0.26.0'