In [5]:
# Читаем данные
import pandas as pd


# Load the raw training set (first CSV column becomes the row index) and
# drop rows with any missing value.
data = pd.read_csv('data/train.csv', index_col=0).dropna()

# Integer id for each of the five sentiment classes.
cls_map = {'Extremely Negative': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4}

# Fail loudly on a label that cls_map does not know, instead of letting a
# raw string leak into the label column.
unmapped_labels = set(data['Sentiment']) - set(cls_map)
if unmapped_labels:
    raise ValueError(f"Unexpected sentiment labels: {sorted(map(str, unmapped_labels))}")

# Build the encoded column and assign it back explicitly. Calling
# `.replace(..., inplace=True)` on `data['Sentiment']` mutates a temporary
# selection, which does not update `data` under pandas Copy-on-Write.
data = (
    data
    .assign(Sentiment=data['Sentiment'].map(cls_map).astype('int64'))
    .rename(columns={"Text": "text", "Sentiment": "labels"})
)
data.head()


Unnamed: 0,text,labels
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,2
1,advice Talk to your neighbours family to excha...,3
2,Coronavirus Australia: Woolworths to give elde...,3
3,My food stock is not the only one which is emp...,3
4,"Me, ready to go at supermarket during the #COV...",0


In [6]:
# Size the classification head from the largest class id rather than from the
# number of distinct ids: labels are 0-based indices, so if some class were
# absent from the CSV a distinct-count would be smaller than the highest id + 1.
num_labels = int(data['labels'].max()) + 1

In [7]:
# Делим на обучающую и тестовую
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the fixed random_state makes
# the split repeatable between runs.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)


In [8]:
# Summarise the training split: row count, columns, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32924 entries, 9389 to 15795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    32924 non-null  object
 1   labels  32924 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 771.7+ KB


In [9]:
# Summarise the test split: row count, columns, non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8231 entries, 14623 to 9728
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    8231 non-null   object
 1   labels  8231 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 192.9+ KB


In [10]:
# Convert the datasets into a structure similar to what the `datasets` library returns
from datasets import load_dataset

# Write each split to its own CSV (without the pandas index), then load both
# files back as a DatasetDict keyed by split name.
split_files = {'train': 'data/train_df.csv', 'test': 'data/test_df.csv'}
for split_name, split_frame in (('train', train_df), ('test', test_df)):
    split_frame.to_csv(split_files[split_name], index=False)
raw_datasets = load_dataset('csv', data_files=split_files)


Downloading data files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 112.00it/s]
Generating train split: 32924 examples [00:00, 324580.89 examples/s]
Generating test split: 8231 examples [00:00, 249814.15 examples/s]


In [11]:
# Show the loaded splits with their columns and row counts.
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 32924
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 8231
    })
})


In [12]:
# Tokenize the texts
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded to the tokenizer's maximum length and longer
    sequences are truncated.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Apply the tokenizer to every split, passing examples in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


Map: 100%|██████████| 32924/32924 [00:12<00:00, 2667.92 examples/s]
Map: 100%|██████████| 8231/8231 [00:02<00:00, 2773.44 examples/s]


In [13]:
# Drop the "text" column since it is no longer needed, then have the
# datasets return torch tensors.
columns_to_drop = ["text"]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)
tokenized_datasets.set_format(type="torch")

In [14]:
# Inspect the test split after tokenization: remaining columns and row count.
print(tokenized_datasets["test"])

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8231
})


In [15]:
# Создаем загрузчики данных
from torch.utils.data import DataLoader

train_ds, eval_ds = tokenized_datasets["train"], tokenized_datasets["test"]

# Batches of 8 for both loaders; only the training loader is shuffled.
train_dataloader = DataLoader(train_ds, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(eval_ds, batch_size=8, shuffle=False)

In [16]:
# Создаем предобученную модель
from transformers import AutoModelForSequenceClassification
# Load the pretrained "bert-base-cased" weights with a sequence-classification
# head sized to `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Optimizer
# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases; PyTorch's own AdamW is the supported replacement.
from torch.optim import AdamW

# torch's defaults differ from the class being replaced (weight_decay=0.01,
# eps=1e-8), so pin the values the notebook was previously training with.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [18]:
# LR Scheduler
from ignite.contrib.handlers import PiecewiseLinear

num_epochs = 10
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

# Linear decay of the learning rate from 5e-5 at step 0 down to 0.0 at the
# final training step.
milestones_values = [(0, 5e-5), (num_training_steps, 0.0)]
lr_scheduler = PiecewiseLinear(optimizer, "lr", milestones_values)

In [19]:
# Set device
import torch

# Use the GPU when one is available, otherwise fall back to the CPU, and
# move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cuda


In [20]:
# Ignite's [`Engine`](https://pytorch-ignite.ai/concepts/01-engine/) allows users to define
# a `process_function` to process a given batch of data. This function is applied to all
# the batches of the dataset. This is a general class that can be applied to train and validate models.
#  A `process_function` has two parameters `engine` and `batch`.


def train_step(engine, batch):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        engine: the Engine invoking this function (unused here).
        batch: mapping of field name to tensor; every tensor is moved to
            `device` and the mapping is passed to the model as keyword
            arguments.

    Returns:
        The loss taken from the model output for this batch.
    """
    model.train()

    inputs = {name: tensor.to(device) for name, tensor in batch.items()}
    loss = model(**inputs).loss
    loss.backward()

    # Apply the update, then clear gradients so the next batch starts clean.
    optimizer.step()
    optimizer.zero_grad()

    return loss

In [21]:
from ignite.engine import Engine

# Training engine: applies `train_step` to each batch of the data it is run on.
trainer = Engine(train_step)

In [22]:
from ignite.engine import Events

# Invoke the LR scheduler at the start of every training iteration.
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)

<ignite.engine.events.RemovableEventHandle at 0x1e01cad26e0>

In [23]:
# ProgressBar
from ignite.contrib.handlers import ProgressBar

# Attach the progress bar once, displaying the value returned by the training
# step as "loss". A second attach of the same bar without an output transform
# only registers a redundant handler on the trainer.
pbar = ProgressBar()
pbar.attach(trainer, output_transform=lambda x: {'loss': x})

In [24]:
# Create Evaluator
# Similar to the training `process_function`, we setup a function to evaluate a single batch
#  of train/validation/test data.

def evaluate_step(engine, batch):
    """Predict class indices for one batch without tracking gradients.

    Args:
        engine: the Engine invoking this function (unused here).
        batch: mapping of field name to tensor; must contain "labels".

    Returns:
        dict with 'y_pred' (argmax of the logits over the last dimension)
        and 'y' (the batch labels, on `device`).
    """
    model.eval()

    inputs = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        logits = model(**inputs).logits

    return {'y_pred': logits.argmax(dim=-1), 'y': inputs["labels"]}

In [25]:
# Below we create two engines, a training evaluator and a validation evaluator.
# `train_evaluator` and `validation_evaluator` use the same function but they serve
#  different purposes as we will see later in this tutorial.

# Two independent engines that share the same per-batch evaluation function.
train_evaluator, validation_evaluator = Engine(evaluate_step), Engine(evaluate_step)

In [26]:
# Attach Metrics
from ignite.metrics import Accuracy

def thresholded_output_transform(output, num_classes=None):
    """Convert an evaluation-step output into the `(y_pred, y)` pair for Accuracy.

    The evaluation step returns a dict ``{'y_pred': ..., 'y': ...}``. Tuple
    unpacking a dict yields its *keys* (the strings 'y_pred' and 'y'), so the
    tensors are looked up by key here. A plain ``(y_pred, y)`` pair is still
    accepted.

    When `y_pred` holds class indices with the same number of dimensions as
    `y`, it is one-hot encoded to ``(batch, num_classes)``. Accuracy treats
    same-shaped `y_pred`/`y` as the binary case and rejects values other than
    0 and 1, which multi-class indices would violate.

    Args:
        output: dict with 'y_pred' and 'y' tensors, or a `(y_pred, y)` pair.
        num_classes: number of classes for the one-hot encoding; defaults to
            the notebook-level `num_labels`.

    Returns:
        Tuple `(y_pred, y)`.
    """
    if isinstance(output, dict):
        y_pred, y = output['y_pred'], output['y']
    else:
        y_pred, y = output

    if y_pred.ndim == y.ndim:
        if num_classes is None:
            num_classes = num_labels
        y_pred = torch.nn.functional.one_hot(y_pred.long(), num_classes=num_classes)

    return y_pred, y

# A single Accuracy instance, fed through the output transform, registered
# on both evaluators under the name "accuracy".
metric = Accuracy(output_transform=thresholded_output_transform)

for evaluator in (train_evaluator, validation_evaluator):
    metric.attach(evaluator, "accuracy")


In [27]:
# Log Metrics
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    """After each training epoch, evaluate on the training data and print accuracy."""
    train_evaluator.run(train_dataloader)
    avg_accuracy = train_evaluator.state.metrics['accuracy']
    message = f"Training Results - Epoch: {engine.state.epoch}  Avg accuracy: {avg_accuracy:.3f}"
    print(message)
    
def log_validation_results(engine):
    """After each training epoch, evaluate on the held-out data and print accuracy."""
    validation_evaluator.run(eval_dataloader)
    avg_accuracy = validation_evaluator.state.metrics['accuracy']
    message = f"Validation Results - Epoch: {engine.state.epoch}  Avg accuracy: {avg_accuracy:.3f}"
    print(message)


# Registered explicitly (rather than via decorator) on epoch completion.
trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

<ignite.engine.events.RemovableEventHandle at 0x1e01cad30d0>

In [28]:
# Early Stopping
from ignite.handlers import EarlyStopping

def score_function(engine):
    """Return the 'accuracy' metric stored on the given engine's state."""
    return engine.state.metrics['accuracy']

# Check the validation score each time the validation evaluator finishes a
# run; EarlyStopping is given the trainer, with a patience of 2.
handler = EarlyStopping(
    patience=2,
    score_function=score_function,
    trainer=trainer,
)
validation_evaluator.add_event_handler(Events.COMPLETED, handler)

<ignite.engine.events.RemovableEventHandle at 0x1e01ca4fbe0>

In [29]:
# Model Checkpoint
from ignite.handlers import ModelCheckpoint, global_step_from_engine

# Keep the two checkpoints with the best validation accuracy rather than the
# two most recent epochs. With early stopping (patience=2) the most recent
# epochs are the ones that did not improve, so saving purely by recency
# would discard the best model.
checkpointer = ModelCheckpoint(
    dirname='models',
    filename_prefix='bert-base-cased',
    n_saved=2,
    create_dir=True,
    score_function=score_function,  # validation accuracy read from the evaluator
    score_name='val_accuracy',
    # Label files with the trainer's epoch, not the evaluator's own counter.
    global_step_transform=global_step_from_engine(trainer),
)
# Save after each validation run, once its accuracy has been computed.
validation_evaluator.add_event_handler(Events.COMPLETED, checkpointer, {'model': model})

<ignite.engine.events.RemovableEventHandle at 0x1e073513b80>

In [30]:
# Begin Training!
# Run the trainer over `train_dataloader` for at most `num_epochs` epochs; the
# EarlyStopping handler registered on the validation evaluator is given the
# trainer and a patience of 2.
trainer.run(train_dataloader, max_epochs=num_epochs)

Epoch [1/10]: [1/4116]   0%|          , loss=1.94 [00:00<?]