Hi, this is just a quick submission without any rocket science involved. I will definitely make more submissions, but please consider this one for now. It should give around `.91` test accuracy; however, results may vary slightly between runs because neither the transformers library nor PyTorch is fully deterministic yet. Note that the [deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) model I use here is open-source and is a base model.

Dmitrii Evdokimov (dmevdok)

In [None]:
# `evaluate` is imported by the next cell, so install it together with the other libraries.
!pip install transformers datasets evaluate

In [2]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, DataCollatorWithPadding, DataCollator
import os
import evaluate
from transformers import set_seed
from datasets import load_dataset
import torch
import numpy as np

# Random seed shared by the dataset shuffle and by set_seed() inside model_init.
SEED = 42

class Embedding(torch.nn.Module):
    """Thin module around the pretrained ``microsoft/deberta-v3-base`` backbone.

    When ``trainable`` is false the backbone runs with gradient tracking
    switched off; otherwise the caller's current gradient mode is left as is.
    Hidden states and attentions are always requested from the backbone.
    """

    def __init__(self, trainable=True):
        super().__init__()
        self.trainable = trainable
        # NOTE(review): num_labels looks unnecessary for a headless AutoModel — confirm.
        self.embedding = AutoModel.from_pretrained('microsoft/deberta-v3-base', num_labels=3)

    def forward(self, input_ids, attention_mask):
        """Run the backbone on a batch and return its output unchanged."""
        # Only ever *narrow* the gradient mode: a frozen backbone disables
        # tracking, a trainable one inherits whatever the caller has set
        # (e.g. an enclosing torch.no_grad() stays in effect).
        track_gradients = bool(self.trainable) and torch.is_grad_enabled()
        with torch.set_grad_enabled(track_gradients):
            return self.embedding(
                input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
                output_attentions=True,
            )

class Wrapper(torch.nn.Module):
    """Sequence classifier: a two-layer MLP head on top of a backbone module.

    The first-token vectors of the last three entries of the backbone's
    ``hidden_states`` are concatenated (``3 * embedding_size`` features) and
    fed through ``Linear -> ReLU -> Linear -> Dropout`` to produce
    ``n_classes`` logits.

    Args:
        wrapped: backbone module. It must expose ``embedding.encoder.layer``
            and, when called with ``input_ids`` / ``attention_mask`` keywords,
            return an object with a ``hidden_states`` sequence.
        embedding_size: width of a single hidden-state vector.
        n_classes: number of output logits.
        dropout: probability of the final dropout layer.
        head_dimension: accepted for backward compatibility; currently unused.
        random_state: stored on the instance only; seeding is not done here.
    """

    def __init__(self, wrapped, embedding_size=768, n_classes=3, dropout=0.2, head_dimension=1000, random_state=42):
        super().__init__()
        self.random_state = random_state
        self.wrapped = wrapped
        self.n_transformer_layers = len(wrapped.embedding.encoder.layer)
        self.embedding_size = embedding_size
        self.n_classes = n_classes

        pooled_size = self.embedding_size * 3
        # NOTE(review): dropout sits after the last Linear, i.e. it zeroes whole
        # logits during training. That is unusual, but the layer order is kept so
        # that `head.<index>` parameter names and training behaviour do not change.
        self.head = torch.nn.Sequential(
            torch.nn.Linear(pooled_size, pooled_size),
            torch.nn.ReLU(),
            torch.nn.Linear(pooled_size, self.n_classes),
            torch.nn.Dropout(dropout),
        )
        # Re-initialise both linear layers (indices 0 and 2 of the head):
        # weights ~ N(0, 0.02), biases zero.
        for linear in (self.head[0], self.head[2]):
            linear.weight.data.normal_(mean=0., std=.02)
            linear.bias.data.zero_()
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Compute logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            input_ids: token ids forwarded to the backbone.
            labels: optional class indices; omit for label-free inference.
            attention_mask: optional mask forwarded to the backbone.

        Returns:
            ``{'loss': ..., 'logits': ...}`` when ``labels`` is provided,
            otherwise ``{'logits': ...}``.
        """
        wrapped_out = self.wrapped(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = wrapped_out.hidden_states
        # First-token vector from each of the last three hidden states.
        pooled = torch.cat(
            [
                hidden_states[-1][:, 0, :],
                hidden_states[-2][:, 0, :],
                hidden_states[-3][:, 0, :],
            ],
            -1
        )
        # The head already emits exactly n_classes columns, so the logits are
        # returned as they are instead of being rebuilt from columns 0..2 only.
        logits = self.head(pooled)
        if labels is None:
            return {'logits': logits}
        return {
            'loss': self.criterion(logits, labels),
            'logits': logits
        }

def model_init():
    """Build a fresh ``Wrapper`` around a new ``Embedding`` backbone.

    The global seed is reset to ``SEED`` first, before either module is
    constructed.
    """
    set_seed(SEED)
    backbone = Embedding()
    return Wrapper(backbone)

# Accuracy metric object; compute_metrics calls metric.compute(...) on it.
metric = evaluate.load('accuracy')

# Switch off the Weights & Biases integration for this run.
os.environ["WANDB_DISABLED"] = "true"

# Tokenizer for the same checkpoint as the backbone. A tokenizer has no notion
# of a label count, so no num_labels argument is passed here.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
# Collator built from the tokenizer; handed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the dataset and shuffle it with the shared seed.
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment").shuffle(SEED)

def tokenize(t):
    """Run the module-level tokenizer over the ``text`` field of ``t``."""
    texts = t['text']
    return tokenizer(texts)


# Apply tokenize across the dataset in batched mode.
tokenized_data = dataset.map(tokenize, batched=True)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; each row's prediction is
    the index of its largest logit along the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# The first N_TRAIN_EXAMPLES rows of the (already shuffled) train split are used
# for training; everything after them is held out for periodic evaluation.
# The end of the held-out range is taken from the split itself rather than
# being hard-coded.
N_TRAIN_EXAMPLES = 9000
train_split = tokenized_data['train']

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # A float below 1 is a fraction of the total number of training steps,
    # i.e. evaluate ten times over the whole run.
    eval_steps=0.1,
    evaluation_strategy="steps",
    # Make the training seed explicit and tied to the notebook-wide constant.
    seed=SEED,
    # Supported way to turn off logging integrations (replaces the deprecated
    # WANDB_DISABLED environment variable).
    report_to="none",
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_split.select(range(N_TRAIN_EXAMPLES)),
    eval_dataset=train_split.select(range(N_TRAIN_EXAMPLES, len(train_split))),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
282,No log,0.347893,0.856354
564,0.573500,0.362159,0.882136
846,0.573500,0.395049,0.883978
1128,0.363200,0.482597,0.883978
1410,0.363200,0.457,0.88582
1692,0.260400,0.531736,0.893186
1974,0.260400,0.781823,0.893186
2256,0.177900,0.848472,0.882136
2538,0.124600,0.868692,0.895028


TrainOutput(global_step=2815, training_loss=0.2785524737559669, metrics={'train_runtime': 1165.5339, 'train_samples_per_second': 38.609, 'train_steps_per_second': 2.415, 'total_flos': 0.0, 'train_loss': 0.2785524737559669, 'epoch': 5.0})

## Test

In [3]:
# Logits for the held-out `validation` split, which serves as the test set here.
# No ignore_keys are needed: the model output only contains `loss` and `logits`.
submission_prediction = trainer.predict(tokenized_data['validation']).predictions


def accuracy_metric(pred, true):
    """Return the fraction of rows in ``pred`` whose argmax equals ``true``.

    Args:
        pred: 2-D array of per-class scores, one row per example.
        true: sequence of reference class indices, one per row of ``pred``.
    """
    predicted_classes = np.argmax(pred, axis=1)
    # Convert explicitly so the comparison is element-wise for plain lists too.
    return np.mean(predicted_classes == np.asarray(true))


accuracy_metric(
    submission_prediction,
    dataset['validation']['label']
)

0.9158291457286433