In [1]:
# we need to install HuggingFace datasets library
!pip install datasets

In [2]:
import pandas as pd
import numpy as np

# to disable wandb logging
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import torch

# Transformers
# installed with pip command above
from datasets import Dataset

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# Kaggle mounts the competition data read-only under /kaggle/input;
# walk that tree and print the full path of every file found there
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

### reading the entire training dataset

In [3]:
# location of the training CSV inside the Kaggle input directory
TRAIN_FILE = "/kaggle/input/nlp-getting-started/train.csv"

# load the whole training table into a DataFrame
orig_train = pd.read_csv(TRAIN_FILE)

# number of distinct values in the target column (passed later as num_labels to the model)
target_column = orig_train["target"]
NUM_LABELS = target_column.nunique()

### split in train/validation set and keep only useful columns

In [4]:
# do the train/validation split

SEED = 1432
VALID_FRAC = 0.2
USED_COLUMNS = ['text', 'target']

# the split is driven by SEED so that changing the constant above actually changes the split
# (a hard-coded random_state was used before and SEED was never read)
train_df, valid_df = train_test_split(orig_train, test_size=VALID_FRAC, random_state=SEED)

# sanity check: every row ends up in exactly one of the two splits
assert len(train_df) + len(valid_df) == len(orig_train)

# keep only the tweet text and its label
train_df = train_df[USED_COLUMNS]
valid_df = valid_df[USED_COLUMNS]

print(f"There are {train_df.shape[0]} samples in train set")
print(f"There are {valid_df.shape[0]} samples in valid set")

# rename target to label (the column name passed to the Trainer datasets below)
train_df = train_df.rename(columns={"target": "label"})
valid_df = valid_df.rename(columns={"target": "label"})

### create HF datasets

To prepare the data for training we need a train and a validation dataset in which the text has been tokenized and encoded.

In [5]:
# build the Dataset objects that transformers works with, one per split;
# the pandas index is reset and dropped first (presumably so the shuffled
# row index is not carried over into the Dataset)
train_frame = train_df.reset_index(drop=True)
valid_frame = valid_df.reset_index(drop=True)

ds_train = Dataset.from_pandas(train_frame)
ds_valid = Dataset.from_pandas(valid_frame)

In [6]:
# display the features of the training Dataset as the cell output
ds_train.features

### Tokenization

In [7]:
# Name of the pre-trained transformer checkpoint used in this notebook (roberta-large).
# The same constant is reused below to load the model and to name the output directory.
MODEL_CKPT = "roberta-large"

# load the tokenizer that corresponds to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

In [8]:
# applied to both the train and the validation set to add the encoded-token columns
def tokenize(batch):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Args:
        batch: mapping that holds the raw texts under the key "text".

    Returns:
        Whatever the tokenizer returns for those texts, called with padding
        enabled, truncation enabled and a maximum length of 50 tokens.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=50)
    return encoded

In [9]:
# final HF datasets: run tokenize() over both splits with the same map settings
# (batched=True together with batch_size=None passes each split as a single batch)
ds_train_encoded, ds_valid_encoded = (
    split.map(tokenize, batched=True, batch_size=None)
    for split in (ds_train, ds_valid)
)

In [10]:
# have a look at the encoded training dataset (displayed as the cell output)
ds_train_encoded

As we can see, two columns have been added: `input_ids` and `attention_mask`. They will be used during training.

In [11]:
# run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# load the pre-trained checkpoint for sequence classification with
# num_labels=NUM_LABELS, then move it to the selected device
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CKPT, num_labels=NUM_LABELS)
model = model.to(device)

In [12]:
# metrics callback: computes accuracy and f1-score during the validation phases

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one set of predictions.

    Args:
        pred: object exposing ``label_ids`` (the true labels) and
            ``predictions`` (an array whose last axis is arg-maxed to get
            the predicted class).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    # the predicted class is the index of the largest value along the last axis
    y_pred = pred.predictions.argmax(-1)

    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    accuracy = accuracy_score(y_true, y_pred)

    return dict(accuracy=accuracy, f1=weighted_f1)

In [13]:
# params for training
BATCH_SIZE = 32

# be careful: a checkpoint is written at the end of every epoch, so with many
# epochs the local storage can fill up (see save_total_limit below)
EPOCHS = 3
LR = 1e-5
W_DECAY = 0.01

# log once every (dataset size // batch size) steps, i.e. about once per epoch
# on a single device; never let it drop to 0 when the dataset is smaller than one batch
logging_steps = max(1, len(ds_train_encoded) // BATCH_SIZE)

# directory where checkpoints are written
model_name = f"{MODEL_CKPT}-finetuned-tweets"

# NOTE(review): recent transformers releases rename evaluation_strategy to
# eval_strategy -- confirm against the installed version before upgrading
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=EPOCHS,
                                  learning_rate=LR,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  weight_decay=W_DECAY,
                                  # evaluate and save on the same schedule, as
                                  # required by load_best_model_at_end
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  # cap the number of checkpoints kept on disk; the best
                                  # one is still retained for load_best_model_at_end
                                  save_total_limit=1,
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error",
                                  load_best_model_at_end=True,
                                  # make the training run use the notebook's SEED
                                  # instead of an implicit default
                                  seed=SEED,
                                  # to disable wandb logging
                                  report_to="none"
                                 )

In [14]:
#
# build the Trainer and run the fine-tuning
#
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train_encoded,
    eval_dataset=ds_valid_encoded,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# the trailing semicolon keeps the return value of train() out of the cell output
trainer.train();

In [15]:
# score the best model (chosen on valid_loss) on the validation set;
# predict() reports the metrics under a "test_" prefix, hence the 'test_f1' key
valid_output = trainer.predict(ds_valid_encoded)
f1 = valid_output.metrics['test_f1']

print(f"F1 score is: {round(f1, 4)}")

### final remarks:
* as you can see, the score is good: F1 = 0.84. Obviously, it depends on the particular train/validation split that was done
* a better result could be obtained, for example, by using a k-fold split; the final result is then a set of k models, and you need to average their predictions

I have not included here the code to make predictions on the test set. It is not difficult. Only one thing: you need to do it in batches (to avoid running out of memory on the GPU). It is left as an exercise.