In [None]:
# Mount Google Drive at /content/drive; the training output_dir used later
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers[torch] datasets evaluate

In [None]:
import evaluate
import numpy as np

from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, DistilBertTokenizerFast
from datasets import load_dataset, ClassLabel, Value, load_metric
from transformers.trainer_callback import EarlyStoppingCallback

In [None]:
# Load the sarcasm-headline dataset by its identifier; its splits are
# accessed by name ('train', 'test') in the cells that follow.
headlines = load_dataset('raquiba/Sarcasm_News_Headline')

In [None]:
# Preview the first 10 rows of the test split with its original column names.
headlines['test'][:10]

In [None]:
# Reshape the schema in one chained pass: 'is_sarcastic' becomes 'labels',
# 'headline' becomes 'text', and the 'article_link' column is dropped.
headlines = (
    headlines
    .rename_column('is_sarcastic', 'labels')
    .rename_column('headline', 'text')
    .remove_columns('article_link')
)
# Same 10-row preview as before, now showing the new column names.
headlines['test'][:10]

In [None]:
# Inspect the train-split feature schema before the label column is cast.
headlines['train'].features

In [None]:
# Two-class label type that replaces the current type of the 'labels' column.
label_feature = ClassLabel(num_classes=2, names=['neg', 'pos'])

# Start from a copy of the existing schema so only 'labels' changes in the cast.
new_features = headlines['train'].features.copy()
new_features['labels'] = label_feature

headlines = headlines.cast(new_features)
headlines['train'].features

In [None]:
# Display the dataset object to review it after the renames and the cast.
headlines

In [None]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint (the same checkpoint
# name the model is loaded from later); use_fast=True requests the fast implementation.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)

def preprocess_function(examples):
  """Tokenize the 'text' entry of a batch of examples.

  Args:
    examples: Mapping with a 'text' entry; that value is handed to the
      module-level `tokenizer` unchanged.

  Returns:
    Whatever `tokenizer` returns for that text with truncation enabled.
    No padding is requested here.
  """
  texts = examples['text']
  return tokenizer(texts, truncation=True)

# Run preprocess_function over the dataset; batched=True hands it batches of
# rows rather than one row at a time.
tokenized_headlines = headlines.map(preprocess_function, batched=True)

In [None]:
# Inspect the train-split schema again now that the tokenizer output has been mapped in.
tokenized_headlines['train'].features

In [None]:
# Collator built around the tokenizer; it is passed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Metrics are loaded through the `evaluate` package: `datasets.load_metric` is
# deprecated and has been removed from recent `datasets` releases.
f1_metric = evaluate.load('f1')
accuracy_metric = evaluate.load('accuracy')

# Class names of the 'labels' column, read from the tokenized train split.
labels = tokenized_headlines['train'].features['labels'].names

def compute_metrics(eval_pred):
  """Compute accuracy and F1 for one evaluation pass.

  Args:
    eval_pred: Pair ``(predictions, label_ids)``. ``predictions`` holds one
      row of per-class scores per example; the predicted class is the argmax
      along axis 1.

  Returns:
    dict with the keys ``'accuracy'`` and ``'f1'``.
  """
  # Local names deliberately differ from the module-level `labels` list.
  logits, references = eval_pred
  predicted_ids = np.argmax(logits, axis=1)
  # The metric API takes the gold labels as `references` (plural). Passing
  # `reference=` leaves `references` unset, and the call then fails with
  # "'NoneType' object is not iterable".
  acc = accuracy_metric.compute(predictions=predicted_ids, references=references)
  # NOTE: micro-averaged F1 equals accuracy for single-label classification,
  # so the two values will match; 'micro' is kept to preserve the reported metric.
  f1 = f1_metric.compute(predictions=predicted_ids, references=references, average='micro')
  return {
      'accuracy': acc['accuracy'],
      'f1': f1['f1']
  }

# Label convention handed to the classification head:
#   1 -> "POSITIVE" (sarcastic)
#   0 -> "NEGATIVE" (not sarcastic)
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# Reverse lookup, derived from id2label so the two mappings always agree.
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
training_args = TrainingArguments(
    # Output location and reporting.
    output_dir="/content/drive/MyDrive/textrec",
    report_to="tensorboard",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing both run on a step-based schedule, and the
    # best checkpoint according to "f1" is reloaded when training ends.
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
# Single callback: early stopping configured with a patience of 3.
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

In [None]:
# NOTE(review): the 'test' split is used as the evaluation set here, so it also
# drives early stopping and best-checkpoint selection. Confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_headlines['train'],
    eval_dataset=tokenized_headlines['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=callbacks,
)

In [None]:
# Start fine-tuning with the trainer built in the previous cell.
trainer.train()

# Troubleshooting: if this fails with "'NoneType' object is not iterable" during
# evaluation, the most likely cause is compute_metrics not passing the gold labels
# as `references=` (plural) to a metric's compute() call.