In [1]:
# Map each class id (as a string key) to its human-readable category name.
id2label = {
    str(class_id): class_name
    for class_id, class_name in enumerate(("World", "Sports", "Business", "Sci/Tech"))
}

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
# Root folder of the BERT use case whose CSV splits are read below.
base_dir_bert = "use_case/hf-bert-ag_news-0-1-2-split66"


def _read_bert_split(file_name):
    """Read one CSV file from the ``dataset`` folder under ``base_dir_bert``."""
    return pd.read_csv(os.path.join(base_dir_bert, "dataset", file_name))


df_train_filtered = _read_bert_split("df_train_0_1_2_split_66.csv")
df_test_filtered = _read_bert_split("df_test_0_1_2_split_66.csv")
df_new_unseen = _read_bert_split("df_new_unseen_0_1_2_split_66.csv")
df_drifted = _read_bert_split("df_drifted_3_split_66.csv")

In [4]:
import re

def clean_text(text):
    """Strip mentions, URLs, hashtags and markup remnants from ``text``.

    The substitutions are applied in the order listed below, each one on the
    result of the previous one. The last step collapses runs of spaces into a
    single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw string literals are used so that regex escapes are not first
    # interpreted (and warned about) as Python string escapes.
    substitutions = (
        (r"@\S+", " "),              # Remove mentions
        (r"https*\S+", " "),         # Remove URLs
        (r"#\S+", " "),              # Remove hashtags
        (r"&lt;/?[a-z]+&gt;", ""),   # Remove HTML-escaped tags
        # NOTE(review): this step looks unreachable, because the hashtag
        # pattern above already consumes any "#39..." token. Confirm whether
        # it was meant to run before the hashtag removal.
        (r"#39", " "),               # Remove special characters
        (r"<.*?>", ""),              # Remove HTML tags
        (r" +", " "),                # Merge multiple blank spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def get_label_name(label_id):
    """Return the category name stored at position ``label_id``.

    ``label_id`` is used directly as an index into the list of names, so the
    usual list indexing rules apply (an out-of-range index raises
    ``IndexError``).
    """
    category_names = "World Sports Business Sci/Tech".split()
    return category_names[label_id]

In [5]:
# All class ids.
label_list = list(range(4))
# Class ids used for training (its length sets the model's number of labels).
training_label_list = label_list[:3]
# Remaining class id, kept separate from the training ids.
drift_label_list = label_list[3:]

In [6]:
import os

# Root folder for the artifacts written by this notebook.
base_dir = "use_case/hf-roberta-ag_news-0-1-2-split66"

# Create the target folder up front: `to_csv` does not create missing directories.
dataset_dir = os.path.join(base_dir, "dataset")
os.makedirs(dataset_dir, exist_ok=True)

# index=False keeps the written files column-for-column identical to the frames.
# Writing the index would add one more unnamed column every time a file is
# saved and read back.
df_train_filtered.to_csv(os.path.join(dataset_dir, "df_train_0_1_2_split_66.csv"), index=False)
df_test_filtered.to_csv(os.path.join(dataset_dir, "df_test_0_1_2_split_66.csv"), index=False)
df_new_unseen.to_csv(os.path.join(dataset_dir, "df_new_unseen_0_1_2_split_66.csv"), index=False)
df_drifted.to_csv(os.path.join(dataset_dir, "df_drifted_3_split_66.csv"), index=False)

In [7]:
# Print the shape of the train, test and new-unseen frames, one per line.
print(
    df_train_filtered.shape,
    df_test_filtered.shape,
    df_new_unseen.shape,
    sep="\n",
)


(59480, 4)
(5700, 4)
(30520, 4)


In [8]:
# Checkpoint identifier passed to the tokenizer and model `from_pretrained` calls.
model_name = "roberta-base"

In [9]:
import datasets
from datasets import Dataset, DatasetDict


# Wrap the pandas frames as `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(df_train_filtered)
validation_dataset = Dataset.from_pandas(df_test_filtered)

# Fixed seed so the shuffled row order is the same on every run of the notebook.
shuffle_seed = 42

# Collect both shuffled splits under the names used by the rest of the notebook.
ds = DatasetDict()

ds['train'] = train_dataset.shuffle(seed=shuffle_seed)
ds['validation'] = validation_dataset.shuffle(seed=shuffle_seed)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from transformers import AutoTokenizer

# Load the tokenizer that matches `model_name`.
# NOTE(review): `do_lower_case` is typically a BERT-style tokenizer option; confirm
# whether the tokenizer loaded for this checkpoint honors it or ignores it.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)


In [11]:
def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the notebook-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Run `tokenize_function` over every split in `ds`; batched=True passes it batches of rows.
tokenized_datasets = ds.map(tokenize_function, batched=True)

100%|██████████| 60/60 [00:10<00:00,  5.72ba/s]
100%|██████████| 6/6 [00:00<00:00,  6.42ba/s]


In [12]:
# Category name -> class id for the classes passed to the model configuration.
label2id_train = {
    class_name: class_id
    for class_id, class_name in enumerate(("World", "Sports", "Business"))
}

In [13]:
from transformers import AutoModelForSequenceClassification

# Load the `model_name` checkpoint as a sequence-classification model with one
# output per training label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(training_label_list),
    label2id=label2id_train,
    # Inverse of `label2id_train`: class id -> category name.
    id2label={class_id: class_name for class_name, class_id in label2id_train.items()},
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [14]:
import sklearn

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation.

    A per-class ``classification_report`` is also printed.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions``. ``predictions`` is either an array of per-class
            scores, or a tuple/list whose first element is that array.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.metrics` has been loaded.
    from sklearn import metrics as sk_metrics

    labels = pred.label_ids

    # Some models return a tuple of outputs; the scores are its first element.
    # This is checked explicitly rather than with a bare `except`, which would
    # also hide unrelated errors.
    scores = pred.predictions
    if isinstance(scores, (tuple, list)):
        scores = scores[0]
    preds = scores.argmax(-1)

    # Restrict the macro average to the classes present in the reference labels.
    precision, recall, f1, _ = sk_metrics.precision_recall_fscore_support(
        labels, preds, average="macro", labels=sorted(set(labels))
    )
    print(sk_metrics.classification_report(labels, preds, digits=4))
    acc = sk_metrics.accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir=os.path.join(base_dir, "saved_model", "checkpoint"),
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the highest F1 once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [16]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 59480
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 5700
    })
})

In [17]:
from transformers import Trainer

# Wire the model, its arguments, the tokenized splits and the metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [18]:
# Run training; as the last expression of the cell, the returned value is displayed.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, text, id. If Unnamed: 0, text, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 59480
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 11154


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1162,0.122891,0.970175,0.970247,0.970466,0.970175
2,0.0816,0.119653,0.975439,0.975448,0.975482,0.975439
3,0.0514,0.128941,0.974386,0.974387,0.974395,0.974386


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, text, id. If Unnamed: 0, text, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5700
  Batch size = 16


<transformers.trainer_utils.EvalPrediction object at 0x7fdb93243fa0>
              precision    recall  f1-score   support

           0     0.9496    0.9726    0.9610      1900
           1     0.9930    0.9763    0.9846      1900
           2     0.9687    0.9616    0.9651      1900

    accuracy                         0.9702      5700
   macro avg     0.9705    0.9702    0.9702      5700
weighted avg     0.9705    0.9702    0.9702      5700



Saving model checkpoint to use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-3718
Configuration saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-3718/config.json
Model weights saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-3718/pytorch_model.bin
tokenizer config file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-3718/tokenizer_config.json
Special tokens file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-3718/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, text, id. If Unnamed: 0, text, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5700
  Batch size = 16


<transformers.trainer_utils.EvalPrediction object at 0x7fdb946fec10>
              precision    recall  f1-score   support

           0     0.9709    0.9647    0.9678      1900
           1     0.9905    0.9868    0.9887      1900
           2     0.9651    0.9747    0.9699      1900

    accuracy                         0.9754      5700
   macro avg     0.9755    0.9754    0.9754      5700
weighted avg     0.9755    0.9754    0.9754      5700



Saving model checkpoint to use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-7436
Configuration saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-7436/config.json
Model weights saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-7436/pytorch_model.bin
tokenizer config file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-7436/tokenizer_config.json
Special tokens file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-7436/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, text, id. If Unnamed: 0, text, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5700
  Batch size = 16


<transformers.trainer_utils.EvalPrediction object at 0x7fdb93243400>
              precision    recall  f1-score   support

           0     0.9639    0.9684    0.9661      1900
           1     0.9895    0.9895    0.9895      1900
           2     0.9699    0.9653    0.9676      1900

    accuracy                         0.9744      5700
   macro avg     0.9744    0.9744    0.9744      5700
weighted avg     0.9744    0.9744    0.9744      5700



Saving model checkpoint to use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-11154
Configuration saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-11154/config.json
Model weights saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-11154/pytorch_model.bin
tokenizer config file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-11154/tokenizer_config.json
Special tokens file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-11154/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/checkpoint/checkpoint-7436 (score: 0.9754479209615114).


TrainOutput(global_step=11154, training_loss=0.09444044934649481, metrics={'train_runtime': 5872.0206, 'train_samples_per_second': 30.388, 'train_steps_per_second': 1.9, 'total_flos': 4.694995825938432e+16, 'train_loss': 0.09444044934649481, 'epoch': 3.0})

In [19]:
# Save the trainer's current model under `saved_model/best_model`.
best_model_dir = os.path.join(base_dir, "saved_model", "best_model")
trainer.save_model(best_model_dir)

Saving model checkpoint to use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/best_model
Configuration saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/best_model/config.json
Model weights saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/best_model/pytorch_model.bin
tokenizer config file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/best_model/tokenizer_config.json
Special tokens file saved in use_case/hf-roberta-ag_news-0-1-2-split66/saved_model/best_model/special_tokens_map.json


In [20]:
# Print a banner, then the metrics dict returned by evaluating on the validation split.
evaluation_banner = "\n ----------------- EVALUATION BEST MODEL ON VALIDATION SET ----------------- \n"
print(evaluation_banner)
validation_metrics = trainer.evaluate()
print(validation_metrics)

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, text, id. If Unnamed: 0, text, id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5700
  Batch size = 16



 ----------------- EVALUATION BEST MODEL ON VALIDATION SET ----------------- 



<transformers.trainer_utils.EvalPrediction object at 0x7fdb93adf250>
              precision    recall  f1-score   support

           0     0.9709    0.9647    0.9678      1900
           1     0.9905    0.9868    0.9887      1900
           2     0.9651    0.9747    0.9699      1900

    accuracy                         0.9754      5700
   macro avg     0.9755    0.9754    0.9754      5700
weighted avg     0.9755    0.9754    0.9754      5700

{'eval_loss': 0.11965293437242508, 'eval_accuracy': 0.9754385964912281, 'eval_f1': 0.9754479209615114, 'eval_precision': 0.9754819700089796, 'eval_recall': 0.9754385964912281, 'eval_runtime': 59.686, 'eval_samples_per_second': 95.5, 'eval_steps_per_second': 5.981, 'epoch': 3.0}
