In [None]:
# !pip install adapter-transformers
# !pip install datasets
# !pip install evaluate
# !pip install sentencepiece

In [2]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os, sys
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"



import json
from transformers import AutoTokenizer, AdapterConfig, AutoAdapterModel, AutoConfig
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction, TrainerCallback
from transformers import AutoModelForSequenceClassification, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy, DataCollatorForLanguageModeling
from transformers.adapters.composition import Stack
from datasets import Dataset, DatasetDict

from tqdm import tqdm

import numpy as np
from datasets import concatenate_datasets, load_metric
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import random
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df_subtask_2_es = pd.read_csv("../data/subtask_2/es/train.tsv", sep='\t')
df_subtask_2_es=df_subtask_2_es.drop(df_subtask_2_es.columns[0], axis=1)
df_subtask_2_es

Unnamed: 0,text,label
0,"Sin embargo, los jóvenes son capaces de recono...",B
1,¿Hay algo más que quieras compartir? ¿Algo sob...,B
2,"El servicio de sala es bueno, rápido y amabilí...",B
3,"Para concentrarse en el hablante, trata de des...",F
4,Los responsables locales tendrán ahora que esp...,F
...,...,...
21930,Escribe un correo electrónico a PI:EMAIL y cué...,A
21931,"Los osos negros y grizzlies de Yellowstone, ta...",C
21932,"El lenguaje es un recurso para comunicarse, pe...",A
21933,"Dr, creo que las ostras son animales No, las o...",F


In [4]:
df_subtask_2_en = pd.read_csv("../data/subtask_2/en/train.tsv", sep='\t')
df_subtask_2_en=df_subtask_2_en.drop(df_subtask_2_en.columns[0], axis=1)
df_subtask_2_en

Unnamed: 0,text,label
0,It was not until many years later that it coul...,A
1,Users can then pin these images to their profi...,F
2,The best songs are those that I can sing along...,B
3,I found this book to be poorly written. It was...,D
4,Regulates the application of the EU tariff quo...,E
...,...,...
22411,The ministry had earlier said it was working w...,B
22412,"Once combined, slowly add the melted butter, s...",F
22413,Hightower for this new and wellwritten book!. ...,B
22414,All throughout the whole book this author has ...,D


In [5]:
len(df_subtask_2_en.label.unique())

6

In [6]:
mapping = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5
}
df_subtask_2_en["label"] = df_subtask_2_en['label'].map(mapping)

In [7]:
df_subtask_2_en.head()

Unnamed: 0,text,label
0,It was not until many years later that it coul...,0
1,Users can then pin these images to their profi...,5
2,The best songs are those that I can sing along...,1
3,I found this book to be poorly written. It was...,3
4,Regulates the application of the EU tariff quo...,4


In [8]:
df_subtask_2_en.label.unique()

array([0, 5, 1, 3, 4, 2])

In [9]:
from sklearn.model_selection import train_test_split

dataset_train, dataset_test = train_test_split(df_subtask_2_en, test_size=0.1, random_state=42)

print(len(dataset_train))
print(len(dataset_test))

20174
2242


In [10]:
dataset_train.head()

Unnamed: 0,text,label
14563,To hold the screwjoint in place with your left...,5
13244,LOVE the song Nina Someday Ninas song Someday ...,5
3847,The total amount to be disposed off in accorda...,0
19462,The accident sparked a major debate over wheth...,2
15578,PHOTO: REUTERS. LONDON (REUTERS) The head of a...,2


In [11]:
dataset_test.head()

Unnamed: 0,text,label
8059,That way you can see what happens to it as its...,0
20481,It is unclear whether it will be available for...,2
21223,Type control panel and press ↵ Enter. This wil...,5
5194,Some gyms offer a freeze option that allows yo...,5
134,He was speaking after the kingdoms ambassador ...,3


In [12]:
# language_model = "distilbert-base-cased"
# language_model = "prajjwal1/bert-tiny"
# language_model = "microsoft/deberta-v3-base"
language_model = "xlm-roberta-base"
# language_model = "bert-base-multilingual-cased"
# language_model = "roberta-base-openai-detector"
# language_model = "Hello-SimpleAI/chatgpt-detector-roberta"



tokenizer = AutoTokenizer.from_pretrained(language_model, num_labels=len(df_subtask_2_en.label.unique()))



dataset_train = Dataset.from_pandas(dataset_train)
dataset_test = Dataset.from_pandas(dataset_test)

def encode_batch(batch):
    """Encodes a batch of input data using the model tokenizer."""
    return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")


dataset_train = dataset_train.rename_column("label", "labels")
dataset_train = dataset_train.map(encode_batch, batched=True)
dataset_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

dataset_test = dataset_test.map(encode_batch, batched=True)
dataset_test = dataset_test.rename_column("label", "labels")
dataset_test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

                                                                    

In [13]:
dataset_train.to_pandas().head()

Unnamed: 0,text,labels,__index_level_0__,input_ids,attention_mask
0,To hold the screwjoint in place with your left...,5,14563,"[0, 717, 16401, 70, 229319, 513, 4288, 23, 368...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,LOVE the song Nina Someday Ninas song Someday ...,5,13244,"[0, 129138, 70, 11531, 50030, 31384, 5636, 500...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,The total amount to be disposed off in accorda...,0,3847,"[0, 581, 3622, 41170, 47, 186, 60458, 71, 5773...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,The accident sparked a major debate over wheth...,2,19462,"[0, 581, 27998, 131999, 297, 10, 13036, 29865,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,PHOTO: REUTERS. LONDON (REUTERS) The head of a...,2,15578,"[0, 201154, 12, 9069, 17632, 63175, 5, 6, 1066...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Modelling

In [14]:
model = AutoAdapterModel.from_pretrained(language_model, num_labels=6)

task_name = "subtask2"
lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)

model.load_adapter("en/wiki@ukp", config=lang_adapter_config)


model.add_adapter(task_name)
model.add_classification_head(task_name, num_labels=6)
model.train_adapter([task_name])

model.active_adapters = Stack("en", task_name)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaAdapterModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for prediction

In [15]:
 
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
early_stop = EarlyStoppingCallback(3)

training_args = TrainingArguments(
    learning_rate=1e-6,
    num_train_epochs=10,
    seed = 42,
    output_dir="./training_output",
    # label_names=["generated", "human"]
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=32,
    logging_steps=100,
    save_total_limit = 2,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='tensorboard',
    metric_for_best_model='f1'
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks = [early_stop]
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `XLMRobertaAdapterModel.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `XLMRobertaAdapterModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20174
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3160
  Number of trainable parameters = 1489734


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.8015,1.804816,0.158341,0.049132,0.042839,0.166209
2,1.8023,1.7999,0.160125,0.057481,0.052148,0.168113
3,1.7967,1.797295,0.164585,0.066594,0.055175,0.17284
4,1.796,1.795774,0.165031,0.069026,0.05485,0.173327
5,1.7945,1.794926,0.173506,0.080048,0.061462,0.182296
6,1.7939,1.79443,0.178858,0.087479,0.063221,0.188001
7,1.7928,1.794044,0.17975,0.088131,0.063092,0.188948
8,1.7944,1.793806,0.181088,0.088585,0.064115,0.190345
9,1.7933,1.79362,0.181534,0.089324,0.06391,0.190829
10,1.7915,1.793543,0.18198,0.089587,0.063322,0.191307


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaAdapterModel.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `XLMRobertaAdapterModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2242
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./training_output/checkpoint-316
Configuration saved in ./training_output/checkpoint-316/en/adapter_config.json
Module weights saved in ./training_output/checkpoint-316/en/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-316/subtask2/adapter_config.json
Module weights saved in ./training_output/checkpoint-316/subtask2/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-316/subtask2/head_config.json
Module weights saved in ./training_output/checkpoint-316/subtask2/pytorch_model_head.bin
Configuration saved in ./traini

TrainOutput(global_step=3160, training_loss=1.7960327583023263, metrics={'train_runtime': 370.2587, 'train_samples_per_second': 544.862, 'train_steps_per_second': 8.535, 'total_flos': 9153260321990400.0, 'train_loss': 1.7960327583023263, 'epoch': 10.0})

In [16]:
t_metrics = trainer.evaluate(dataset_test)
t_metrics

The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaAdapterModel.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `XLMRobertaAdapterModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2242
  Batch size = 64


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.793542742729187,
 'eval_accuracy': 0.18198037466547726,
 'eval_f1': 0.08958737324454062,
 'eval_precision': 0.06332167397403946,
 'eval_recall': 0.19130747582471064,
 'eval_runtime': 2.9999,
 'eval_samples_per_second': 747.352,
 'eval_steps_per_second': 12.0,
 'epoch': 10.0}

In [17]:
pd.DataFrame([t_metrics])

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,1.793543,0.18198,0.089587,0.063322,0.191307,2.9999,747.352,12.0,10.0
