In [None]:
# !pip install adapter-transformers
# !pip install datasets
# !pip install evaluate

1. mbert-en
2. mbert-es

In [2]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os, sys
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"



import json
from transformers import AutoTokenizer, AdapterConfig, AutoAdapterModel, AutoConfig
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction, TrainerCallback
from transformers import AutoModelForSequenceClassification, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict

from tqdm import tqdm

import numpy as np
from datasets import concatenate_datasets, load_metric
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import random
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df_en = pd.read_csv("../data/subtask_2/en/train.tsv", sep='\t')
df_en=df_en.drop(df_en.columns[0], axis=1)

df_es = pd.read_csv("../data/subtask_2/es/train.tsv", sep='\t')
df_es=df_es.drop(df_es.columns[0], axis=1)

In [4]:
mapping = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5
}
df_en["label"] = df_en['label'].map(mapping)
df_es["label"] = df_es['label'].map(mapping)

In [5]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, concatenate_datasets

# Split your data into train and test sets
dataset_train_en, dataset_test_en = train_test_split(df_en, test_size=0.1, random_state=42)
dataset_train_es, dataset_test_es = train_test_split(df_es, test_size=0.1, random_state=42)

# Further split your train data into train and validation sets
dataset_train_en, dataset_valid_en = train_test_split(dataset_train_en, test_size=0.1, random_state=42)
dataset_train_es, dataset_valid_es = train_test_split(dataset_train_es, test_size=0.1, random_state=42)


# Concatenate the datasets
dataset_train_compl = pd.concat([dataset_train_en, dataset_train_es])
dataset_valid_compl = pd.concat([dataset_valid_en, dataset_valid_es])
dataset_test_compl = pd.concat([dataset_test_en, dataset_test_es])

print(len(dataset_train_compl))
print(len(dataset_valid_compl))
print(len(dataset_test_compl))

35922
3993
4436


In [6]:
# language_model = "xlm-roberta-base"
# language_model = "bert-base-multilingual-cased"
# language_model = "microsoft/deberta-v3-base"
# language_model = "prajjwal1/bert-tiny"
language_model = "distilbert-base-cased"
# language_model = "roberta-base-openai-detector"
# language_model = "Hello-SimpleAI/chatgpt-detector-roberta"


tokenizer = AutoTokenizer.from_pretrained(language_model)

dataset_train = Dataset.from_pandas(dataset_train_compl)
dataset_valid = Dataset.from_pandas(dataset_valid_compl)
dataset_test = Dataset.from_pandas(dataset_test_compl)

def encode_batch(batch):
    """Encodes a batch of input data using the model tokenizer."""
    return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")


dataset_train = dataset_train.rename_column("label", "labels")
dataset_train = dataset_train.map(encode_batch, batched=True)
dataset_train.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

dataset_valid = dataset_valid.rename_column("label", "labels")
dataset_valid = dataset_valid.map(encode_batch, batched=True)
dataset_valid.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

dataset_test = dataset_test.map(encode_batch, batched=True)
dataset_test = dataset_test.rename_column("label", "labels")
dataset_test.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

                                                                    

In [7]:
dataset_train.to_pandas().head()

Unnamed: 0,text,labels,__index_level_0__,input_ids,attention_mask
0,"For example, you can use verbal cues like sit,...",5,3211,"[101, 1370, 1859, 117, 1128, 1169, 1329, 14093...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,The statement made by Jane Collins MP is compl...,5,14617,"[101, 1109, 4195, 1189, 1118, 4074, 6266, 5478...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,The government defended the arrest of the two ...,4,2316,"[101, 1109, 1433, 7607, 1103, 6040, 1104, 1103...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"When it comes to trimming your beard, you’ll n...",4,14340,"[101, 1332, 1122, 2502, 1106, 13373, 5031, 124...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,the lyrics are good and the vocals of amanda s...,2,20228,"[101, 1103, 4017, 1132, 1363, 1105, 1103, 2172...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Modelling

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(language_model, num_labels=len(dataset_train_compl.label.unique()), ignore_mismatched_sizes=True)
  
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
early_stop = EarlyStoppingCallback(3)

training_args = TrainingArguments(
    learning_rate=1e-6,
    num_train_epochs=10,
    seed = 42,
    output_dir="./training_output2",
    # label_names=["generated", "human"]
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=32,
    logging_steps=100,
    save_total_limit = 2,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='tensorboard',
    metric_for_best_model='f1'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    compute_metrics=compute_metrics,
    callbacks = [early_stop]
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.wei

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.7834,1.769553,0.218633,0.175065,0.218491,0.210976
2,1.6418,1.621033,0.307538,0.285317,0.293044,0.309535
3,1.5409,1.520524,0.3431,0.312483,0.324523,0.347239
4,1.4704,1.4496,0.366642,0.338108,0.34507,0.369583
5,1.4183,1.404547,0.37716,0.353609,0.355711,0.379416
6,1.3884,1.377936,0.388931,0.36517,0.367038,0.390712
7,1.3763,1.357063,0.396945,0.375644,0.376343,0.398923
8,1.3445,1.344352,0.402204,0.381053,0.380665,0.403761
9,1.3481,1.337501,0.406962,0.385595,0.385745,0.407931
10,1.3414,1.335511,0.40546,0.38162,0.3831,0.406526


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3993
  Batch size = 64
Saving model checkpoint to ./training_output2/checkpoint-562
Configuration saved in ./training_output2/checkpoint-562/config.json
Model weights saved in ./training_output2/checkpoint-562/pytorch_model.bin
Deleting older checkpoint [training_output2/checkpoint-5058] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Eval

TrainOutput(global_step=5620, training_loss=1.4781039906566253, metrics={'train_runtime': 429.5058, 'train_samples_per_second': 836.357, 'train_steps_per_second': 13.085, 'total_flos': 7435677091334400.0, 'train_loss': 1.4781039906566253, 'epoch': 10.0})

In [9]:
dataset_test_en = Dataset.from_pandas(dataset_test_en)
dataset_test_es = Dataset.from_pandas(dataset_test_es)


dataset_test_en = dataset_test_en.map(encode_batch, batched=True)
dataset_test_en = dataset_test_en.rename_column("label", "labels")
dataset_test_en.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])


dataset_test_es = dataset_test_es.map(encode_batch, batched=True)
dataset_test_es = dataset_test_es.rename_column("label", "labels")
dataset_test_es.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])


t_metrics_en = trainer.evaluate(dataset_test_en)
t_metrics_es = trainer.evaluate(dataset_test_es)


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2242
  Batch size = 64


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2194
  Batch size = 64


In [10]:
pd.DataFrame([t_metrics_en])

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,1.314457,0.427297,0.416437,0.416701,0.430305,1.8838,1190.16,19.111,10.0


In [11]:
pd.DataFrame([t_metrics_es])

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,1.364694,0.388332,0.355941,0.362879,0.389152,1.9253,1139.549,18.179,10.0
