In [1]:
# !pip install adapter-transformers
# !pip install datasets
# !pip install evaluate
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os, sys
# Process-level switches, assigned ahead of the `import torch` further down in this cell.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
# NOTE(review): machine-specific GPU index — confirm before running on another host.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
# NOTE(review): looks like a debugging setting left switched on — confirm it is
# still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"



import json
from transformers import AutoTokenizer, AdapterConfig, AutoAdapterModel, AutoConfig
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction, TrainerCallback
from transformers import AutoModelForSequenceClassification, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict

from tqdm import tqdm

import numpy as np
from datasets import concatenate_datasets, load_metric
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import random
import torch

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer handed unchanged to Python's `random`, NumPy's global
            generator, and PyTorch's CPU and CUDA seeding functions.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the tab-separated training file, then discard its first column
# (presumably a saved row index — TODO confirm against the raw file).
df = pd.read_csv("../data/subtask_2/es/train.tsv", sep="\t")
df = df.drop(columns=df.columns[0])
df

Unnamed: 0,text,label
0,"Sin embargo, los jóvenes son capaces de recono...",B
1,¿Hay algo más que quieras compartir? ¿Algo sob...,B
2,"El servicio de sala es bueno, rápido y amabilí...",B
3,"Para concentrarse en el hablante, trata de des...",F
4,Los responsables locales tendrán ahora que esp...,F
...,...,...
21930,Escribe un correo electrónico a PI:EMAIL y cué...,A
21931,"Los osos negros y grizzlies de Yellowstone, ta...",C
21932,"El lenguaje es un recurso para comunicarse, pe...",A
21933,"Dr, creo que las ostras son animales No, las o...",F


In [4]:
# Integer class id assigned to each label letter in the `label` column.
mapping = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5
}

# `Series.map` turns every value that is missing from `mapping` into NaN.
# Fail loudly here instead of letting NaN labels reach training. This also
# catches an accidental second run of this cell, when the column already
# holds integers and every row would otherwise silently become NaN.
encoded_labels = df["label"].map(mapping)
unmapped = df.loc[encoded_labels.isna(), "label"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Labels without an entry in `mapping`: {list(unmapped)}")
df["label"] = encoded_labels.astype("int64")

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified by label — confirm that is intended.
dataset_train, dataset_test = train_test_split(df, test_size=0.1, random_state=42)

# Row count of each split, one per line.
print(len(dataset_train), len(dataset_test), sep="\n")

19741
2194


In [6]:
# Peek at the first rows of the training split (labels are already integer-encoded).
dataset_train.head()

Unnamed: 0,text,label
20976,Volveremos sin duda. Una experiencia genial! L...,1
14568,"Si es posible, retira un poco de agua antes de...",2
4057,"Hay banjos tradicionales, como el banjo de cue...",4
19020,@ElDatoDelDia: Brote de ébola en Guinea es el ...,2
3383,El derecho definitivo aplicable será el aplica...,2


In [7]:
# Peek at the first rows of the held-out split.
dataset_test.head()

Unnamed: 0,text,label
6423,Los jóvenes tienen más probabilidades de encon...,2
16798,En un plazo no superior a seis meses después d...,2
4205,Además no nos dieron la habitación que habíamo...,0
5833,"Es una gran opción en el pueblo, pero el perso...",3
2950,Pero no ocurre lo mismo con los perros: si se ...,3


In [8]:
# Checkpoint to fine-tune. The commented-out names are alternatives kept for reference.
# language_model = "xlm-roberta-base"
# language_model = "bert-base-multilingual-cased"
# language_model = "microsoft/deberta-v3-base"
# language_model = "prajjwal1/bert-tiny"
# language_model = "distilbert-base-cased"
# language_model = "roberta-base-openai-detector"
language_model = "Hello-SimpleAI/chatgpt-detector-roberta"

# `num_labels` sizes the classification head of the model; it means nothing to
# a tokenizer, so it is passed only where the model itself is loaded.
tokenizer = AutoTokenizer.from_pretrained(language_model)

# Wrap the pandas splits as Hugging Face `Dataset` objects.
dataset_train = Dataset.from_pandas(dataset_train)
dataset_test = Dataset.from_pandas(dataset_test)

def encode_batch(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Sequences are truncated to at most 80 tokens and padded up to that same
    length, as requested by the arguments passed to the tokenizer below.
    """
    texts = batch["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=80)


def _prepare_split(split):
    """Apply the same preprocessing to one split and return the result.

    Steps, in order: rename the target column from "label" to "labels",
    tokenize with `encode_batch`, then expose only the model inputs and the
    labels as torch tensors.
    """
    split = split.rename_column("label", "labels")
    split = split.map(encode_batch, batched=True)
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return split


# Both splits go through one code path so their preprocessing cannot drift apart.
dataset_train = _prepare_split(dataset_train)
dataset_test = _prepare_split(dataset_test)

                                                                    

In [9]:
# Sanity-check the encoded training split: show its first rows as a DataFrame.
dataset_train.to_pandas().head()

Unnamed: 0,text,labels,__index_level_0__,input_ids,attention_mask
0,Volveremos sin duda. Una experiencia genial! L...,1,20976,"[0, 846, 18224, 5593, 366, 10272, 385, 10876, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"Si es posible, retira un poco de agua antes de...",2,14568,"[0, 35684, 2714, 8593, 4748, 6, 5494, 3578, 54...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"Hay banjos tradicionales, como el banjo de cue...",4,4057,"[0, 33203, 2020, 267, 366, 26916, 636, 6073, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,@ElDatoDelDia: Brote de ébola en Guinea es el ...,2,19020,"[0, 1039, 9682, 495, 3938, 21502, 495, 493, 35...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,El derecho definitivo aplicable será el aplica...,2,3383,"[0, 9682, 25872, 11156, 32888, 405, 9697, 10, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Modelling

In [10]:
# Load the checkpoint with a classification head sized to the number of distinct labels in `df`.
# `ignore_mismatched_sizes=True` lets the load proceed although the checkpoint's classifier
# has 2 outputs (see the load warning in this cell's output); that layer is newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(language_model, num_labels=len(df.label.unique()), ignore_mismatched_sizes=True)
  
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: Object exposing `label_ids` (the true class ids) and
            `predictions` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # A class that is never predicted has an undefined precision. With
    # `zero_division=0` it is scored as 0, exactly as before, but without the
    # warning that was previously emitted on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Early stopping with a patience of 3 evaluations.
early_stop = EarlyStoppingCallback(early_stopping_patience=3)

training_args = TrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir="./training_output3",
    overwrite_output_dir=True,
    save_total_limit=2,
    # Optimisation.
    # NOTE(review): 1e-6 is a very small learning rate; in the recorded run the
    # loss is still falling at epoch 10 — confirm this value is intended.
    learning_rate=1e-6,
    num_train_epochs=10,
    seed=42,
    # Batching and data loading.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=32,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the best 'f1' at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # Logging.
    logging_steps=100,
    report_to='tensorboard',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks=[early_stop],
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at Hello-SimpleAI/chatgpt-detector-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19741
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size 

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.7639,1.746975,0.251139,0.160208,0.163153,0.244843
2,1.695,1.700711,0.249772,0.174438,0.190985,0.244304
3,1.6406,1.645853,0.267092,0.205895,0.264426,0.273212
4,1.6005,1.609192,0.2835,0.22887,0.275551,0.289975
5,1.5873,1.570315,0.299453,0.24558,0.288137,0.306174
6,1.5523,1.576238,0.291249,0.246807,0.287964,0.297341
7,1.5349,1.56831,0.294439,0.247815,0.309086,0.300326
8,1.5367,1.552694,0.299453,0.248137,0.253434,0.305975
9,1.5208,1.552532,0.302188,0.255362,0.262381,0.308817
10,1.5122,1.543097,0.302188,0.252523,0.256454,0.308902


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2194
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./training_output3/checkpoint-309
Configuration saved in ./training_output3/checkpoint-309/config.json
Model weights saved in ./training_output3/checkpoint-309/pytorch_model.bin
Deleting older checkpoint [training_output3/checkpoint-2781] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can saf

TrainOutput(global_step=3090, training_loss=1.6007488213696526, metrics={'train_runtime': 467.3323, 'train_samples_per_second': 422.419, 'train_steps_per_second': 6.612, 'total_flos': 8116034196700800.0, 'train_loss': 1.6007488213696526, 'epoch': 10.0})

In [11]:
# Final metrics of the loaded model on `dataset_test`.
# NOTE(review): `dataset_test` is the same split the Trainer received as `eval_dataset`,
# i.e. the one used for early stopping and best-checkpoint selection, so these numbers
# are not measured on unseen data.
t_metrics = trainer.evaluate(dataset_test)
t_metrics

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2194
  Batch size = 64


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.5525318384170532,
 'eval_accuracy': 0.30218778486782133,
 'eval_f1': 0.25536165770451796,
 'eval_precision': 0.26238085091953023,
 'eval_recall': 0.30881724936030847,
 'eval_runtime': 2.5685,
 'eval_samples_per_second': 854.198,
 'eval_steps_per_second': 13.627,
 'epoch': 10.0}

In [12]:
# Show the metrics dict as a one-row table.
pd.DataFrame([t_metrics])

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,1.552532,0.302188,0.255362,0.262381,0.308817,2.5685,854.198,13.627,10.0
