In [None]:
# !pip install adapter-transformers
# !pip install datasets
# !pip install evaluate
# !pip install sentencepiece

1. mbert-en
2. mbert-es

In [None]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os, sys
# Explicitly leave the Hugging Face tokenizers' internal parallelism enabled.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
# Only the GPU with index 1 is made visible to CUDA.
# NOTE(review): machine-specific choice — confirm before running on another host.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Makes CUDA kernel launches synchronous, which helps locate failing kernels but slows execution.
# NOTE(review): looks like a leftover debugging aid — confirm it is still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"



import json
from transformers import AutoTokenizer, AdapterConfig, AutoAdapterModel, AutoConfig
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction, TrainerCallback
from transformers import AutoModelForSequenceClassification, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict

from tqdm import tqdm

import numpy as np
from datasets import concatenate_datasets, load_metric
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import random
import torch

def set_seed(seed):
    """Apply one seed to every random number source used in this notebook.

    The seed is passed, in order, to ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``.

    Args:
        seed: Integer seed forwarded unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

set_seed(42)

In [None]:
def _read_tsv_without_first_column(path):
    """Read a tab-separated file and return it with its first column removed."""
    frame = pd.read_csv(path, sep='\t')
    return frame.drop(frame.columns[0], axis=1)


# English and Spanish training data for subtask 1.
df_en = _read_tsv_without_first_column("../data/subtask_1/en/train.tsv")
df_es = _read_tsv_without_first_column("../data/subtask_1/es/train.tsv")

In [None]:
# Integer class ids used for training: "generated" -> 0, "human" -> 1.
mapping = {
    "generated":0,
    "human":1
}


def _encode_labels(labels):
    """Convert a column of class names into the integer ids from ``mapping``.

    ``Series.map`` turns every value that is not a key of the dict into NaN.
    That silently corrupts the labels when the data contains an unexpected
    class name, and also when this cell is executed a second time (the column
    then already holds 0/1, which are not keys of ``mapping``). Both cases are
    handled explicitly here.

    Args:
        labels: pandas Series holding either the class names or the
            already-encoded integer ids.

    Returns:
        A Series of integer ids. The input is returned unchanged when every
        value is already one of the ids.

    Raises:
        ValueError: if a value is neither a known class name nor a known id.
    """
    # Already encoded (e.g. the cell was re-run): nothing to do.
    if labels.isin(list(mapping.values())).all():
        return labels

    unknown = set(labels.unique()) - set(mapping)
    if unknown:
        raise ValueError(
            f"Unexpected label value(s): {sorted(unknown, key=str)}; "
            f"expected one of {sorted(mapping)}"
        )
    return labels.map(mapping)


df_en["label"] = _encode_labels(df_en["label"])
df_es["label"] = _encode_labels(df_es["label"])

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, concatenate_datasets

def _train_valid_test_split(frame):
    """Split a frame into train, validation and test parts.

    10% of the rows are held out as the test set; 10% of the remaining rows
    are then held out as the validation set. Both splits use
    ``random_state=42``.

    Returns:
        Tuple ``(train, valid, test)``.
    """
    train_and_valid, test = train_test_split(frame, test_size=0.1, random_state=42)
    train, valid = train_test_split(train_and_valid, test_size=0.1, random_state=42)
    return train, valid, test


dataset_train_en, dataset_valid_en, dataset_test_en = _train_valid_test_split(df_en)
dataset_train_es, dataset_valid_es, dataset_test_es = _train_valid_test_split(df_es)




In [None]:
# Show the English training split as this cell's output.
dataset_train_en

In [None]:
# Total number of training examples in each few-shot setting.
shots = [200, 400, 600, 800, 1000]

# Maps str(shot) -> bilingual training frame with `shot` rows in total.
list_train_datasets = {}

# A plain loop is used on purpose: `dataset_train_compl` and `shot` remain
# bound at module level after it finishes, and the training cell further
# down reads `dataset_train_compl`.
for shot in shots:
    rows_per_language = shot // 2
    # First half of the budget from English, second half from Spanish.
    dataset_train_compl = pd.concat([
        dataset_train_en.head(rows_per_language),
        dataset_train_es.head(rows_per_language),
    ])
    list_train_datasets[str(shot)] = dataset_train_compl

# Validation and test frames combine both languages in full.
dataset_valid_compl = pd.concat([dataset_valid_en, dataset_valid_es])
dataset_test_compl = pd.concat([dataset_test_en, dataset_test_es])

In [8]:
# language_model = "xlm-roberta-base"
# language_model = "bert-base-multilingual-cased"
# language_model = "microsoft/deberta-v3-base"
# language_model = "prajjwal1/bert-tiny"
language_model = "distilbert-base-cased"
# language_model = "roberta-base-openai-detector"
# language_model = "Hello-SimpleAI/chatgpt-detector-roberta"


tokenizer = AutoTokenizer.from_pretrained(language_model)


def encode_batch(batch):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Each sequence is truncated or padded to exactly 80 tokens
    (``max_length=80`` with ``padding="max_length"``).
    """
    return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")


def _encode_test_frame(frame):
    """Turn a pandas test split into a tokenized, torch-formatted ``Dataset``.

    The ``label`` column is renamed to ``labels`` and only ``input_ids``,
    ``attention_mask`` and ``labels`` are exposed as torch tensors.
    """
    encoded = Dataset.from_pandas(frame).map(encode_batch, batched=True)
    encoded = encoded.rename_column("label", "labels")
    encoded.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
    return encoded


# NOTE(review): these names are rebound from pandas frames to Dataset objects,
# so the pandas versions are no longer available after this point.
dataset_test_en = _encode_test_frame(dataset_test_en)
dataset_test_es = _encode_test_frame(dataset_test_es)



def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    Args:
        pred: Evaluation output supplied by the Trainer. Its ``label_ids`` are
            used as the gold labels and its ``predictions`` are arg-maxed over
            the last axis to obtain the predicted class.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


def _to_encoded_dataset(frame):
    """Convert a pandas split into a tokenized, torch-formatted ``Dataset``.

    The ``label`` column is renamed to ``labels`` before tokenization, and only
    ``input_ids``, ``attention_mask`` and ``labels`` are exposed as tensors.
    """
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.map(encode_batch, batched=True)
    dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
    return dataset


# Test metrics per few-shot setting, keyed by str(shot).
t_metrics_en = {}
t_metrics_es = {}

# The validation set is the same for every shot count, so it is encoded once.
dataset_valid = _to_encoded_dataset(dataset_valid_compl)

for shot in shots:

    dataset_train = _to_encoded_dataset(list_train_datasets[str(shot)])

    # The number of classes comes from the label mapping rather than from
    # whichever training subset happened to be built last in an earlier cell.
    model = AutoModelForSequenceClassification.from_pretrained(language_model, num_labels=len(mapping), ignore_mismatched_sizes=True)

    # data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    # A new callback instance is created for every run.
    early_stop = EarlyStoppingCallback(3)

    training_args = TrainingArguments(
        learning_rate=1e-5,
        num_train_epochs=10,
        seed = 42,
        output_dir="./training_output_multilingual1",
        # label_names=["generated", "human"]
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        dataloader_num_workers=32,
        logging_steps=100,
        save_total_limit = 2,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to='tensorboard',
        metric_for_best_model='f1'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_valid,
        compute_metrics=compute_metrics,
        callbacks = [early_stop]
    )

    trainer.train()

    # Evaluate the trained model on each language's held-out test set.
    t_metrics_en[str(shot)] = trainer.evaluate(dataset_test_en)
    t_metrics_es[str(shot)] = trainer.evaluate(dataset_test_es)

    print(t_metrics_en)
    print(t_metrics_es)

# Persist the combined results. The previous statement referenced `t_metrics`,
# a name that is never defined, so it raised NameError once training finished.
# Rows are indexed by (language, shots); columns are the metrics.
os.makedirs("./result", exist_ok=True)
pd.concat(
    {
        "en": pd.DataFrame(t_metrics_en).transpose(),
        "es": pd.DataFrame(t_metrics_es).transpose(),
    },
    names=["language", "shots"],
).to_csv("./result/fewshot_subtask1.csv")

KeyboardInterrupt: 

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("./result", exist_ok=True)

# One row per shot count, one column per metric, one file per language.
pd.DataFrame(t_metrics_en).transpose().to_csv("./result/fewshot_subtask1_en.csv")
pd.DataFrame(t_metrics_es).transpose().to_csv("./result/fewshot_subtask1_es.csv")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Hard-coded evaluation metrics, keyed by the number of training shots.
data = {
    '200': {'eval_loss': 0.6489814519882202, 'eval_accuracy': 0.5893648449039882, 'eval_f1': 0.561100837808936, 'eval_precision': 0.6428100812447772, 'eval_recall': 0.6002622604395458, 'eval_runtime': 3.0513, 'eval_samples_per_second': 1109.354, 'eval_steps_per_second': 17.37, 'epoch': 10.0},
    '400': {'eval_loss': 0.6302704215049744, 'eval_accuracy': 0.6307237813884786, 'eval_f1': 0.6292558146707539, 'eval_precision': 0.6377239599258695, 'eval_recall': 0.6341751820019621, 'eval_runtime': 3.0594, 'eval_samples_per_second': 1106.424, 'eval_steps_per_second': 17.324, 'epoch': 10.0},
    '600': {'eval_loss': 0.5848458409309387, 'eval_accuracy': 0.7028064992614476, 'eval_f1': 0.7015572996382906, 'eval_precision': 0.7121572909682412, 'eval_recall': 0.7064391039021248, 'eval_runtime': 3.2098, 'eval_samples_per_second': 1054.594, 'eval_steps_per_second': 16.512, 'epoch': 10.0},
    '800': {'eval_loss': 0.5408483147621155, 'eval_accuracy': 0.7290989660265879, 'eval_f1': 0.7290300069600915, 'eval_precision': 0.731671985897722, 'eval_recall': 0.7308540439014396, 'eval_runtime': 3.1087, 'eval_samples_per_second': 1088.872, 'eval_steps_per_second': 17.049, 'epoch': 10.0},
    '1000': {'eval_loss': 0.5084332227706909, 'eval_accuracy': 0.7521418020679468, 'eval_f1': 0.7520101263161185, 'eval_precision': 0.75568278523995, 'eval_recall': 0.7542115159639373, 'eval_runtime': 3.1433, 'eval_samples_per_second': 1076.91, 'eval_steps_per_second': 16.862, 'epoch': 10.0}
}

# Keep only the 'eval_f1' value of each entry.
f1_scores = {shot_key: run_metrics['eval_f1'] for shot_key, run_metrics in data.items()}

# Two-column frame; the shot counts are converted from strings to numbers
# so that the x axis is numeric.
df = pd.DataFrame({
    'Shots': list(f1_scores.keys()),
    'F1 Score': list(f1_scores.values()),
})
df['Shots'] = pd.to_numeric(df['Shots'])

# Line chart of F1 against the number of shots, labelled via the returned Axes.
ax = df.plot(x='Shots', y='F1 Score', kind='line', marker='o')
ax.set_title('F1 Score vs. Shots')
ax.set_xlabel('Shots')
ax.set_ylabel('F1 Score')
# plt.grid(True)
plt.show()


## Modelling

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(language_model, num_labels=len(dataset_train_compl.label.unique()), ignore_mismatched_sizes=True)
  
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

# # data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# early_stop = EarlyStoppingCallback(3)

# training_args = TrainingArguments(
#     learning_rate=1e-6,
#     num_train_epochs=10,
#     seed = 42,
#     output_dir="./training_output_multilingual1",
#     # label_names=["generated", "human"]
#     per_device_train_batch_size=64,
#     per_device_eval_batch_size=64,
#     dataloader_num_workers=32,
#     logging_steps=100,
#     save_total_limit = 2,
#     overwrite_output_dir=True,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     report_to='tensorboard',
#     metric_for_best_model='f1'
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset_train,
#     eval_dataset=dataset_valid,
#     compute_metrics=compute_metrics,
#     callbacks = [early_stop]
# )

# trainer.train()

In [None]:
# dataset_test_en = Dataset.from_pandas(dataset_test_en)
# dataset_test_es = Dataset.from_pandas(dataset_test_es)


# dataset_test_en = dataset_test_en.map(encode_batch, batched=True)
# dataset_test_en = dataset_test_en.rename_column("label", "labels")
# dataset_test_en.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])


# dataset_test_es = dataset_test_es.map(encode_batch, batched=True)
# dataset_test_es = dataset_test_es.rename_column("label", "labels")
# dataset_test_es.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])


# t_metrics_en = trainer.evaluate(dataset_test_en)
# t_metrics_es = trainer.evaluate(dataset_test_es)


In [None]:
# pd.DataFrame([t_metrics_en])

In [None]:
# pd.DataFrame([t_metrics_es])