In [2]:
!apt-get install git-lfs

In [3]:
import os
# Export TOKENIZERS_PARALLELISM="false" into this process's environment
# before any tokenizer is created.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [4]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset, DatasetDict, concatenate_datasets
import torch

Constants

In [5]:
# Competition data directory, available both as a plain string and as a Path.
PATH = '../input/detecting-generated-scientific-papers'
PATH_TO_DATA = Path('../input/detecting-generated-scientific-papers')

# Column names used when reading the competition CSV.
INDEX_COL_NAME, INPUT_COL_NAME, TARGET_COL_NAME = 'id', 'text', 'fake'

# Device index: 0 when CUDA is available, -1 otherwise.
DEVICE = -1 if not torch.cuda.is_available() else 0

Reading & briefly exploring the available data

In [6]:
# Read the public training CSV from the competition directory, using the
# `id` column as the DataFrame index.
train_df = pd.read_csv(
    PATH_TO_DATA.joinpath("fake_papers_train_part_public.csv"),
    index_col=INDEX_COL_NAME,
)

In [7]:
# og_dataset = load_dataset(
#     'csv', 
#     data_files={
#         'train': PATH + '/' + "fake_papers_train_part_public.csv",
#         "test": PATH + '/' + "fake_papers_train_part_public.csv",
#         "validation": PATH + '/' + "fake_papers_train_part_public.csv",
#     },
# )

In [8]:
# Load the augmented real/fake passages CSVs, one file per split.
new_dataset = load_dataset(
    path='csv',
    data_files=dict(
        train='../input/detectinggeneratedscientificaugmented/real_and_fake_passages_dataset_train.csv',
        validation='../input/detectinggeneratedscientificaugmented/real_and_fake_passages_dataset_validation.csv',
        test='../input/detectinggeneratedscientificaugmented/real_and_fake_passages_dataset_test.csv',
    ),
)

In [9]:
# og_dataset = og_dataset.remove_columns('id')
# def preprocess_tool(examples):
#     examples['tool'] = ['unknown' if fake == 1 else 'real' for fake in examples['fake']]
#     return examples
# og_dataset = og_dataset.map(preprocess_tool, batched=True)

In [10]:
# new_dataset = new_dataset.remove_columns(['Unnamed: 0', 'type', 'model'])
# The passage column is called `passages` in the CSVs; the tokenization step
# below reads it under the name `text`.
new_dataset = new_dataset.rename_column(
    original_column_name='passages',
    new_column_name='text',
)

In [11]:
# Echo the loaded dataset so its repr is shown as the cell output.
new_dataset

In [12]:
# new_dataset['train'] = concatenate_datasets([new_dataset['train'], og_dataset['train']])
# new_dataset['test'] = concatenate_datasets([new_dataset['test'], og_dataset['test']])
# new_dataset['validation'] = concatenate_datasets([new_dataset['validation'], og_dataset['validation']])

## Defining the model

In [13]:
# Tokenizer matching the DeBERTa-v3-large checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="microsoft/deberta-v3-large",
)

In [14]:
# Maximum number of tokens kept per passage; longer inputs are truncated.
MAX_SEQ_LENGTH = 256

# Maps each value of the dataset's `tool` column to its integer class id.
tool_labels = {
    "generate": 0,
    "paraphrase": 1,
    "real": 2,
    "translate": 3 
}


def preprocess_function(examples):
    """Tokenize a batch of passages.

    Args:
        examples: A batch (mapping of column name -> list of values) that
            contains a ``"text"`` column.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated to
        ``MAX_SEQ_LENGTH`` tokens.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


def preprocess_label(examples):
    """Tokenize a batch of passages and attach the integer class label.

    Args:
        examples: A batch (mapping of column name -> list of values) that
            contains the ``"text"`` and ``"tool"`` columns.

    Returns:
        The tokenizer output for the batch with an extra ``"label"`` entry
        holding ``tool_labels[tool]`` for every row.

    Raises:
        KeyError: If a row's ``tool`` value is not a key of ``tool_labels``.
    """
    try:
        labels = [tool_labels[tool] for tool in examples['tool']]
    except KeyError as err:
        # Same exception type as a bare lookup, but names the offending value.
        raise KeyError(
            f"Unknown tool {err.args[0]!r}; expected one of {sorted(tool_labels)}"
        ) from err

    # Reuse the shared tokenization helper instead of duplicating the call.
    encoded = preprocess_function(examples)
    # Return the labels explicitly rather than relying on in-place mutation
    # of the input batch being carried over into the mapped dataset.
    encoded['label'] = labels
    return encoded

# tokenized_dataset = new_dataset.filter(lambda example: example['tool'] == 'unknown' or example['tool'] == 'real').map(preprocess_function, batched=True).rename_column("fake", "label")
# Drop every row whose `model` is 'GPT-2-arxiv_generate', then tokenize the
# remaining rows and attach their class labels batch by batch.
tokenized_dataset = (
    new_dataset
    .filter(lambda row: row['model'] != 'GPT-2-arxiv_generate')
    .map(preprocess_label, batched=True)
)


In [15]:
# Echo the filtered, tokenized dataset so its repr is shown as the cell output.
tokenized_dataset

In [16]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer that produced the encoded inputs.
data_collator = DataCollatorWithPadding(tokenizer)

In [17]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from `tool_labels` so the number of classes
# cannot drift from the label mapping, and record the class names in the model
# config so the saved/pushed checkpoint carries tool names for its labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-large",
    num_labels=len(tool_labels),
    id2label={class_id: tool for tool, class_id in tool_labels.items()},
    label2id=tool_labels,
)

In [18]:
from datasets import load_metric

# Metric objects are cached after their first load so that repeated
# evaluations do not call `load_metric` again for the same metric.
_METRICS = {}


def _get_metric(name):
    """Return the metric called `name`, loading it on first use only."""
    if name not in _METRICS:
        _METRICS[name] = load_metric(name)
    return _METRICS[name]


def compute_metrics(eval_preds):
    """Compute micro-averaged F1, precision and recall for an evaluation run.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        A dict merging the results of the ``f1``, ``precision`` and ``recall``
        metrics (in that order), each computed with ``average='micro'``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    for metric_name in ("f1", "precision", "recall"):
        scores.update(
            _get_metric(metric_name).compute(
                predictions=predictions, references=labels, average='micro'
            )
        )
    return scores

In [19]:
# Hyper-parameters for a single fine-tuning epoch. Evaluation and checkpointing
# both happen once per epoch, only one checkpoint is kept, and the result is
# pushed to the Hub under the output directory's name.
training_args = TrainingArguments(
    "deberta-v3-large-finetuned-syndag-multiclass-not-gpt2-arxiv",
    learning_rate=6e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    warmup_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    push_to_hub=True,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook also runs on CPU without a manual edit.
    fp16=torch.cuda.is_available(),
)

# Wire the model, its hyper-parameters, the train/validation splits and the
# metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
)

In [20]:
# Pre-training check: evaluate on the first 32 validation rows only.
trainer.evaluate(
    eval_dataset=tokenized_dataset["validation"].select(range(32)),
)

In [21]:
# Run fine-tuning with the settings in `training_args`; the call's return value
# is shown as the cell output.
trainer.train()

In [22]:
# Final evaluation of the fine-tuned model on the held-out test split.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

In [23]:
# Upload the trained model to the Hub with a final commit message and task tag.
trainer.push_to_hub(
    tags="text-classification",
    commit_message="Training complete",
)

In [24]:
# List the working directory once after training/upload; the three further
# identical `ls` cells that followed were redundant and have been removed.
ls