In [None]:
# -y pre-answers apt's confirmation prompt: a notebook cell has no stdin to answer it interactively.
!apt-get install -y git-lfs

In [None]:
# Use the explicit %pip magic (instead of relying on automagic for a bare `pip`)
# so the packages are installed into the environment of the running kernel.
%pip install datasets transformers torch numpy sentencepiece

In [None]:
import os

# Set the TOKENIZERS_PARALLELISM flag to "false" for this process before any
# tokenizer is created.
# NOTE(review): presumably silences the Hugging Face tokenizers fork/parallelism
# warning during dataset mapping and training — confirm.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [None]:
import datasets
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments

# Checkpoint that masked-language-model training continues from.
model_checkpoint = "anon/deberta-v3-large-dapt-scientific-papers-pubmed"

# Tokenizer and masked-LM head are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
import numpy as np
import torch

# Quick sanity check of the loaded checkpoint: fill in a single [MASK] and
# print the five most likely completions.
text = "This is a great [MASK]."
inputs = tokenizer(text, return_tensors="pt")

# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    token_logits = model(**inputs).logits

# Find the location of [MASK] in the (single) input sequence. This is done with
# torch directly rather than by passing a tensor through np.argwhere, whose
# row/column layout for tensors differs from the layout for NumPy arrays.
mask_positions = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]
if mask_positions.numel() == 0:
    raise ValueError(f"No {tokenizer.mask_token} token found in: {text!r}")
mask_token_index = mask_positions[0].item()
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits (best first).
top_5_tokens = torch.topk(mask_token_logits, k=5).indices.tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")


In [None]:
# Load the public training CSV as a single 'train' split, then drop the 'id'
# and 'fake' columns, which are not used by the language-modelling steps below.
train_dataset = datasets.load_dataset(
    "csv",
    data_files={
        "train": "../input/detecting-generated-scientific-papers/fake_papers_train_part_public.csv"
    },
)
train_dataset = train_dataset.remove_columns(["id", "fake"])

In [None]:
# Load the public test CSV. It is registered under the split name 'train'
# (that is the key used to read it back later), and its 'id' column is dropped.
test_dataset = datasets.load_dataset(
    "csv",
    data_files={
        "train": "../input/detecting-generated-scientific-papers/fake_papers_test_public.csv"
    },
)
test_dataset = test_dataset.remove_columns(["id"])

In [None]:
# Show the loaded test data as the cell output (quick sanity check of what was read).
test_dataset

In [None]:
# Pool the test texts and the training texts (test first) into one corpus;
# both were loaded under the split name 'train'.
dataset = datasets.concatenate_datasets(
    [loaded["train"] for loaded in (test_dataset, train_dataset)]
)

In [None]:

def tokenize_function(examples):
    """Tokenize a batch of rows.

    Args:
        examples: mapping with a "text" entry holding the batch's texts.

    Returns:
        The tokenizer's output for the batch. When the tokenizer is a fast
        one, a "word_ids" entry is added holding, for each row, the result
        of ``word_ids(row_index)``.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # Only the fast tokenizers' output is asked for word ids.
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded


# Tokenize the whole corpus. batched=True passes tokenize_function a batch of
# rows per call (it indexes examples["text"] as a list of texts), and the raw
# "text" column is removed from the result.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
# Display the tokenized dataset as the cell output.
tokenized_datasets

In [None]:
# Number of tokens per training example after re-chunking.
chunk_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-row lists
            (must include "input_ids").

    Returns:
        A dict with the same columns, where every column is a list of
        ``chunk_size``-long slices of the concatenated batch, plus a
        "labels" column that is a copy of the "input_ids" chunk list.
        The trailing remainder shorter than ``chunk_size`` is dropped, so a
        batch with fewer than ``chunk_size`` tokens yields empty lists.
    """
    from itertools import chain  # local import keeps this cell self-contained

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(list_of_lists, []) re-copies the accumulated list at each step.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts (taken from the first column)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column (a separate list object from input_ids)
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk the tokenized corpus batch by batch with group_texts, which also adds the "labels" column.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
# Display the chunked dataset as the cell output.
lm_datasets

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for masked-language modelling, built around the same tokenizer.
# NOTE(review): mlm_probability=0.15 is presumably the fraction of tokens selected
# for masking in each batch — confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Size the split: take all chunks but one as the pool, then reserve roughly
# 10% of that pool (minus one) for evaluation.
train_size = len(lm_datasets) - 1
test_size = int(0.1 * train_size) - 1
print(train_size, test_size)

# The training part is the pool minus the evaluation part; seed fixed for a repeatable split.
split_sizes = {"train_size": train_size - test_size, "test_size": test_size}
downsampled_dataset = lm_datasets.train_test_split(seed=42, **split_sizes)
downsampled_dataset

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids.

    Args:
        logits: a tensor of logits, or a tuple whose first element is that tensor.
        labels: unused; accepted to match the callback signature.

    Returns:
        The argmax of the logits over the last dimension.
    """
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_preds):
    """Compute token-level accuracy over the positions that carry a real label.

    Accuracy is computed directly on the arrays instead of through
    ``datasets.load_metric``, which is deprecated and absent from newer
    ``datasets`` releases.

    Args:
        eval_preds: a ``(preds, labels)`` pair of arrays with the same shape;
            preds are already token ids (argmax applied by
            preprocess_logits_for_metrics).

    Returns:
        ``{"accuracy": float}``; the value is NaN when no position has a label
        other than -100.
    """
    preds, labels = eval_preds
    labels = labels.reshape(-1)
    preds = preds.reshape(-1)
    # Positions labelled -100 are excluded from scoring
    # (presumably the tokens the collator left unmasked).
    mask = labels != -100
    labels = labels[mask]
    preds = preds[mask]
    if len(labels) == 0:
        # Nothing to score: accuracy is undefined rather than 0 or 1.
        return {"accuracy": float("nan")}
    return {"accuracy": float((preds == labels).mean())}

In [None]:
# Training configuration; the first positional argument is the output directory / repo name.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the install at the top is unpinned — confirm against the installed version.
# NOTE(review): push_to_hub=True needs Hub credentials, and no login step is
# visible in this notebook — confirm they are configured in the environment.
training_args = TrainingArguments(
    "deberta-v3-large-dapt-scientific-papers-pubmed-tapt",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="no",  # no intermediate checkpoints are written
    push_to_hub=True,
    fp16=True # switch off if not using GPU
)


# Wire the model, data splits, MLM collator and accuracy callbacks into a Trainer.
# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset['train'],
    eval_dataset=downsampled_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Reduces logits to token ids before they are gathered for compute_metrics
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

In [None]:
# Run training with the arguments configured above (5 epochs, evaluation each epoch).
trainer.train()

In [None]:
# Upload the trained model with the given commit message and the "fill-mask" tag.
# NOTE(review): requires Hub credentials; none are set up in the visible cells — confirm.
trainer.push_to_hub(commit_message="Training complete", tags="fill-mask")