In [None]:
import datasets
import pandas as pd
from datasets import Dataset
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, concatenate_datasets
import torch
import os
import wandb
# Open a W&B run in the "kaggle-ai-detection" project as soon as this cell executes.
wandb.init(project="kaggle-ai-detection")

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def _compute_accuracy(eval_pred):
    """Compute classification accuracy for one Trainer evaluation pass.

    Args:
        eval_pred: the object the Trainer hands to ``compute_metrics``; its
            ``predictions`` attribute holds the logits and ``label_ids`` the
            reference labels.

    Returns:
        ``{"accuracy": <float>}``. The Trainer reports it as ``eval_accuracy``.
    """
    predicted_classes = eval_pred.predictions.argmax(axis=-1)
    accuracy = (predicted_classes == eval_pred.label_ids).mean()
    return {"accuracy": float(accuracy)}


def _eval_strategy_kwarg():
    """Return the evaluation-schedule keyword accepted by the installed TrainingArguments.

    Newer transformers releases call it ``eval_strategy``; older ones call it
    ``evaluation_strategy``.
    """
    import dataclasses

    field_names = {field.name for field in dataclasses.fields(TrainingArguments)}
    return "eval_strategy" if "eval_strategy" in field_names else "evaluation_strategy"


def train_roberta():
    """Fine-tune ``roberta-base`` on the DAIGT v2 CSV for one W&B run / sweep trial.

    Hyperparameters are read from ``wandb.config``: ``learning_rate`` and
    ``batch_size`` are required; ``epochs`` (or the legacy key ``epoch``) and
    ``weight_decay`` fall back to the Trainer defaults (3 and 0.0) when absent.

    Side effects:
        * starts and finishes its own W&B run, and logs the evaluation metrics
          plus a top-level ``accuracy`` value that a sweep can optimise;
        * writes checkpoints to ``./results`` and the final model and tokenizer
          to ``/kaggle/working/finetuned_roberta_daigt/``.
    """
    # A sweep agent only injects hyperparameters into a run that is created
    # inside the trial function. Close any run left open by an earlier cell
    # first, so wandb.config belongs to this trial and not to that stale run.
    if wandb.run is not None:
        wandb.finish()
    wandb.init(project="kaggle-ai-detection")

    try:
        config = wandb.config
        num_epochs = config.get("epochs", config.get("epoch", 3))
        weight_decay = config.get("weight_decay", 0.0)

        roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base").to(device)
        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

        # training dataset
        train_df = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")
        train_dataset = Dataset.from_pandas(train_df)
        # Fixed seed: every trial is scored on the same validation split, so
        # sweep results are comparable with each other.
        train_val_dataset = train_dataset.train_test_split(test_size=0.2, seed=42)

        tokenized_dataset = train_val_dataset.map(
            lambda x: tokenizer(x["text"], padding="max_length", truncation=True),
            batched=True,
        )
        # NOTE(review): the Trainer needs a target column it recognises
        # (`label` / `labels`). Presumably the CSV already provides `label`;
        # if the target is called `generated`, rename it here — confirm.
        tokenized_dataset = tokenized_dataset.remove_columns(
            ["prompt_name", "source", "text", "RDizzl3_seven"]
        )

        training_args = TrainingArguments(
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            # load_best_model_at_end requires evaluation to run on the same
            # schedule as checkpoint saving.
            save_strategy="epoch",
            load_best_model_at_end=True,
            output_dir='./results',
            logging_dir='./logs',
            logging_steps=100,
            report_to="wandb",
            **{_eval_strategy_kwarg(): "epoch"},
        )

        trainer = Trainer(
            model=roberta_model,                       # model to fine-tune
            tokenizer=tokenizer,                       # saved with checkpoints, used for batch collation
            args=training_args,                        # training arguments, defined above
            train_dataset=tokenized_dataset["train"],  # training split
            eval_dataset=tokenized_dataset["test"],    # held-out evaluation split
            compute_metrics=_compute_accuracy,         # adds eval_accuracy to the metrics
        )

        trainer.train()

        metrics = trainer.evaluate()
        # Trainer prefixes metric names with "eval_"; also publish the plain
        # "accuracy" key so a sweep whose metric is named 'accuracy' can see it.
        wandb.log({**metrics, "accuracy": metrics["eval_accuracy"]})

        # The directory is fixed, so it always holds the most recent trial's model.
        model_dir = "/kaggle/working/finetuned_roberta_daigt/"
        os.makedirs(model_dir, exist_ok=True)
        trainer.save_model(model_dir)
        tokenizer.save_pretrained(model_dir)
    finally:
        # Close the wandb run even when the trial fails, so the next trial
        # starts from a clean state.
        wandb.finish()

In [None]:
# Search space and objective for the W&B hyperparameter sweep.
sweep_config = {
    'method': 'bayes',  # can be grid, random, bayes
    'metric': {
        'name': 'accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'learning_rate': {
            'min': 1e-5,
            'max': 5e-4
        },
        'batch_size': {
            'values': [4, 8, 16]
        },
        # The key must be "epochs": train_roberta reads wandb.config.epochs.
        'epochs': {
            'values': [1, 2, 3]
        },
        # train_roberta also reads wandb.config.weight_decay, so it has to be
        # defined. It is held constant here to keep the search space unchanged;
        # replace 'value' with 'values' or 'min'/'max' to search over it.
        'weight_decay': {
            'value': 0.0
        }
    }
}

# Register the sweep with W&B; the returned id is what agents attach to.
sweep_id = wandb.sweep(sweep_config, project="kaggle-ai-detection")

In [None]:
# Start a sweep agent for sweep_id, using train_roberta as the trial function.
# NOTE(review): no `count` is passed, so nothing here bounds the number of
# trials — presumably the agent keeps going until the sweep or the kernel is
# stopped; confirm and consider passing count=N.
wandb.agent(sweep_id, train_roberta)