In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv


# Install Dependencies

In [2]:
# # You'll need the HuggingFace Transformers library for model fine-tuning, and Torch for the underlying model operations.

# !pip install transformers
# !pip install torch
# !pip install scikit-learn
# !pip install datasets


# Import Necessary Libraries

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import softmax
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)


In [4]:
import os

os.environ["WANDB_DISABLED"] = "true"


# Load and Prepare the Dataset

In [5]:
# We will convert the data into a format compatible with HuggingFace's Trainer API.

# Load the training data
train_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

# Preprocess the text (basic cleaning)
def preprocess_text(text):
    return text.lower()

train_data['text'] = train_data['text'].apply(preprocess_text)

# Split data into training and validation sets
train_texts = train_data['text'].tolist()
train_labels = train_data['generated'].tolist()

train_size = int(0.8 * len(train_texts))
train_texts, val_texts = train_texts[:train_size], train_texts[train_size:]
train_labels, val_labels = train_labels[:train_size], train_labels[train_size:]


# Tokenize the Data

In [6]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in tokens.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


# Model Selection

In [7]:
# Initialize tokenizer
model_name = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_len = 768
# max_len = 512

# Prepare tokenized datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_len)

# Model Selection
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)
# model.gradient_checkpointing_enable()

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define Trainer and Training Arguments

In [9]:
# Define Trainer and Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=4, # Reduced from 16
    per_device_eval_batch_size=4,  # Reduced from 16
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    gradient_accumulation_steps=4,  # Increase to compensate for reduced batch size
    fp16=True,            # Mixed precision for memory efficiency
    dataloader_num_workers=4,
    seed=42,
    report_to=None,       # Disable Weights & Biases logging
    optim="adamw_torch",  # Use AdamW Torch optimizer
    torch_compile=True,   # Enable Torch compilation
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)


The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


# Fine-tune the Model

In [10]:
trainer.train()




Step,Training Loss,Validation Loss


TrainOutput(global_step=340, training_loss=0.034912816566579484, metrics={'train_runtime': 1551.1444, 'train_samples_per_second': 7.104, 'train_steps_per_second': 0.219, 'total_flos': 4230112297826304.0, 'train_loss': 0.034912816566579484, 'epoch': 9.72463768115942})

# Evaluation

In [11]:
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)




Evaluation Results: {'eval_loss': 0.07502565532922745, 'eval_runtime': 16.2641, 'eval_samples_per_second': 16.97, 'eval_steps_per_second': 2.152, 'epoch': 9.72463768115942}


# Make Predictions on Test Set

In [12]:
# Once your model is fine-tuned, you can use it to make predictions on the test set:

from scipy.special import expit  # Sigmoid function

# Load and preprocess test data
test_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_data['text'] = test_data['text'].apply(preprocess_text)

# Convert test data to a dataset
test_texts = test_data['text'].tolist()
test_ids = test_data['id'].tolist()
test_dataset = CustomDataset(test_texts, None, tokenizer, max_len)

# Create a DataLoader for batch processing
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize Trainer
trainer = Trainer(model=model, tokenizer=tokenizer)

# Make predictions
predictions = trainer.predict(test_dataset)
# Apply sigmoid to get probabilities
probs = expit(predictions.predictions[:, 1])

# Normalize predictions if required
# normalized_probs = pd.Series(probs).rank(method='min') / len(probs)  # Rank normalization
# Use normalized_probs if normalization is needed
submission = pd.DataFrame({'id': test_ids, 'generated': probs})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


  trainer = Trainer(model=model, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Submission file created successfully!


In [13]:
model.save_pretrained('/kaggle/working/deberta_model')
tokenizer.save_pretrained('/kaggle/working/deberta_tokenizer')


('/kaggle/working/deberta_tokenizer/tokenizer_config.json',
 '/kaggle/working/deberta_tokenizer/special_tokens_map.json',
 '/kaggle/working/deberta_tokenizer/spm.model',
 '/kaggle/working/deberta_tokenizer/added_tokens.json',
 '/kaggle/working/deberta_tokenizer/tokenizer.json')