In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch

In [3]:
# Load the data
train_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train_prompts = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv')
test_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

In [4]:
# Merge the essays with their prompts
train_data = pd.merge(train_essays, train_prompts, on='prompt_id', how='left')

In [5]:
# Preprocess the text data
# Add any additional preprocessing steps as needed (e.g., stemming, lemmatization)
train_data['processed_text'] = train_data['text'].apply(lambda x: x.lower())
test_essays['processed_text'] = test_essays['text'].apply(lambda x: x.lower())

In [6]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_data['processed_text'],
    train_data['generated'],
    test_size=0.2,
    random_state=42
)

In [7]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenize and encode the training data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128, return_tensors='pt')
train_labels = torch.tensor(list(y_train))

In [9]:
# Tokenize and encode the validation data
val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=128, return_tensors='pt')
val_labels = torch.tensor(list(y_val))

In [10]:
# Create a PyTorch dataset
class EssaysDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]

    def __len__(self):
        return len(self.labels)

train_dataset = EssaysDataset(train_encodings, train_labels)
val_dataset = EssaysDataset(val_encodings, val_labels)

In [11]:
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)



In [12]:
# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        inputs, labels = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]
Epoch 1/3: 100%|██████████| 138/138 [11:50<00:00,  5.15s/it]
Epoch 2/3: 100%|██████████| 138/138 [11:47<00:00,  5.12s/it]
Epoch 3/3: 100%|██████████| 138/138 [11:45<00:00,  5.11s/it]


In [13]:
# Evaluate on the validation set
model.eval()
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
val_preds = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):
        inputs, labels = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        val_preds.extend(preds.cpu().numpy())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]
Evaluating: 100%|██████████| 35/35 [00:54<00:00,  1.57s/it]


In [14]:
# Evaluate the model
accuracy = accuracy_score(y_val, val_preds)
print(f'Accuracy on validation set: {accuracy:.2f}')

Accuracy on validation set: 1.00


In [15]:
# Display classification report
print(classification_report(y_val, val_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.00      0.00      0.00         1

    accuracy                           1.00       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      1.00      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Make predictions on the hidden test set
test_encodings = tokenizer(list(test_essays['processed_text']), truncation=True, padding=True, max_length=128, return_tensors='pt')
test_dataset = EssaysDataset(test_encodings, torch.zeros(len(test_essays)))  # Dummy labels for prediction
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
test_preds = []

In [17]:
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Making Predictions"):
        inputs, _ = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]
Making Predictions: 100%|██████████| 1/1 [00:00<00:00, 11.93it/s]


In [18]:
# Create a submission file
submission_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_preds})
submission_df.to_csv('bert_submission.csv', index=False)