In [1]:
import os, sys, re, gc, time, tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import Dataset
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.nn.parallel import DataParallel
import pandas as pd
# Fetch the NLTK stop-word corpus; a later cell reads it via stopwords.words('english').
nltk.download('stopwords')

ModuleNotFoundError: No module named 'seaborn'

In [2]:
# Read the data depending on whether the notebook runs on Kaggle, Colab or locally.
# Every branch must define the same names, because the following cells use them:
#   kernel     - short tag for the detected environment
#   test       - competition essays to score
#   submission - sample submission file
#   org_train  - original competition training essays
#   extra      - external DAIGT v2 training set (train_v2_drcat_02.csv)
if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
    print("Running on Kaggle!")
    kernel = 'kaggle'
    test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
    submission = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
    org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
    extra = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')
elif "google.colab" in sys.modules:
    print("Running on Google Colab!")
    kernel = 'google_colab'
    # The data lives on Google Drive, so the drive has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive')
    data_path = '/content/drive/MyDrive/Kaggle/LLM_Detect_AI_Generated_Text/data/'
    test = pd.read_csv(data_path + "test_essays.csv")
    submission = pd.read_csv(data_path + "sample_submission.csv")
    org_train = pd.read_csv(data_path + "train_essays.csv")
    extra = pd.read_csv(data_path + "train_v2_drcat_02.csv", sep=",")
else:
    print("Running locally.")
    kernel = 'local'
    test = pd.read_csv("./data/test_essays.csv")
    submission = pd.read_csv("./data/sample_submission.csv")
    org_train = pd.read_csv("./data/train_essays.csv")
    # Must be bound to `extra` like in the other branches; the next cell
    # de-duplicates and cleans `extra` and would raise NameError otherwise.
    extra = pd.read_csv("./data/train_v2_drcat_02.csv", sep=",")

Running on Kaggle!


In [3]:
# Keep one row per distinct essay text and renumber the rows from zero
extra = extra.drop_duplicates(subset=['text']).reset_index(drop=True)

# Text preprocessing: English stop words from NLTK, held in a set for membership tests
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Reduce one essay to a space-separated string of lowercase content words.

    Characters that are neither word characters nor whitespace are removed,
    the result is split on whitespace, tokens that are not purely alphabetic
    are discarded, and the remaining tokens are lowercased and filtered
    against the module-level `stop_words` set.
    """
    without_punct = re.sub(r'[^\w\s]', '', text)
    kept = []
    for token in without_punct.split():
        if not token.isalpha():
            continue  # drops numbers and mixed tokens such as "abc123" or "snake_case"
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

# Store the normalised essay next to the untouched raw `text` column
extra['clean_text'] = extra['text'].apply(clean_text)

In [4]:
# Shuffle once with a fixed seed, then split into train / validation / test
# at 80% / 10% / 10%: 20% is held out first and then cut in half.
extra = extra.sample(frac=1.0, random_state=42).reset_index(drop=True)
X = extra['clean_text']
y = extra['label']

# First cut: 80% train vs. 20% temporary hold-out, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Second cut: the hold-out is divided equally into validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Train set size: 35894
Validation set size: 4487
Test set size: 4487


In [5]:
# Tokenizer and classification model are loaded from the same checkpoint name
checkpoint = "eljanmahammadli/bert-base-uncased-llm-detect-ai"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Last expression of the cell: moves the model and displays its architecture
model.to(device)

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
from torch.utils.data import DataLoader, TensorDataset

# Assuming your tokenizer and model are already defined and loaded

# Tokenize the held-out test split in one call; padding/truncation give every
# row the same length so the result can be wrapped in a TensorDataset
test_encodings = tokenizer(
    X_test.tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
)

# Batches are served in the original order so predictions line up with y_test
batch_size = 100  # tune to the available GPU memory
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Batched inference helper
def predict(model, dataloader):
    """Return the class-1 probability for every row served by `dataloader`.

    Args:
        model: callable whose output exposes `.logits`; it is switched to
            evaluation mode before inference.
        dataloader: iterable yielding `(input_ids, attention_mask)` pairs.

    Returns:
        A flat list with one probability per row, in dataloader order.

    Batches are moved to the module-level `device` before the forward pass.
    """
    model.eval()
    positive_probs = []

    # No gradients are needed for inference
    with torch.no_grad():
        for input_ids, attention_mask in tqdm.tqdm(dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            # Softmax over the class axis; column 1 is kept as the score
            probs = torch.softmax(logits, dim=1)
            positive_probs.extend(probs[:, 1].cpu().numpy())

    return positive_probs

# Score the held-out test split
predictions = predict(model, test_loader)

In [7]:
from sklearn.metrics import roc_auc_score

# `predictions` were produced from X_test, so this is the score on the
# held-out test split; the label says so instead of "Validation".
test_auc_roc = roc_auc_score(y_test, predictions)
print(f"AUC-ROC on Test Data: {test_auc_roc:.4f}")

AUC-ROC on Validation Data: 0.9998


In [8]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Turn the positive-class probabilities into hard 0/1 labels at a fixed cut-off
threshold = 0.5
class_predictions = [int(prob > threshold) for prob in predictions]

# True labels as a plain list, matching the type of the predicted labels
y_test_list = list(y_test)

# Both metrics compare the true labels (first argument) with the hard predictions
conf_matrix = confusion_matrix(y_test_list, class_predictions)
class_report = classification_report(y_test_list, class_predictions)

print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")


Confusion Matrix:
[[2718   19]
 [  11 1739]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2737
           1       0.99      0.99      0.99      1750

    accuracy                           0.99      4487
   macro avg       0.99      0.99      0.99      4487
weighted avg       0.99      0.99      0.99      4487



In [None]:
# Score the competition essays and write the submission file.

# The held-out evaluation in this notebook fed the model `clean_text` output,
# so the competition essays get the same preprocessing before tokenisation.
test_clean = test['text'].apply(clean_text).tolist()
test_inputs = tokenizer(test_clean, padding=True, truncation=True, return_tensors='pt')

# Run inference in mini-batches through `predict` instead of one forward pass
# over the whole test set, which can exhaust GPU memory on a full-size test file.
# `predict` moves each batch to `device` and disables gradients itself.
submission_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'])
submission_loader = DataLoader(submission_dataset, batch_size=batch_size, shuffle=False)

# Assuming the first column of logits corresponds to the negative class (non-AI-generated)
# and the second column corresponds to the positive class (AI-generated);
# `predict` returns the softmax probability of that second column.
predictions = np.asarray(predict(model, submission_loader))

# Create a submission DataFrame with essay IDs and corresponding predictions
submission = pd.DataFrame({
    'id': test['id'],
    'generated': predictions
})

# /kaggle/working only exists on Kaggle; elsewhere write next to the notebook
submission_path = '/kaggle/working/submission.csv' if kernel == 'kaggle' else 'submission.csv'
submission.to_csv(submission_path, index=False)
print(submission)

In [None]:
# Optional cleanup, currently disabled: uncomment the next three lines to drop
# the model and the tokenized test inputs and to clear the CUDA cache.
# del model
# del test_inputs
# torch.cuda.empty_cache()
# Run a garbage-collection pass; as the last expression its return value is displayed.
gc.collect()