In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Upgrading and installing packages

In [None]:
!pip install keras-core --upgrade
!pip install -q keras-nlp
!pip install nltk

#### Loading input data

In [None]:
# Read the four competition CSV files, in order, into their own DataFrames
train_prompt, test_essays, train_essays, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv',
        '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv',
        '/kaggle/input/llm-detect-ai-generated-text/train_essays.csv',
        '/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv',
    )
)

#### Peeking input data

In [None]:
# Show the first few rows of the prompts table loaded from train_prompts.csv
train_prompt.head()

In [None]:
# Show the first few rows of the test essays loaded from test_essays.csv
test_essays.head()

In [None]:
# Show the first few rows of the training essays loaded from train_essays.csv
train_essays.head()

In [None]:
# Show the first few rows of the sample submission to see the expected output layout
sample_submission.head()

#### Importing libraries

In [None]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras_nlp
import seaborn as sns
import matplotlib.pyplot as plt


# Report the versions of the deep-learning libraries imported above
for label, module in (
    ("TensorFlow version:", tf),
    ("Keras version:", keras),
    ("KerasNLP version:", keras_nlp),
):
    print(label, module.__version__)

#### Text preprocessing required for modeling

In [None]:
# Text Preprocessing
import re
from nltk.corpus import stopwords
# Build the English stop-word set used by clean_text.
# NLTK raises LookupError when the 'stopwords' corpus is not installed locally,
# so fetch it once and retry instead of failing the whole notebook.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalise one essay into a space-separated string of lowercase words.

    Steps, in order:
      1. Delete every character that is neither a word character nor whitespace.
         Punctuation is removed, not replaced by a space, so "well-known"
         becomes "wellknown".
      2. Split on whitespace.
      3. Keep only purely alphabetic tokens and lowercase them.
      4. Drop tokens found in the stop-word set.

    Args:
        text: The raw essay text. Non-string values (e.g. NaN/None coming from a
            missing DataFrame cell) are treated as empty text.
        stopword_set: Optional collection of lowercase words to remove. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single string; '' when nothing survives.
    """
    # Missing cells reach .apply() as float NaN / None; re.sub would raise TypeError on them
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Remove punctuation (anything that is not a word character or whitespace)
    stripped = re.sub(r'[^\w\s]', '', text)

    kept = []
    for token in stripped.split():
        # Tokens containing digits or underscores are discarded entirely
        if not token.isalpha():
            continue
        token = token.lower()
        if token not in stopword_set:
            kept.append(token)
    return ' '.join(kept)

## Add a 'clean_text' column holding the cleaned version of every training essay
train_essays['clean_text'] = train_essays['text'].map(clean_text)

### Code Explanation:

This code preprocesses the input data so that it can be consumed by the BERT model. Here is a detailed explanation of each part:

### Split Data:
The given code snippet begins by splitting the data into input features (`texts`) and the dependent variable (`labels`). These are extracted from a DataFrame named `train_essays`.

### Initialize BERT Tokenizer:
The BERT tokenizer (`BertTokenizer.from_pretrained('bert-base-uncased')`) is initialized. This tokenizer will be used to convert the raw text input into tokenized sequences suitable for BERT model input.

### Tokenize Texts:
The BERT tokenizer is applied to the list of texts (`tokenizer(texts, padding=True, truncation=True, return_tensors='pt')`). This tokenizes the texts, pads every sequence to the length of the longest one so that all sequences have a uniform length, and truncates sequences that exceed the model's maximum input length. The result is a dictionary-like object containing the tokenized inputs suitable for the BERT model, returned as PyTorch tensors.

### Convert Labels to Tensors:
The labels are converted into PyTorch tensors (`torch.tensor(labels)`). This step is necessary to ensure compatibility with PyTorch operations.

### Split Data into Train and Validation Sets:
The `train_test_split` function from scikit-learn is used to split the tokenized texts and labels into training and validation sets (`train_test_split(tokenized_texts['input_ids'], labels, test_size=0.2, random_state=42)`). This function divides the dataset into a training set (80% of the data) and a validation set (20% of the data) while maintaining the class distribution.

### Create PyTorch Datasets:
PyTorch `TensorDataset` is created for both the training and validation sets (`TensorDataset(train_texts, train_labels)` and `TensorDataset(val_texts, val_labels)`). This is done to organize the tokenized texts and labels into a format that can be easily consumed by PyTorch's `DataLoader`.

### Create PyTorch Dataloaders:
PyTorch `DataLoader` objects are created for both the training and validation datasets (`DataLoader(train_dataset, batch_size=batch_size, shuffle=True)` and `DataLoader(val_dataset, batch_size=batch_size)`). These DataLoader objects are responsible for batching the data, shuffling (only for training), and loading the data efficiently during training and validation loops.


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Split data into input features and dependent variable
texts = train_essays['clean_text'].tolist()  # Cleaned essays, one string per row
labels = train_essays['generated'].tolist()  # Target column (presumably 1 = AI-generated; verify against the data description)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize all essays at once: pad to the longest sequence, truncate over-long ones,
# and return PyTorch tensors
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Convert dependent variable to tensor variable
labels = torch.tensor(labels)

# Split data into train and validation sets using train_test_split.
# stratify keeps the class ratio the same in both splits; without it a rare class
# can end up entirely in one split, which makes the validation metrics meaningless.
# NOTE(review): stratification requires at least two samples of every class.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tokenized_texts['input_ids'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels.numpy(),
)

# Create PyTorch datasets pairing each token-id sequence with its label
train_dataset = TensorDataset(train_texts, train_labels)  # Dataset for training data
val_dataset = TensorDataset(val_texts, val_labels)  # Dataset for validation data

# Create PyTorch dataloaders
batch_size = 16  # Define batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # Shuffled each epoch
val_loader = DataLoader(val_dataset, batch_size=batch_size)  # Fixed order for evaluation

### Code Explanation:

This code trains a BERT model for sequence classification. Here's a detailed explanation of each part:

### Initialization:
- `best_val_loss` is initialized with a high value (`float('inf')`), and `best_model_params` is initialized as `None` to keep track of the best model parameters.
- The BERT model for sequence classification is initialized using `BertForSequenceClassification.from_pretrained('bert-base-uncased')`.
- AdamW optimizer is instantiated with a learning rate of `5e-5`, and Cross Entropy Loss is defined as the criterion for optimization.

### Training Loop:
- The loop iterates over the specified number of epochs (`epochs`).
- Inside each epoch, the model is set to training mode (`model.train()`).
- Training data is iterated over in batches using tqdm for progress visualization.
- For each batch, the inputs and labels are moved to the appropriate device (GPU or CPU), gradients are zeroed, forward pass is performed, loss is computed, backward pass is performed, and optimizer is updated.
- Training loss and accuracy are calculated and printed.

### Validation Loop:
- After each epoch, the model is set to evaluation mode (`model.eval()`).
- Validation data is iterated over to calculate validation loss and accuracy.
- Gradients are not computed during validation.
- Validation loss and accuracy are calculated and printed.

### Best Model Tracking:
- The code tracks the best model based on validation loss.
- If the current model has a lower validation loss than the previous best model, its parameters are saved as the best model parameters.

### Output:
- The output includes epoch-wise training and validation loss, as well as accuracy.


In [None]:
import torch
import torch.nn as nn
# transformers.AdamW is deprecated in favour of the PyTorch implementation
from torch.optim import AdamW
from transformers import BertForSequenceClassification
from tqdm import tqdm

# Initialize best validation loss with a high value
best_val_loss = float('inf')
# Initialize best model parameters as None (set the first time validation loss improves)
best_model_params = None

# Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Define optimizer and loss function.
# eps and weight_decay are spelled out to keep the defaults of the former
# transformers.AdamW (torch.optim.AdamW defaults to eps=1e-8, weight_decay=0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()

# Training loop
## Setting up PyTorch device as GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
epochs = 3

# The loaders only carry input_ids, so the attention mask is rebuilt from the
# padding token id; without a mask the model would attend to padding positions.
pad_token_id = model.config.pad_token_id

# Loop over each epoch
for epoch in range(epochs):
    ## Training the model
    # Set the model to training mode
    model.train()
    ## Resetting running loss, correct predictions and total predictions for each epoch
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    # Iterate over batches in the training DataLoader
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False):
        # Move inputs and labels to the appropriate device (GPU or CPU)
        inputs = inputs.to(device)
        labels = labels.to(device)
        # 1 for real tokens, 0 for padding
        attention_mask = (inputs != pad_token_id).long()

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=inputs, attention_mask=attention_mask)

        # Calculate the loss
        loss = criterion(outputs.logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item()

        # Calculate and update correct and total predictions for accuracy calculation
        _, predicted = torch.max(outputs.logits, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    # Calculate average training loss (per batch) and accuracy (per sample) for the epoch
    train_loss = running_loss / len(train_loader)
    train_accuracy = correct_predictions / total_predictions

    # Validation loop
    # Set the model to evaluation mode
    model.eval()
    val_running_loss = 0.0
    val_correct_predictions = 0
    val_total_predictions = 0

    # Iterate over batches in the validation DataLoader without tracking gradients
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            attention_mask = (inputs != pad_token_id).long()

            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            # Update running loss for validation
            val_running_loss += loss.item()

            # Calculate and update correct and total predictions for validation accuracy calculation
            _, predicted = torch.max(outputs.logits, 1)
            val_correct_predictions += (predicted == labels).sum().item()
            val_total_predictions += labels.size(0)

    # Calculate average validation loss and accuracy for the epoch
    val_loss = val_running_loss / len(val_loader)
    val_accuracy = val_correct_predictions / val_total_predictions

    # Check if this is the best model so far based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() returns references to the live parameter tensors, and a
        # shallow dict copy keeps those references, so later optimizer steps would
        # silently overwrite the "best" weights. Clone every tensor (onto the CPU,
        # to avoid holding a second copy of the model in GPU memory).
        best_model_params = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    # Print epoch-wise training and validation loss, and accuracy
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')


In [None]:
# ## RESULTS
# Epoch 1/3, Train Loss: 0.0375, Train Accuracy: 0.9973, Val Loss: 0.0224, Val Accuracy: 0.9964
# Epoch 2/3, Train Loss: 0.0147, Train Accuracy: 0.9982, Val Loss: 0.0232, Val Accuracy: 0.9964
# Epoch 3/3, Train Loss: 0.0125, Train Accuracy: 0.9982, Val Loss: 0.0259, Val Accuracy: 0.9964

In [None]:
import torch
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader

# Instantiate the best model using the best parameters:
# build a fresh model from the same pretrained checkpoint, then overwrite all of
# its weights with the state dict captured during training (best_model_params).
best_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
best_model.load_state_dict(best_model_params)
# Switch to evaluation mode for inference; as the last expression of the cell,
# this call's return value is also what the notebook displays.
best_model.eval()

In [None]:
## Run the test essays through the same cleaning and tokenisation as the training data
test_essays['clean_text'] = test_essays['text'].map(clean_text)
texts = list(test_essays['clean_text'])
tokenized_texts = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [None]:
# Prediction on test data using the best model.
# The essays are scored in mini-batches on the available device: pushing the whole
# test set through a single forward pass can exhaust memory on a large test set.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_model.to(device)

inference_batch_size = 16
num_essays = tokenized_texts['input_ids'].size(0)
batch_logits = []
with torch.no_grad():
    for start in range(0, num_essays, inference_batch_size):
        # Slice every tokenizer output (ids, masks, ...) to the same rows
        batch = {
            name: tensor[start:start + inference_batch_size].to(device)
            for name, tensor in tokenized_texts.items()
        }
        # Collect logits on the CPU so results do not accumulate on the GPU
        batch_logits.append(best_model(**batch).logits.cpu())

logits = torch.cat(batch_logits, dim=0)

# Softmax over the class dimension; column 1 is used as the 'generated' score
predictions = torch.softmax(logits, dim=1)[:, 1]

In [None]:
# Create a submission DataFrame with essay IDs and corresponding predictions.
# The predictions tensor is converted to a NumPy array explicitly: how pandas
# coerces a raw torch.Tensor depends on the pandas version and can produce a
# column of tensor objects that is written to CSV as "tensor(...)".
submission = pd.DataFrame({
    'id': test_essays['id'],
    'generated': predictions.detach().cpu().numpy()
})

In [None]:
# Save the submission DataFrame to 'submission.csv' (relative path, so it lands in
# the current working directory); index=False keeps the row index out of the file.
submission.to_csv('submission.csv', index=False)