In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
from torch.optim import AdamW
import os # Import os for checking file existence

# --- Configuration ---
# Set the maximum sequence length for BERT tokenizer. 128 is a common choice.
MAX_LEN = 128
# Set the batch size for DataLoader. Larger batches can train faster but use more memory.
BATCH_SIZE = 32
# Number of training epochs. More epochs can lead to better performance but risk overfitting.
EPOCHS = 3
# Learning rate for the AdamW optimizer. A common range is 1e-5 to 5e-5 for BERT fine-tuning.
LEARNING_RATE = 2e-5

# --- Data Loading ---
# Define file paths for training and testing data
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'
OUTPUT_FILE = 'test_with_predictions.csv'

# Load the train and test datasets. Use a try-except block for robust file loading.
try:
    train_df = pd.read_csv(TRAIN_FILE)
    test_df = pd.read_csv(TEST_FILE)
    print(f"Successfully loaded '{TRAIN_FILE}' and '{TEST_FILE}'.")
except FileNotFoundError as e:
    print(f"Error: One or both of the CSV files not found. Please ensure '{TRAIN_FILE}' and '{TEST_FILE}' are in the same directory.")
    print(f"Details: {e}")
    exit() # Exit the script if essential files are missing
except Exception as e:
    print(f"An unexpected error occurred during file loading: {e}")
    exit()

# --- Data Cleaning Function ---
# This function cleans text by removing special characters and extra spaces.
def clean_text(text):
    """
    Cleans the input text by:
    1. Removing specified special characters (*@#$%).
    2. Replacing multiple spaces with a single space.
    3. Stripping leading/trailing whitespace.
    """
    text = str(text) # Ensure text is a string
    text = re.sub(r'[*@#$%%]+', '', text) # Remove specified special characters
    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with single space and strip
    return text

# Apply the cleaning function to the 'Text' column of both dataframes
print("Cleaning text data...")
train_df['Text'] = train_df['Text'].apply(clean_text)
test_df['Text'] = test_df['Text'].apply(clean_text)
print("Text cleaning complete.")

# --- Label Encoding ---
# Initialize LabelEncoder to convert categorical 'Subject' labels into numerical format.
label_encoder = LabelEncoder()
print("Encoding subject labels...")
train_df['Subject_encoded'] = label_encoder.fit_transform(train_df['Subject'])
# Determine the number of unique labels, which is required for the BERT model's output layer.
num_labels = len(np.unique(train_df['Subject_encoded']))
print(f"Found {num_labels} unique subjects: {list(label_encoder.classes_)}")
print("Label encoding complete.")

# --- Split Training Data for Validation ---
# Split the training data into training and validation sets.
# stratify ensures that the proportion of labels is the same in both train and validation sets.
print("Splitting data into training and validation sets...")
X_train, X_val, y_train, y_val = train_test_split(
    train_df['Text'], train_df['Subject_encoded'],
    test_size=0.1, # 10% of the data will be used for validation
    random_state=42, # For reproducibility
    stratify=train_df['Subject_encoded'] # Maintain class distribution
)
print(f"Training set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")

# --- BERT Tokenizer ---
# Load the pre-trained BERT tokenizer. 'bert-base-uncased' is a popular choice for English text.
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print("Tokenizer loaded.")

# --- Tokenization Function ---
# This function tokenizes text data into input IDs and attention masks, and optionally labels.
def tokenize_data(texts, labels=None, max_len=MAX_LEN):
    """
    Tokenizes a list of texts using the pre-trained BERT tokenizer.

    Args:
        texts (pd.Series): A pandas Series containing the texts to tokenize.
        labels (pd.Series, optional): A pandas Series containing the numerical labels.
                                      If provided, labels are converted to torch.tensor.
        max_len (int): The maximum sequence length for tokenization.

    Returns:
        tuple: (input_ids, attention_masks) if labels is None,
               (input_ids, attention_masks, labels) otherwise.
    """
    input_ids = []
    attention_masks = []

    print(f"Tokenizing {len(texts)} samples with max_len={max_len}...")
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=max_len,           # Pad/truncate to `max_len`
            padding='max_length',         # Pad to `max_len`
            return_attention_mask=True,   # Return attention mask
            return_tensors='pt',          # Return PyTorch tensors
            truncation=True               # Explicitly truncate longer sequences
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    if labels is not None:
        labels = torch.tensor(labels.values) # Convert pandas Series to numpy array then to tensor
        print("Tokenization complete (with labels).")
        return input_ids, attention_masks, labels
    else:
        print("Tokenization complete (without labels).")
        return input_ids, attention_masks

# Tokenize the training, validation, and test datasets
train_inputs, train_masks, train_labels = tokenize_data(X_train, y_train)
val_inputs, val_masks, val_labels = tokenize_data(X_val, y_val)
test_inputs, test_masks = tokenize_data(test_df['Text'])

# --- Custom PyTorch Dataset Class ---
# A custom Dataset class is needed to work with PyTorch's DataLoader.
class TextDataset(Dataset):
    """
    Custom PyTorch Dataset for handling tokenized text data.
    """
    def __init__(self, input_ids, attention_masks, labels=None):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        # Returns the total number of samples in the dataset
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Returns a sample from the dataset at the specified index
        if self.labels is not None:
            return self.input_ids[idx], self.attention_masks[idx], self.labels[idx]
        else:
            return self.input_ids[idx], self.attention_masks[idx]

# Create instances of the TextDataset for training, validation, and testing
train_dataset = TextDataset(train_inputs, train_masks, train_labels)
val_dataset = TextDataset(val_inputs, val_masks, val_labels)
test_dataset = TextDataset(test_inputs, test_masks)

# --- DataLoader Creation ---
# DataLoaders wrap the datasets and provide iterators that allow easy access to batches of data.
print(f"Creating DataLoaders with batch_size={BATCH_SIZE}...")
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False) # No need to shuffle validation data
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False) # No need to shuffle test data
print("DataLoaders created.")

# --- Model Loading and Device Setup ---
# Set the device to GPU if available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pre-trained BERT model for sequence classification.
# The num_labels argument tells the model how many output classes to expect.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,             # The number of output labels
    output_attentions=False,           # Whether the model returns attentions weights.
    output_hidden_states=False,        # Whether the model returns all hidden-states.
).to(device) # Move the model to the selected device (GPU/CPU)
print("BERT model loaded and moved to device.")

# --- Optimizer Setup ---
# Initialize the AdamW optimizer. AdamW is recommended for BERT fine-tuning.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
print(f"Optimizer initialized with learning rate: {LEARNING_RATE}")

# --- Training Loop ---
print("\nStarting training process...")
for epoch in range(EPOCHS):
    print(f"\n===== Epoch {epoch + 1}/{EPOCHS} =====")

    # Set model to training mode
    model.train()
    total_loss = 0

    # Iterate over batches from the training DataLoader
    for step, batch in enumerate(train_dataloader):
        # Progress update
        if step % 100 == 0 and not step == 0:
            print(f'  Batch {step} of {len(train_dataloader)}.')

        # Unpack the batch and move tensors to the configured device
        b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]

        # Clear any previously calculated gradients
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch)
        # The 'labels' argument calculates the loss internally.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Get the loss from the model's outputs
        loss = outputs.loss
        total_loss += loss.item() # Accumulate the loss

        # Perform a backward pass to calculate gradients
        loss.backward()

        # Clip the norm of the gradients to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update model parameters
        optimizer.step()

    # Calculate the average training loss for the current epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"\nAverage training loss: {avg_train_loss:.4f}")

    # --- Validation Phase ---
    print("Running Validation...")
    # Set model to evaluation mode
    model.eval()
    val_preds = []
    val_true = []

    # Disable gradient calculation for validation (saves memory and speeds up computation)
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]

            # Perform a forward pass to get logits
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
            logits = outputs.logits

            # Move logits and labels to CPU to compute predictions
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Get predicted labels (index of the highest logit)
            preds = np.argmax(logits, axis=1).flatten()

            val_preds.extend(preds)
            val_true.extend(label_ids)

    # Calculate validation accuracy
    val_accuracy = np.sum(np.array(val_preds) == np.array(val_true)) / len(val_true)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

print("\nTraining complete. Predicting on test data...")

# --- Prediction on Test Data ---
# Set model to evaluation mode
model.eval()
test_preds = []

# Disable gradient calculation for prediction
with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
        # Progress update
        if step % 100 == 0 and not step == 0:
            print(f'  Predicting Batch {step} of {len(test_dataloader)}.')

        b_input_ids, b_input_mask = [b.to(device) for b in batch]

        # Perform a forward pass to get logits
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        logits = outputs.logits

        # Move logits to CPU and get predicted labels
        logits = logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1).flatten()
        test_preds.extend(preds)

# Decode predictions back to original string labels using the label_encoder
predicted_subjects = label_encoder.inverse_transform(test_preds)
test_df['Predicted_Subject'] = predicted_subjects

# --- Save Predictions ---
# Save the test dataframe with the new 'Predicted_Subject' column to a CSV file.
try:
    test_df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nPredictions saved to '{OUTPUT_FILE}'")
except Exception as e:
    print(f"Error saving predictions to CSV: {e}")

Successfully loaded 'train.csv' and 'test.csv'.
Cleaning text data...
Text cleaning complete.
Encoding subject labels...
Found 7 unique subjects: ['Computer Sciences', 'Gaming', 'Geography', 'History', 'Natural Sciences', 'Pop Culture', 'Sports']
Label encoding complete.
Splitting data into training and validation sets...
Training set size: 9000 samples
Validation set size: 1000 samples
Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizer loaded.
Tokenizing 9000 samples with max_len=128...
Tokenization complete (with labels).
Tokenizing 1000 samples with max_len=128...
Tokenization complete (with labels).
Tokenizing 4020 samples with max_len=128...
Tokenization complete (without labels).
Creating DataLoaders with batch_size=32...
DataLoaders created.
Using device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT model loaded and moved to device.
Optimizer initialized with learning rate: 2e-05

Starting training process...

===== Epoch 1/3 =====
  Batch 100 of 282.
  Batch 200 of 282.

Average training loss: 0.6980
Running Validation...
Validation Accuracy: 0.8990

===== Epoch 2/3 =====
  Batch 100 of 282.
  Batch 200 of 282.

Average training loss: 0.3519
Running Validation...
Validation Accuracy: 0.9020

===== Epoch 3/3 =====
  Batch 100 of 282.
  Batch 200 of 282.

Average training loss: 0.2778
Running Validation...
Validation Accuracy: 0.9030

Training complete. Predicting on test data...
  Predicting Batch 100 of 126.

Predictions saved to 'test_with_predictions.csv'


In [2]:
data = pd.read_csv("test_with_predictions.csv")

In [6]:
new_data = data[['ID', 'Predicted_Subject']]

In [7]:
new_data.rename(columns={'Predicted_Subject':'Subject'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data.rename(columns={'Predicted_Subject':'Subject'}, inplace=True)


In [9]:
new_data.to_csv('results.csv', index = False)