# ============================================
# AIDI 1002 Final Project — TextCNN-SE (NLP)
# Reproduction + Significant Contributions
# - Depthwise Separable TextCNN + SE attention
# - Label Smoothing CE
# - Adam → SGD + CosineAnnealingLR
# - Embedding Dropout + Word Dropout
# - Windows-safe DataLoaders, FAST toggle
# ============================================


Step 1 - Imports, config, toggles
-

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import string
import re

# --- Step 1: Data Loading and Preprocessing ---

def preprocess_text(text):
    """Normalise a raw string and split it into word tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` or
    whitespace is deleted (not replaced by a space, so ``"U.S."`` becomes
    ``"us"``), and the remainder is split on runs of whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings; empty when nothing survives the filtering.
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower()).split()

# Fetch AG News and tokenise both splits with preprocess_text.
# Labels are taken from the dataset unchanged.
print("Loading the ag_news dataset...")
ag_news = load_dataset("ag_news")

train_texts = list(map(preprocess_text, ag_news['train']['text']))
train_labels = ag_news['train']['label']

test_texts = list(map(preprocess_text, ag_news['test']['text']))
test_labels = ag_news['test']['label']

# Build the vocabulary from the training split only.
print("Building vocabulary...")
word_counts = Counter()
for text in train_texts:
    for token in text:
        word_counts[token] += 1

# Words are numbered from 1 in descending frequency order; index 0 is
# reserved for the unknown-word token and assigned last.
ranked_words = word_counts.most_common()
vocab = dict(zip((word for word, _ in ranked_words), range(1, len(ranked_words) + 1)))
vocab['<unk>'] = 0

def text_to_tensor(texts, vocab, max_len=None, pad_idx=0):
    """Encode tokenised texts as a fixed-width tensor of vocabulary indices.

    Each text is mapped token-by-token through ``vocab`` (unknown tokens map
    to ``vocab['<unk>']``), then truncated or right-padded to ``max_len``.

    Args:
        texts: Sequence of token lists. It is traversed twice when
            ``max_len`` is None, so it must be re-iterable in that case.
        vocab: Mapping from token to integer index; must contain ``'<unk>'``.
        max_len: Target row width. When None, the length of the longest text
            is used (0 if ``texts`` is empty).
        pad_idx: Index written into padding positions. Defaults to 0, the
            value that was previously hard-coded.

    Returns:
        A ``torch.long`` tensor of shape ``(len(texts), max_len)``. An empty
        ``texts`` yields a ``(0, max_len)`` tensor.

    Raises:
        KeyError: If ``vocab`` has no ``'<unk>'`` entry.
    """
    if max_len is None:
        # default=0 keeps an empty corpus from raising inside max().
        max_len = max((len(t) for t in texts), default=0)

    # Resolve the fallback index once instead of once per token.
    unk_idx = vocab['<unk>']

    encoded_texts = []
    for text in texts:
        encoded = [vocab.get(word, unk_idx) for word in text][:max_len]
        encoded.extend([pad_idx] * (max_len - len(encoded)))
        encoded_texts.append(encoded)

    if not encoded_texts:
        # A bare empty list would become a 1-D tensor; keep the result 2-D.
        return torch.empty((0, max_len), dtype=torch.long)

    return torch.tensor(encoded_texts, dtype=torch.long)

# Encode both splits to fixed-width index tensors (truncate / pad to 50 tokens).
print("Encoding and padding text data...")
max_seq_len = 50
train_tensor, test_tensor = (
    text_to_tensor(split_texts, vocab, max_len=max_seq_len)
    for split_texts in (train_texts, test_texts)
)

# Class labels as integer tensors, in the same order as the encoded texts.
train_labels_tensor = torch.LongTensor(train_labels)
test_labels_tensor = torch.LongTensor(test_labels)

# Pair inputs with labels, then batch them; only the training split is shuffled.
train_dataset = TensorDataset(train_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)

test_dataset = TensorDataset(test_tensor, test_labels_tensor)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

print("Data preparation complete.")


Loading the ag_news dataset...
Building vocabulary...
Encoding and padding text data...
Data preparation complete.


# --- Step 2: Define the Lightweight CNN Model ---

In [2]:

class LightweightCNN(nn.Module):
    """Two-branch 1-D convolutional text classifier.

    Token indices are embedded and fed to two parallel convolutional
    branches: a stack of two padded kernel-3 convolutions, and a single
    unpadded kernel-3 convolution. Each branch is max-pooled over the
    sequence axis, the pooled vectors are concatenated, passed through
    dropout, and projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        n_filters: Output channels of every convolution.
        filter_sizes: Kept for interface compatibility. The branch layout is
            fixed, so this value no longer influences any layer size.
        output_dim: Number of output classes.
        dropout: Dropout probability applied to the pooled features.
        pad_idx: Embedding index treated as padding by ``nn.Embedding``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Branch 1: two stacked kernel-3 convolutions, length-preserving (padding=1).
        self.conv_2_3x3 = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=n_filters, out_channels=n_filters, kernel_size=3, padding=1)
        )

        # Branch 2: one unpadded kernel-3 convolution (needs sequence length >= 3).
        self.conv_3x3 = nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=3)

        # forward() always concatenates exactly the two branches above, so the
        # classifier width must follow that count. Sizing it from
        # len(filter_sizes) produced a shape mismatch in forward() whenever
        # filter_sizes did not have exactly two entries.
        n_branches = 2
        self.fc = nn.Linear(n_branches * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute class logits for a batch of token-index sequences.

        Args:
            text: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, output_dim)`` holding unnormalised logits.
        """
        embedded = self.embedding(text)
        # Conv1d expects channels first: (batch, embedding_dim, seq_len).
        embedded = embedded.permute(0, 2, 1)

        conv_out_2_3x3 = self.conv_2_3x3(embedded)
        conv_out_3x3 = self.conv_3x3(embedded)

        # Global max-pool: the pooling window spans the whole sequence axis.
        pooled_2_3x3 = F.max_pool1d(conv_out_2_3x3, conv_out_2_3x3.shape[2]).squeeze(2)
        pooled_3x3 = F.max_pool1d(conv_out_3x3, conv_out_3x3.shape[2]).squeeze(2)

        pooled_features = torch.cat((pooled_2_3x3, pooled_3x3), dim=1)

        logits = self.fc(self.dropout(pooled_features))

        return logits

# --- Step 3: Training and Evaluation Functions ---

In [3]:

def train_model(model, train_loader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    The model is switched to training mode and left in it. Every batch gets
    a zero-grad, forward pass, backward pass and optimizer step.

    Args:
        model: Callable module mapping a batch of inputs to logits.
        train_loader: Iterable yielding ``(texts, labels)`` batches. It does
            not need to support ``len()``.
        optimizer: Optimizer holding the model's parameters.
        criterion: Loss function called as ``criterion(logits, labels)``.

    Returns:
        The arithmetic mean of the per-batch loss values as a float.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        logits = model(texts)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Count while iterating so loaders without __len__ are supported.
        n_batches += 1

    if n_batches == 0:
        raise ValueError("train_loader yielded no batches")
    return total_loss / n_batches

def evaluate(model, loader, criterion):
    """Score the model on a data loader without updating its weights.

    The model is switched to evaluation mode and left in it. Gradients are
    disabled for the whole pass.

    Args:
        model: Callable module mapping a batch of inputs to logits.
        loader: Iterable yielding ``(texts, labels)`` batches. It does not
            need to support ``len()``.
        criterion: Loss function called as ``criterion(logits, labels)``.

    Returns:
        A tuple ``(accuracy, f1, mean_loss)``: accuracy and weighted-average
        F1 over all examples, and the arithmetic mean of the per-batch losses.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for texts, labels in loader:
            logits = model(texts)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            # Count while iterating so loaders without __len__ are supported.
            n_batches += 1

            # Predicted class is the index of the largest logit per row.
            predicted_classes = torch.argmax(logits, dim=1)
            predictions.extend(predicted_classes.tolist())
            true_labels.extend(labels.tolist())

    if n_batches == 0:
        raise ValueError("loader yielded no batches")

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    return accuracy, f1, total_loss / n_batches


# --- Step 4: Run the Training and Evaluation Workflow ---

In [4]:

# Hyperparameters: sizes derived from the data first, then the fixed choices.
VOCAB_SIZE = len(vocab)
OUTPUT_DIM = len(set(train_labels))
# Index 0 is shared by the '<unk>' token and the padding written by text_to_tensor.
PAD_IDX = vocab['<unk>']
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3, 5]
DROPOUT = 0.5
N_EPOCHS = 5

# Model, optimizer (Adam with library defaults) and loss function.
model = LightweightCNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Train for a fixed number of epochs, reporting the mean batch loss each time.
print("\nStarting training...")
for epoch in range(N_EPOCHS):
    train_loss = train_model(
        model=model, train_loader=train_loader, optimizer=optimizer, criterion=criterion
    )
    print(f"Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}")

# Single evaluation pass on the held-out test split.
print("\nEvaluating on the test set...")
test_metrics = evaluate(model=model, loader=test_loader, criterion=criterion)
test_accuracy, test_f1, test_loss = test_metrics

print(f'\nTest Loss: {test_loss:.3f}')
print(f'Test Accuracy: {test_accuracy:.3f}')
print(f'Test F1 Score: {test_f1:.3f}')


Starting training...
Epoch: 01 | Train Loss: 0.584
Epoch: 02 | Train Loss: 0.315
Epoch: 03 | Train Loss: 0.243
Epoch: 04 | Train Loss: 0.197
Epoch: 05 | Train Loss: 0.163

Evaluating on the test set...

Test Loss: 0.306
Test Accuracy: 0.909
Test F1 Score: 0.909
