<a href="https://colab.research.google.com/github/dornercr/INFO371/blob/main/INFO371_Week9_NLP_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# INFO 371: NLP Lab – Full Lecture Parallel (AG News)
# Author: Charles Dorner, EdD (Candidate)

# ✅ Fix AG_NEWS loading issue by installing torchdata
!pip install torchdata --quiet

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

# 🔧 Manual Tensor Examples (Matching Lecture Style)
tensor_from_list = torch.tensor([1, 2, 3])
print("Manual tensor:", tensor_from_list)

rand_tensor = torch.rand(2, 3)
print("Random 2x3 tensor:", rand_tensor)

range_tensor = torch.arange(0, 6, dtype=torch.float32)
print("Range tensor:", range_tensor)

# 📥 Load AG News Dataset
import pandas as pd

# ✅ AG News CSVs manually loaded (alternative to AG_NEWS())
train_df = pd.read_csv("https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv", header=None)
test_df = pd.read_csv("https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv", header=None)

train_texts = train_df[1] + " " + train_df[2]
train_labels = train_df[0] - 1  # Zero-index
test_texts = test_df[1] + " " + test_df[2]
test_labels = test_df[0] - 1


# 🔠 Tokenization and Vocabulary
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data):
    for text in data:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<pad>"])
vocab.set_default_index(vocab["<pad>"])

# 🧱 Encoding and Padding
def encode(text):
    return torch.tensor(vocab(tokenizer(text)), dtype=torch.long)

def collate_batch(batch):
    text_list, label_list = [], []
    for _text, _label in batch:
        text_list.append(encode(_text))
        label_list.append(torch.tensor(_label, dtype=torch.long))
    text_list = pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    return text_list, torch.stack(label_list)

# 📦 Custom Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_ds = TextDataset(train_texts[:10000], train_labels[:10000])
test_ds = TextDataset(test_texts[:2000], test_labels[:2000])

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate_batch)

# 🧠 Model Definition
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        mean_emb = embedded.mean(dim=1)
        x = self.relu(self.fc1(mean_emb))
        return self.fc2(x)

# ⚙️ Training Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextClassifier(len(vocab), 64, 32, 4).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 🔁 Training Loop
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch, (X, y) in enumerate(train_dl):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if batch % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch}, Loss: {loss.item():.4f}")
    print(f"Epoch {epoch+1} complete | Total Loss: {total_loss:.4f}")

# 💾 Save Model
torch.save(model.state_dict(), "agnews_nlp_model.pt")
print("Model saved to agnews_nlp_model.pt")

# 📥 Load Model
model.load_state_dict(torch.load("agnews_nlp_model.pt"))
model.eval()

# 🧪 Evaluation (NumPy-safe)
y_true, y_pred = [], []
with torch.no_grad():
    for X, y in test_dl:
        X = X.to(device)
        out = model(X)
        preds = out.argmax(1).cpu().tolist()
        y_true.extend(y.tolist())
        y_pred.extend(preds)

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred))

# 📊 Optional: Misclassification Analysis
for i in range(10):
    if y_true[i] != y_pred[i]:
        print(f"❌ Predicted {y_pred[i]}, Actual {y_true[i]}: {test_texts[i][:100]}")


Manual tensor: tensor([1, 2, 3])
Random 2x3 tensor: tensor([[0.4764, 0.4802, 0.9512],
        [0.6237, 0.9207, 0.2991]])
Range tensor: tensor([0., 1., 2., 3., 4., 5.])
Epoch 1, Batch 0, Loss: 1.4208
Epoch 1, Batch 100, Loss: 1.3731
Epoch 1, Batch 200, Loss: 1.3243
Epoch 1, Batch 300, Loss: 1.2920
Epoch 1 complete | Total Loss: 418.3657
Epoch 2, Batch 0, Loss: 1.2665
Epoch 2, Batch 100, Loss: 1.0192
Epoch 2, Batch 200, Loss: 0.9617
Epoch 2, Batch 300, Loss: 0.5262
Epoch 2 complete | Total Loss: 275.5634
Epoch 3, Batch 0, Loss: 0.6237
Epoch 3, Batch 100, Loss: 0.5182
Epoch 3, Batch 200, Loss: 0.5081
Epoch 3, Batch 300, Loss: 0.5646
Epoch 3 complete | Total Loss: 161.3407
Model saved to agnews_nlp_model.pt
Accuracy: 0.798
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.85      0.81       511
           1       0.90      0.87      0.88       526
           2       0.71      0.75      0.73       449
           3       0.81      0