# Text Classification with PyTorch and TorchText

In [None]:
!pip install torch torchtext

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

In [None]:
# Tokenizer used for both vocabulary building and inference
# (torchtext's built-in "basic_english" tokenizer).
tokenizer = get_tokenizer("basic_english")
# Training split of AG_NEWS; iterated below as (label, text) pairs.
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    """Yield the token list of every text in ``data_iter``.

    ``data_iter`` must produce ``(label, text)`` pairs; the label is ignored
    and each text is split with the module-level ``tokenizer``.
    """
    for _label, raw_text in data_iter:
        yield tokenizer(raw_text)

# Build the vocabulary from the training texts; "<unk>" is registered as a
# special token and used as the default index for unknown tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>"],
)
vocab.set_default_index(vocab["<unk>"])

# Re-create the iterator for the data-loading step
# (presumably because the vocabulary pass above has consumed it).
train_iter = AG_NEWS(split='train')


def text_pipeline(x):
    """Convert a raw text string into a list of vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a dataset label to a zero-based integer class index."""
    return int(x) - 1


# Zero-based class index -> human-readable category name.
label_dict = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))

In [None]:
def collate_batch(batch):
    """Collate ``(label, text)`` samples into padded tensors on ``device``.

    Every text is converted to token ids with ``text_pipeline`` and every
    label to a class index with ``label_pipeline``. The id sequences are
    padded to the length of the longest one in the batch using
    ``pad_sequence`` with its default padding value.

    Args:
        batch: A list of ``(label, text)`` pairs.

    Returns:
        A ``(texts, labels)`` tuple of int64 tensors moved to ``device``;
        ``texts`` is batch-first and ``labels`` has one entry per sample.
    """
    # Encode text first, then label, for each sample in turn.
    encoded = [
        (torch.tensor(text_pipeline(text), dtype=torch.int64), label_pipeline(label))
        for label, text in batch
    ]
    padded_texts = pad_sequence([ids for ids, _ in encoded], batch_first=True)
    targets = torch.tensor([target for _, target in encoded], dtype=torch.int64)
    return padded_texts.to(device), targets.to(device)

# Materialise the training samples into a list, then batch them
# (32 per batch, reshuffled by the loader) through collate_batch.
train_data = list(train_iter)
train_loader = DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
class TextClassifier(nn.Module):
    """Averaged-embedding text classifier.

    Tokens are embedded, the embeddings are averaged along dimension 1,
    and a single linear layer maps the average to class scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_class: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text):
        """Return class scores for a batch of token-id sequences.

        Assumes ``text`` is a ``(batch, seq_len)`` integer tensor such as the
        one produced by ``collate_batch`` -- TODO confirm for other callers.
        The mean is taken over every position along dimension 1; no
        positions are masked out.
        """
        return self.fc(self.embedding(text).mean(dim=1))

# Model dimensions: one embedding row per vocabulary entry and one output
# score per category in label_dict.
vocab_size = len(vocab)
embed_dim = 64
num_class = len(label_dict)
model = TextClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_class=num_class,
).to(device)

# Cross-entropy loss on the class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def train_model(epochs=3):
    """Train the module-level ``model`` on ``train_loader``.

    Runs ``epochs`` full passes over ``train_loader``, updating ``model``
    with ``optimizer`` against ``criterion``. After each epoch it prints
    the sum of the per-batch losses and the accuracy measured on the
    outputs produced during that epoch.

    Args:
        epochs: Number of passes over the training data.
    """
    model.train()
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        for inputs, targets in train_loader:
            # Standard step: clear gradients, forward, backward, update.
            optimizer.zero_grad()
            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            batch_loss.backward()
            optimizer.step()

            # Running statistics for the end-of-epoch report.
            total_loss += batch_loss.item()
            hits = logits.argmax(1) == targets
            correct += hits.sum().item()
            total += targets.size(0)
        acc = 100 * correct / total
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Accuracy: {acc:.2f}%")

# Train with the default number of epochs (3).
train_model()

In [1]:
def predict(text):
    """Classify one raw text string and return its category name.

    The text is converted to token ids with ``text_pipeline``, run through
    the module-level ``model`` as a batch of one, and the highest-scoring
    class index is looked up in ``label_dict``.

    Note: this switches ``model`` to evaluation mode and leaves it there.

    Args:
        text: The raw string to classify.

    Returns:
        The category name from ``label_dict`` for the predicted class.

    Raises:
        ValueError: If ``text`` yields no tokens.
    """
    token_ids = text_pipeline(text)
    if not token_ids:
        # With no tokens there is nothing to average over, so fail early
        # with a clear message instead of a dtype error from the embedding.
        raise ValueError("Cannot classify empty text: no tokens were produced.")

    model.eval()
    with torch.no_grad():
        # Explicit int64 matches collate_batch and does not rely on
        # torch.tensor's dtype inference.
        text_tensor = torch.tensor(token_ids, dtype=torch.int64).unsqueeze(0).to(device)
        output = model(text_tensor)
        predicted = output.argmax(1).item()
        return label_dict[predicted]

# Quick check: classify one example sentence with the model defined above.
sample = "NASA launches a new telescope into orbit."
print("Predicted category:", predict(sample))

NameError: name 'model' is not defined