In [1]:
import numpy as np
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the 20 Newsgroups posts (subset='all'), limited to the categories below,
# with headers, footers and quoted replies removed from each post.
categories = [
    'sci.space',
    'rec.autos',
    'comp.graphics',
    'talk.politics.misc',
]  # optional
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Post bodies and their target labels, in matching order
texts, labels = data['data'], data['target']

# Minimal cleaner + tokenizer built on `re` and str.split (no nltk)
def preprocess(text):
    """Turn a raw document into a list of lowercase, stopword-free tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a-z or whitespace (digits,
         punctuation and non-ASCII letters are removed, not replaced by a
         space, so "well-known" becomes "wellknown");
      3. split on runs of whitespace;
      4. drop tokens found in ENGLISH_STOP_WORDS.

    Returns an empty list when nothing survives the filtering.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return [tok for tok in letters_only.split() if tok not in ENGLISH_STOP_WORDS]

# Tokenize every document with preprocess()
tokenized_texts = list(map(preprocess, texts))

# Encode the labels with a LabelEncoder (kept around as `label_encoder`)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Hold out 20% of the documents for testing; random_state fixes the split
X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(
    tokenized_texts,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the first training example and its encoded label
print("Tokenized sample:", X_train_tokens[0])
print("Label:", y_train[0])

Tokenized sample: ['interested', 'purchasing', 'grayscale', 'printer', 'offers', 'good', 'resoltuion', 'grayscale', 'medical', 'images', 'anybody', 'recommendations', 'products', 'market', 'particular', 'thank', 'advice']
Label: 0


In [2]:
from collections import defaultdict
from torch.nn.utils.rnn import pad_sequence
import torch

# Build the token -> id vocabulary from the training documents only.
# Ids 0 and 1 are reserved for padding and unknown tokens; every new token
# gets the next free id in order of first appearance.
word2idx = {'<PAD>': 0, '<UNK>': 1}
for tokens in X_train_tokens:
    for token in tokens:
        # len(word2idx) is evaluated before the insert, so it is the next free id;
        # setdefault leaves already-seen tokens untouched.
        word2idx.setdefault(token, len(word2idx))
idx = len(word2idx)  # next unused id

# Token list -> id list
def encode(tokens):
    """Return the vocabulary id of each token, in order.

    Tokens missing from `word2idx` are mapped to the id stored under '<UNK>'.
    An empty input yields an empty list.
    """
    ids = []
    for token in tokens:
        ids.append(word2idx.get(token, word2idx['<UNK>']))
    return ids

# Convert each document into a 1-D tensor of vocabulary ids.
# The dtype is pinned to torch.long: preprocess() can return an empty token
# list, and torch.tensor([]) would infer float32 for it. A float tensor among
# the sequences can make the padded batch float, which an embedding lookup
# rejects because it needs integer indices.
X_train_ids = [torch.tensor(encode(tokens), dtype=torch.long) for tokens in X_train_tokens]
X_test_ids = [torch.tensor(encode(tokens), dtype=torch.long) for tokens in X_test_tokens]

In [4]:
# Right-pad every sequence with 0 so each split becomes one 2-D tensor
# laid out as (batch, sequence) because batch_first=True.
# Each split is padded independently of the other.
X_train_padded, X_test_padded = (
    pad_sequence(id_tensors, batch_first=True, padding_value=0)
    for id_tensors in (X_train_ids, X_test_ids)
)

In [8]:
import os
import zipfile
import urllib.request
import numpy as np
import torch
import torch.nn as nn

# === Step 1: Download GloVe if not already present ===
# Files live under ./glove: the downloaded archive (glove.6B.zip) and the
# 100-dimensional vector file this notebook reads (glove.6B.100d.txt).
glove_dir = './glove'
glove_zip_path = os.path.join(glove_dir, 'glove.6B.zip')
glove_txt_path = os.path.join(glove_dir, 'glove.6B.100d.txt')

os.makedirs(glove_dir, exist_ok=True)

if not os.path.exists(glove_txt_path):
    # Reuse an archive left by an earlier run only if it is a readable zip;
    # is_zipfile() is False for a missing or truncated file, which forces a
    # fresh download instead of a failed extraction.
    if not zipfile.is_zipfile(glove_zip_path):
        print("Downloading GloVe embeddings...")
        url = 'https://nlp.stanford.edu/data/glove.6B.zip'
        # Download to a temporary name and rename on success, so an interrupted
        # transfer never leaves a partial file at the final archive path.
        partial_zip_path = glove_zip_path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_zip_path)
            os.replace(partial_zip_path, glove_zip_path)
        finally:
            # Only still present if the download or the rename failed
            if os.path.exists(partial_zip_path):
                os.remove(partial_zip_path)
    print("Extracting...")
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        zip_ref.extractall(glove_dir)
    print("GloVe download and extraction complete.")
else:
    print("GloVe already downloaded.")

# === Step 2: Load GloVe into dictionary ===
# Each line is split on whitespace: the first field is the word, the
# remaining fields are parsed as the float32 components of its vector.
embedding_index = {}
with open(glove_txt_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embedding_index[fields[0]] = np.array(fields[1:], dtype='float32')

# === Step 3: Create embedding matrix ===
# Row i holds the vector for the token whose id is i in word2idx.
embedding_dim = 100
vocab_size = len(word2idx)

# Seeded generator so the random rows below are the same on every run
oov_rng = np.random.default_rng(42)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word2idx.items():
    if word == '<PAD>':
        # Leave the padding row at zero. '<PAD>' is not a GloVe entry, so it
        # would otherwise receive a random vector, and every padded position
        # would then feed that noise into the model.
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector
    else:
        # Token has no pretrained vector (this includes '<UNK>'): random init
        embedding_matrix[i] = oov_rng.normal(scale=0.6, size=(embedding_dim,))  # for OOV words

# === Step 4: Convert to tensor and create embedding layer ===
# Cast the NumPy matrix to a float32 tensor to use as the layer weights.
embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)

# freeze=False keeps the pretrained weights trainable (fine-tuning);
# padding_idx=0 marks row 0, the '<PAD>' id, as the padding entry.
embedding_layer = nn.Embedding.from_pretrained(
    embedding_tensor,
    freeze=False,
    padding_idx=0,
)

Downloading GloVe embeddings...
Extracting...
GloVe download and extraction complete.
