### Download the embeddings from wandb

In [None]:
import wandb
run = wandb.init()
# Use the embeddings artifact
embeddings_artifact = run.use_artifact('bryars-bryars/cbow-wiki/embeddings:v4', type='embeddings')
embeddings_dir = embeddings_artifact.download(root='temp_data')

# Use the model weights artifact
model_artifact = run.use_artifact('bryars-bryars/cbow-wiki/model-weights:v6', type='model')
model_dir = model_artifact.download(root='temp_data')

# Now you have both directories
print(f"Embeddings downloaded to: {embeddings_dir}")
print(f"Model weights downloaded to: {model_dir}")

### Create a Dataset and Data Loader class

In [1]:
# Check that the GPU is being used
import torch

if torch.cuda.is_available():
   print(f"GPU: {torch.cuda.get_device_name(0)}")
   print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
   # Enable cuDNN auto-tuner
   torch.backends.cudnn.benchmark = True

else: 
   print("No GPU available")

GPU: NVIDIA RTX A4000
GPU Memory: 16.78 GB


In [2]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm
import numpy as np
import wandb

# First set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
## Create a class for the dataset
# The dataset is a csv file with two columns: title and score
# Drop NA values
# Tokenize the titles and convert them to embeddings using the CBOW model embeddings
# return titles and scores
class HackerNewsDataset(Dataset):
    def __init__(self, csv_file, word_to_idx, embedding_file):
        df = pd.read_csv(csv_file).dropna(subset=["title", "score"])
        self.titles = df["title"].tolist()
        self.scores = torch.tensor(df["score"].values, dtype=torch.float32)
        
        # load word2idx and embeddings
        self.word_to_idx = torch.load(word_to_idx)
        print("word to id type is :" ,type(self.word_to_idx))

        # The embeddings are saved as a state dict, so we need to load them as a state dict
        state_dict = torch.load(embedding_file)
        # To get the actual embeddings tensor
        if isinstance(state_dict, dict):
            self.embeddings = state_dict['weight'] if 'weight' in state_dict else next(iter(state_dict.values()))
        else:
            self.embeddings = state_dict
        print(f"Embeddings shape: {self.embeddings.shape}")
    
    def _preprocess_title(self, title):
        tokens = title.lower().split()
        word_ids = [self.word_to_idx[word] for word in tokens if word in self.word_to_idx]
        return word_ids

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        title = self.titles[idx]
        score = self.scores[idx]
    
        word_ids = self._preprocess_title(title)

        if len(word_ids) == 0:
            title_embedding = torch.zeros(self.embeddings.shape[1])
        else:
            word_embeddings = self.embeddings[word_ids]  # shape: (num_words, embedding_dim)
            title_embedding = word_embeddings.mean(dim=0)
    
        #return title, score, word_ids, word_embeddings, title_embedding
        return title_embedding, score


In [4]:
## Load the data, word2idx and embeddings
data_url = "https://huggingface.co/datasets/danbhf/hackernews_title_training/resolve/main/hn_title_training_notnorm_2008_2024.csv"
embeddings_path = "./temp_data/embeddings_epoch5_2025_04_17__15_28_43.pt"
word2idx_path = "./temp_data/word_to_id.pt"


In [5]:
dataset = HackerNewsDataset(data_url, word2idx_path, embeddings_path)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.8, 0.2])

# Create data loaders with GPU pinning
train_loader = DataLoader(
   train_dataset,
   batch_size=512,
   shuffle=True,
   pin_memory=True,  # Enable pinning for faster GPU transfer
   num_workers=4     # Use multiple workers for data loading
)

test_loader = DataLoader(
   test_dataset,
   batch_size=512,
   shuffle=False,
   pin_memory=True,
   num_workers=4
)


  self.word_to_idx = torch.load(word_to_idx)
  state_dict = torch.load(embedding_file)


word to id type is : <class 'dict'>
Embeddings shape: torch.Size([30000, 200])


In [None]:
# to use this, add the word_ids, word_embeddings, title_embedding to the dataset return
# Sanity check a sample
title, score, word_ids, word_embeddings, title_embedding = dataset[0]
print(f"Title: {title}")
print(f"Score: {score}")
print(f"Word IDs: {word_ids}")
print(f"Word Embeddings: {word_embeddings}")
print(f"Title Embedding: {title_embedding}")
print("Title embedding length is :", len(title_embedding))


## Create a MLP regressor

In [6]:
## Create a MLP regressor
class Regressor(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = torch.nn.Sequential(
            torch.nn.Linear(in_features=200, out_features=128),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=128, out_features=64),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=64, out_features=32),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=32, out_features=16),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=16, out_features=1),
        )

    def forward(self, inpt):
        out = self.seq(inpt)
        return out

In [8]:
# Setup regressor
mReg = Regressor().to(device)
optimizer = torch.optim.Adam(mReg.parameters(), lr=0.005)

# Training loop
for epoch in range(5):
    # Training phase
    mReg.train()
    train_losses = []
    train_bar = tqdm(train_loader, desc=f'Epoch {epoch} Training')
    
    for title_embedding, score in train_bar:
        # Move data to device
        title_embedding = title_embedding.to(device)
        score = score.to(device)
        
        # Forward pass
        out = mReg(title_embedding)
        loss = torch.nn.functional.l1_loss(out.squeeze(), score)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Store loss
        train_losses.append(loss.item())
        train_bar.set_postfix({'train_loss': np.mean(train_losses[-100:])})
    
    # Calculate average train loss for the epoch
    avg_train_loss = np.mean(train_losses)
    
    # Testing phase
    mReg.eval()
    test_losses = []
    test_bar = tqdm(test_loader, desc=f'Epoch {epoch} Testing')
    
    with torch.no_grad():
        for title_embedding, score in test_bar:
            # Move data to device
            title_embedding = title_embedding.to(device)
            score = score.to(device)
            
            # Forward pass
            out = mReg(title_embedding)
            loss = torch.nn.functional.l1_loss(out.squeeze(), score)
            
            # Store loss
            test_losses.append(loss.item())
            test_bar.set_postfix({'test_loss': np.mean(test_losses[-100:])})
    
    # Calculate average test loss for the epoch
    avg_test_loss = np.mean(test_losses)
    
    # Log to wandb
    wandb.log({
        'epoch': epoch,
        'train_loss': avg_train_loss,
        'test_loss': avg_test_loss
    })
    
    # Print epoch summary
    print(f'\nEpoch {epoch}: Train Loss = {avg_train_loss:.4f}, Test Loss = {avg_test_loss:.4f}\n')

Epoch 0 Training:   0%|          | 0/7408 [00:00<?, ?it/s]

Epoch 0 Training:   7%|▋         | 553/7408 [00:10<02:04, 55.18it/s, train_loss=13.1]


KeyboardInterrupt: 