In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import os
from transformers import DistilBertTokenizerFast

# Names of the raw metadata dumps, one per product category.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
DATASET = [
    "meta_Beauty_and_Personal_Care",
    "meta_Books",
    "meta_Home_and_Kitchen",
]

# Short category labels; each one is also the stem of a parquet file under data/raw.
LABELS = [
    "personal_care",
    "book",
    "home",
]

# Column names of interest in the raw frames.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
COLUMN_SELECTIONS = [
    "main_category",
    "title",
    "features",
]

In [2]:
# Gather every non-empty, lower-cased title and feature string from all categories
# into one flat list of documents.
documents = []
for label in LABELS:
    # Load the extracted raw data for this category
    frame = pd.read_parquet(f"../../data/raw/{label}.parquet")

    # Coerce both text columns to str so the .str accessor below works on them.
    # NOTE(review): astype("str") also stringifies missing values, which the
    # != "" filter will not catch — confirm the raw data contains none.
    for column in ("title", "features"):
        frame[column] = frame[column].astype("str")

    # Lower-case every remaining column once the category column is dropped
    lowered = frame.drop("main_category", axis=1).apply(lambda col: col.str.lower())

    # Keep the non-empty strings: titles first, then features
    for column in ("title", "features"):
        texts = lowered[column]
        documents.extend(texts[texts != ""].to_list())

print(len(documents))

5400431


In [3]:
# Setup backend device
# Pick the best available backend: CUDA first, then Apple-Silicon MPS, else CPU.
if torch.cuda.is_available():
    backend = "cuda"
elif torch.backends.mps.is_available():
    backend = "mps"
else:
    backend = "cpu"

# Build the device object once and report which backend was selected.
device = torch.device(backend)
print(
    {
        "cuda": "PyTorch is using CUDA.",
        "mps": "PyTorch is using MPS.",
        "cpu": "PyTorch is using CPU.",
    }[backend]
)

PyTorch is using MPS.


In [4]:
# Parameters
# --- Data / batching ---
# Number of tokens taken on each side of the target word when building CBOW pairs
context_size = 5
# Number of (context, target) samples per DataLoader batch
batch_size = 4

# --- Model ---
# Width of each learned token embedding vector
embedding_dim = 256

# --- Optimisation ---
# Number of full passes over the training pairs
epochs = 10
# Step size intended for the optimizer
learning_rate = 0.01

In [5]:
# Load the pretrained uncased DistilBERT tokenizer; the cells below use it to
# split documents into tokens and to map tokens to integer ids.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [6]:
# Tokenize only the first 1000 documents (the slice bounds the work done here).
tokenized_docs = []
for doc_idx, text in enumerate(documents[:1000]):
    tokenized_docs.append(tokenizer.tokenize(text))
    # Progress message on every 50000th document (index 0 included)
    if not doc_idx % 50000:
        print(f"Finish {doc_idx} docs")

# Vocabulary of the pretrained tokenizer; len(vocab) sizes the model further down
vocab = tokenizer.vocab

Finish 0 docs


In [7]:
# Generate context-target pairs for CBOW
def create_cbow_pairs(tokenized_docs, context_size):
    """Build (context, target) training pairs for a CBOW model.

    Args:
        tokenized_docs: Iterable of token sequences (one sequence per document).
        context_size: Number of tokens taken on each side of the target.

    Returns:
        A list of ``(context, target)`` tuples. ``context`` is the
        ``context_size`` tokens before the target concatenated with the
        ``context_size`` tokens after it; ``target`` is the centre token.
        Only positions with a full window on both sides are emitted, so
        documents that are too short contribute no pairs.
    """
    pairs = []
    for tokens in tokenized_docs:
        n_tokens = len(tokens)
        if n_tokens >= context_size:
            # Slide over every centre position that has a complete window on both sides.
            pairs.extend(
                (
                    tokens[center - context_size : center]
                    + tokens[center + 1 : center + context_size + 1],
                    tokens[center],
                )
                for center in range(context_size, n_tokens - context_size)
            )
    return pairs

# Build the (context, target) training pairs from the tokenized documents,
# taking context_size tokens on each side of every target.
cbow_pairs = create_cbow_pairs(tokenized_docs, context_size)

# Dataset and Dataloader for CBOW
class CBOWDataset(Dataset):
    """Map-style dataset serving CBOW samples as token-id tensors.

    Args:
        cbow_pairs: Iterable of ``(context_tokens, target_token)`` pairs.
        tokenizer: Object exposing ``convert_tokens_to_ids``; it is called with
            the context token list and with the single target token.
        context_size: Window size used to build the pairs; stored on the
            instance but not otherwise used by this class.
    """

    def __init__(self, cbow_pairs, tokenizer, context_size):
        to_ids = tokenizer.convert_tokens_to_ids

        # Convert every pair up front: the context becomes a long tensor, while
        # the target is kept as the raw id and wrapped on access.
        samples = []
        for context_tokens, target_token in cbow_pairs:
            context_ids = torch.tensor(to_ids(context_tokens), dtype=torch.long)
            samples.append((context_ids, to_ids(target_token)))

        self.data = samples
        self.context_size = context_size

    def __len__(self):
        """Return the number of stored (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(context_ids, target_id)`` for sample ``idx`` as long tensors."""
        context_ids, target_id = self.data[idx]
        return context_ids, torch.tensor(target_id, dtype=torch.long)

# Wrap the CBOW pairs in a dataset and serve them in shuffled mini-batches.
dataset = CBOWDataset(
    cbow_pairs=cbow_pairs,
    tokenizer=tokenizer,
    context_size=context_size,
)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [8]:
# Model Architecture
class WordEmbeddings(nn.Module):
    """CBOW model: average the context embeddings and project onto the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of output
            logits. Defaults to ``len(vocab)`` from the notebook namespace.
        embed_dim: Width of each embedding vector. Defaults to the notebook's
            ``embedding_dim`` hyperparameter.

    Calling ``WordEmbeddings()`` with no arguments behaves as before; passing the
    sizes explicitly removes the dependency on notebook globals.
    """

    def __init__(self, vocab_size=None, embed_dim=None):
        super().__init__()

        # Fall back to the notebook-level settings only when no size is given.
        if vocab_size is None:
            vocab_size = len(vocab)
        if embed_dim is None:
            embed_dim = embedding_dim

        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear_1 = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for a batch of context token ids.

        Args:
            x: Integer tensor of token ids; dimension 1 is averaged over after
                the embedding lookup, so it is expected to index the context
                positions (batch, context).

        Returns:
            Tensor of unnormalised scores with one column per vocabulary entry.
        """
        x = self.embeddings(x)
        # Collapse the context positions into a single averaged vector per sample
        x = x.mean(dim=1)
        x = self.linear_1(x)
        return x

# Training Setup
# Move the model to the selected device before constructing the optimizer, so the
# optimizer is built over the parameters that will actually be trained.
model = WordEmbeddings().to(device)
# Use the learning_rate hyperparameter from the parameters cell instead of
# repeating the literal, so changing it there takes effect here.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.95)
# Cross-entropy between the predicted vocabulary logits and the target token id
loss_fn = nn.CrossEntropyLoss()

In [10]:
# Training Loop
# Placing the model on the device is loop-invariant, so do it once up front
# rather than on every batch.
model = model.to(device)

for epoch in range(epochs):
    train_loss = 0.0
    model.train()
    for feature, label in dataloader:
        feature = feature.to(device)
        label = label.to(device)

        # Forward pass: vocabulary logits for each context window
        y_train_pred = model(feature)
        loss = loss_fn(y_train_pred, label)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float. Summing the loss tensor itself would
        # accumulate autograd history across the whole epoch and grow memory.
        train_loss += loss.item()

    # Mean loss per batch for this epoch
    train_loss = train_loss / len(dataloader)
    print(f"Epoch:{epoch} | Training Loss : {train_loss}")

Epoch:0 | Training Loss : 1.3389217853546143
Epoch:1 | Training Loss : 1.1921583414077759
Epoch:2 | Training Loss : 1.0680516958236694
Epoch:3 | Training Loss : 0.9571227431297302
Epoch:4 | Training Loss : 0.8656147718429565
Epoch:5 | Training Loss : 0.78424471616745
Epoch:6 | Training Loss : 0.7172321677207947
Epoch:7 | Training Loss : 0.6511171460151672
Epoch:8 | Training Loss : 0.5987640023231506
Epoch:9 | Training Loss : 0.5466225147247314


In [None]:
# Pull the embedding layer's weight tensor out of its state dict
# (one row per token id, one column per embedding dimension).
embedding_matrix = model.embeddings.state_dict()['weight']

In [11]:
# Export the learned embedding table to disk as a NumPy array.
# detach() drops autograd tracking and cpu() makes sure the tensor is in host
# memory, which numpy() requires.
embedding_weights = model.embeddings.weight.detach().cpu().numpy()
np.save('embedding_weights.npy', embedding_weights)

In [None]:
# # Dataset and Dataloader
# class TextDataset(Dataset):
#     def __init__(self, tokenized_docs, word_to_idx, max_len=None):
#         self.data = [[word_to_idx.get(word, word_to_idx["<UNK>"]) for word in doc] for doc in tokenized_docs]
#         if max_len:
#             self.max_len = max_len
#         else:
#             self.max_len = max(len(doc) for doc in self.data)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         doc = self.data[idx]
#         if len(doc) > self.max_len:
#             padded_doc = doc[:self.max_len]
#         else:
#             padded_doc = doc + [word_to_idx["<PAD>"]] * (self.max_len - len(doc))
#         return torch.tensor(padded_doc, dtype=torch.long)

# dataset = TextDataset(documents, word_to_idx)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)