In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import os
from transformers import DistilBertTokenizerFast

# Identifiers of the source datasets (presumably aligned index-by-index with LABELS -- TODO confirm).
DATASET = [
    "meta_Beauty_and_Personal_Care",
    "meta_Books",
    "meta_Home_and_Kitchen",
]
# Short category names; the loading cell uses them as the parquet file stems.
LABELS = [
    "personal_care",
    "book",
    "home",
]
# Column names of interest in the raw tables (not referenced elsewhere in the visible cells).
COLUMN_SELECTIONS = [
    "main_category",
    "title",
    "features",
]

In [13]:
# Build the training corpus: every non-empty, lower-cased title and feature
# string from each category's raw parquet file.
documents = []
for label in LABELS:
    # Read the extracted raw data
    df_raw = pd.read_parquet(f"../../data/raw/{label}.parquet")
    # Only these two text columns feed the corpus, so normalise just them
    # rather than calling .str.lower() on every remaining column (which
    # raises as soon as the table contains a non-string column).
    for column in ("title", "features"):
        # Cast to str first so every cell can be lower-cased.
        # NOTE(review): the cast presumably turns missing values into literal
        # strings (e.g. "nan"/"None") that the != "" filter keeps -- confirm
        # whether that is intended.
        text = df_raw[column].astype("str").str.lower()
        # Append non-empty strings to documents
        documents.extend(text[text != ""].to_list())

print(len(documents))

5400431


In [14]:
# Choose the compute backend in priority order: CUDA, then MPS
# (Apple Silicon GPUs), and finally plain CPU.
if torch.cuda.is_available():
    backend_name, backend_message = "cuda", "PyTorch is using CUDA."
elif torch.backends.mps.is_available():
    backend_name, backend_message = "mps", "PyTorch is using MPS."
else:
    backend_name, backend_message = "cpu", "PyTorch is using CPU."

device = torch.device(backend_name)
print(backend_message)

PyTorch is using MPS.


In [15]:
# Hyper-parameters
# -- model
embedding_dim = 256   # width of each learned token vector
context_size = 5      # number of tokens taken on each side of the target token
# -- optimisation
learning_rate = 0.01  # step size intended for the optimizer
epochs = 10           # number of passes over the training pairs
batch_size = 4        # (context, target) pairs per mini-batch

In [6]:
# Tokenizer for the pretrained 'distilbert-base-uncased' checkpoint. This notebook uses it
# to split documents into tokens, to map tokens to ids, and as the source of the vocabulary.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [18]:
# Tokenize each document in turn, printing a progress line whenever the
# zero-based document index is a multiple of 50,000.
tokenized_docs = []
for i, text in enumerate(documents):
    tokens = tokenizer.tokenize(text)
    tokenized_docs.append(tokens)
    at_checkpoint = i % 50000 == 0
    if at_checkpoint:
        print(f"Finish {i} docs")

# The tokenizer's vocabulary object; later cells use its length as the vocabulary size.
vocab = tokenizer.vocab

Finish 0 docs
Finish 50000 docs
Finish 100000 docs
Finish 150000 docs
Finish 200000 docs
Finish 250000 docs
Finish 300000 docs
Finish 350000 docs
Finish 400000 docs
Finish 450000 docs
Finish 500000 docs
Finish 550000 docs
Finish 600000 docs
Finish 650000 docs
Finish 700000 docs
Finish 750000 docs
Finish 800000 docs
Finish 850000 docs
Finish 900000 docs
Finish 950000 docs


Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


Finish 1000000 docs
Finish 1050000 docs
Finish 1100000 docs
Finish 1150000 docs
Finish 1200000 docs
Finish 1250000 docs
Finish 1300000 docs
Finish 1350000 docs
Finish 1400000 docs
Finish 1450000 docs
Finish 1500000 docs
Finish 1550000 docs
Finish 1600000 docs
Finish 1650000 docs
Finish 1700000 docs
Finish 1750000 docs
Finish 1800000 docs
Finish 1850000 docs
Finish 1900000 docs
Finish 1950000 docs
Finish 2000000 docs
Finish 2050000 docs
Finish 2100000 docs
Finish 2150000 docs
Finish 2200000 docs
Finish 2250000 docs
Finish 2300000 docs
Finish 2350000 docs
Finish 2400000 docs
Finish 2450000 docs
Finish 2500000 docs
Finish 2550000 docs
Finish 2600000 docs
Finish 2650000 docs
Finish 2700000 docs
Finish 2750000 docs
Finish 2800000 docs
Finish 2850000 docs
Finish 2900000 docs
Finish 2950000 docs
Finish 3000000 docs
Finish 3050000 docs
Finish 3100000 docs
Finish 3150000 docs
Finish 3200000 docs
Finish 3250000 docs
Finish 3300000 docs
Finish 3350000 docs
Finish 3400000 docs
Finish 3450000 docs


In [None]:
# Generate context-target pairs for CBOW
def create_cbow_pairs(tokenized_docs, context_size):
    """Build (context, target) training pairs for a CBOW model.

    A position in a document becomes a target only when it has
    ``context_size`` tokens on both sides, so documents too short to
    contain such a position contribute nothing.

    Args:
        tokenized_docs: Iterable of token sequences (one per document).
        context_size: Number of tokens taken on each side of the target.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is the
        ``context_size`` tokens before the target concatenated with the
        ``context_size`` tokens after it, and ``target`` is the centre token.
    """
    return [
        (
            tokens[center - context_size:center]
            + tokens[center + 1:center + context_size + 1],
            tokens[center],
        )
        for tokens in tokenized_docs
        if len(tokens) >= context_size
        for center in range(context_size, len(tokens) - context_size)
    ]

# Build every (context, target) pair from the tokenized corpus; the result is a single in-memory list.
cbow_pairs = create_cbow_pairs(tokenized_docs, context_size)

# Dataset and Dataloader for CBOW
class CBOWDataset(Dataset):
    """Dataset of CBOW training examples encoded as token ids.

    All pairs are converted to ids once, at construction time. Each item
    is a ``(context_ids, target_id)`` pair of ``torch.long`` tensors.
    """

    def __init__(self, cbow_pairs, tokenizer, context_size):
        """Encode every (context tokens, target token) pair up front.

        Args:
            cbow_pairs: Iterable of ``(context, target)`` token pairs.
            tokenizer: Object exposing ``convert_tokens_to_ids``.
            context_size: Stored on the instance; not otherwise used here.
        """
        to_ids = tokenizer.convert_tokens_to_ids
        encoded = []
        for context_tokens, target_token in cbow_pairs:
            context_ids = torch.tensor(to_ids(context_tokens), dtype=torch.long)
            # The target stays a plain id here; it is wrapped in a tensor on access.
            encoded.append((context_ids, to_ids(target_token)))
        self.data = encoded
        self.context_size = context_size

    def __len__(self):
        """Return the number of encoded pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(context_ids, target_id)`` for the pair at ``idx``."""
        context_ids, target_id = self.data[idx]
        target_tensor = torch.tensor(target_id, dtype=torch.long)
        return context_ids, target_tensor

# Encode all pairs as id tensors once (done in CBOWDataset.__init__), then
# serve them in shuffled mini-batches of `batch_size` examples.
dataset = CBOWDataset(cbow_pairs, tokenizer, context_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [24]:
# Model Architecture
class WordEmbeddings(nn.Module):
    """CBOW-style model: average the context embeddings, then score every vocabulary entry.

    Layers:
        embeddings: ``nn.Embedding`` lookup table (the weights exported later).
        linear_1: ``nn.Linear`` projecting the averaged embedding to one
            logit per vocabulary entry.
    """

    def __init__(self, num_embeddings=None, embedding_size=None):
        """Create the embedding table and the output projection.

        Args:
            num_embeddings: Vocabulary size. Defaults to ``len(vocab)``
                (the notebook-level vocabulary) when omitted.
            embedding_size: Width of each embedding vector. Defaults to the
                notebook-level ``embedding_dim`` when omitted.
        """
        super().__init__()

        # Fall back to the notebook globals so that ``WordEmbeddings()``
        # keeps working exactly as before; explicit arguments make the
        # class usable without that hidden state.
        if num_embeddings is None:
            num_embeddings = len(vocab)
        if embedding_size is None:
            embedding_size = embedding_dim

        self.embeddings = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_size)
        self.linear_1 = nn.Linear(in_features=embedding_size, out_features=num_embeddings)

    def forward(self, x):
        """Return vocabulary logits for a batch of context token ids.

        Args:
            x: Integer tensor of token ids; presumably shaped
                (batch, number of context tokens) as produced by the
                notebook's DataLoader -- TODO confirm.

        Returns:
            Tensor of logits with one score per vocabulary entry for each
            batch element.
        """
        x = self.embeddings(x)
        # Average over dimension 1 (the context tokens) to get one vector per example.
        x = x.mean(dim=1)
        x = self.linear_1(x)
        return x

# Training Setup
# Move the model to the selected device before the optimizer is built, so the
# optimizer is constructed over parameters that already live on that device.
model = WordEmbeddings().to(device)
# Use the `learning_rate` hyper-parameter instead of repeating its value as a literal.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.95)
# Cross-entropy between the model's per-vocabulary logits and the target token id.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Training Loop
# Moving the model is loop-invariant, so do it once instead of once per batch.
model = model.to(device)

for epoch in range(epochs):
    # Running sum of per-batch losses, kept as a plain Python float.
    train_loss = 0.0
    model.train()
    for feature, label in dataloader:
        feature = feature.to(device)
        label = label.to(device)

        y_train_pred = model(feature)

        loss = loss_fn(y_train_pred, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() extracts the scalar value; adding the loss tensor itself
        # would keep autograd history attached to the running total.
        train_loss += loss.item()

    # Mean loss per batch for this epoch.
    train_loss = train_loss / len(dataloader)
    print(f"Epoch:{epoch} | Training Loss : {train_loss}")

In [None]:
# Weight tensor of the model's embedding layer, read from that layer's state dict.
embedding_matrix = model.embeddings.state_dict()['weight']

In [None]:
# Copy the embedding table to host memory as a NumPy array.
# .detach() is the supported way to get a tensor without autograd tracking;
# it replaces the legacy .data attribute.
embedding_weights = model.embeddings.weight.detach().cpu().numpy()
# Save the embedding weights to a .npy file
np.save('embedding_weights.npy', embedding_weights)

In [None]:
# # Dataset and Dataloader
# class TextDataset(Dataset):
#     def __init__(self, tokenized_docs, word_to_idx, max_len=None):
#         self.data = [[word_to_idx.get(word, word_to_idx["<UNK>"]) for word in doc] for doc in tokenized_docs]
#         if max_len:
#             self.max_len = max_len
#         else:
#             self.max_len = max(len(doc) for doc in self.data)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         doc = self.data[idx]
#         if len(doc) > self.max_len:
#             padded_doc = doc[:self.max_len]
#         else:
#             padded_doc = doc + [word_to_idx["<PAD>"]] * (self.max_len - len(doc))
#         return torch.tensor(padded_doc, dtype=torch.long)

# dataset = TextDataset(documents, word_to_idx)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)