In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import os
from transformers import DistilBertTokenizerFast
from math import ceil

DATASET = ["meta_Beauty_and_Personal_Care", "meta_Books", "meta_Home_and_Kitchen"]
LABELS = ["personal_care", "book", "home"]
COLUMN_SELECTIONS = ["main_category", "title", "features"]

In [2]:
# Setup backend device
if torch.cuda.is_available():
    # Check for CUDA (traditional GPUs)
    device = torch.device("cuda")
    print("PyTorch is using CUDA.")
elif torch.backends.mps.is_available():
    # Check for MPS (Apple Silicon GPUs)
    device = torch.device("mps")
    print("PyTorch is using MPS.")
else:
    device = torch.device("cpu")
    print("PyTorch is using CPU.")

PyTorch is using CUDA.


In [8]:
# documents = []
# for label in LABELS:
#     # Read the extracted raw data
#     df_temp = pd.read_parquet(f"../../data/raw/{label}.parquet")
#     df_temp.title = df_temp.title.astype("str")
#     df_temp.features = df_temp.features.astype("str")
#     # Convert text to lower case
#     df_temp = df_temp.drop("main_category", axis=1).apply(lambda x: x.str.lower())
#     # Append non-empty string to documents
#     documents += df_temp.title[df_temp.title != ""].to_list()
#     documents += df_temp.features[df_temp.features != ""].to_list()

# print(len(documents))

In [3]:
documents = []
label = LABELS[0]
# Read the extracted raw data
df_temp = pd.read_parquet(f"../../data/raw/{label}.parquet")
df_temp.title = df_temp.title.astype("str")
df_temp.features = df_temp.features.astype("str")
# Convert text to lower case
df_temp = df_temp.drop("main_category", axis=1).apply(lambda x: x.str.lower())
# Append non-empty string to documents
documents += df_temp.title[df_temp.title != ""].to_list()
documents += df_temp.features[df_temp.features != ""].to_list()

print(len(documents))

1725905


In [4]:
# Parameters
embedding_dim = 256
batch_size = 4
epochs = 15
learning_rate = 0.01
context_size = 5

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [6]:
# Get vocabulary
vocab = tokenizer.vocab

In [7]:
# Generate context-target pairs for CBOW
def create_cbow_pairs(documents, context_size, pairs=None, verbose=False):
    # If pairs is empty then initialze an empty array
    if pairs == None:
        pairs = []
    for i in range(len(documents)):
        doc = tokenizer(documents[i], return_tensors="pt")["input_ids"][0]
        if verbose:
            if i % 10000 == 0:
                print("Start", i)
        if len(doc) < context_size:
            continue
        for i in range(context_size, len(doc) - context_size):
            context = doc[i-context_size : i] + doc[i + 1 : i + context_size + 1]
            target = doc[i]
            pairs.append((context, target))
    return pairs


# # This loop prepares training data and reduce memory store in documents
# minibatch = 100000
# original_length = len(documents)
# num_iter = ceil(original_length / minibatch)

# cbow_pairs = []
# print("The total of iteration is", num_iter)
# for i in range(num_iter):
#     print(f"Start {i * minibatch}")
#     cutoff = minibatch if minibatch < len(documents) else len(documents)
#     cbow_pairs += create_cbow_pairs(documents[:cutoff], context_size)
#     # Reduce size of documents
#     documents = documents[cutoff:]
cbow_pairs = create_cbow_pairs(documents[:100], context_size, verbose=True)

Start 0


In [8]:
# Dataset and Dataloader for CBOW
class CBOWDataset(Dataset):
    def __init__(self, cbow_pairs, context_size):
        self.data = cbow_pairs
        self.context_size = context_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        return context, torch.tensor(target, dtype=torch.long)

dataset = CBOWDataset(cbow_pairs, context_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Model Architecture
class WordEmbeddings(nn.Module):
    def __init__(self):
        super().__init__()

        self.embeddings = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embedding_dim)
        self.linear_1 = nn.Linear(in_features=embedding_dim, out_features=len(vocab))

    def forward(self, x):
        x = self.embeddings(x)
        x = x.mean(axis=1)
        x = self.linear_1(x)
        return x

# Training Setup
model = WordEmbeddings()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.95)
loss_fn = nn.CrossEntropyLoss()

In [10]:
# Training Loop
for epoch in range(epochs):
    train_loss = 0
    model.train()
    for feature, label in dataloader:
        model = model.to(device)
        feature = feature.to(device)
        label = label.to(device)

        y_train_pred = model(feature)

        loss = loss_fn(y_train_pred, label)
        train_loss = train_loss + loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    print(f"Epoch:{epoch} | Training Loss : {train_loss}")

  return context, torch.tensor(target, dtype=torch.long)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
embedding_matrix = model.embeddings.state_dict()['weight']

In [None]:
embedding_weights = model.embeddings.weight.data.cpu().numpy()
# Save the embedding weights to a .npy file
np.save('embedding_weights.npy', embedding_weights)

In [None]:
# # Dataset and Dataloader
# class TextDataset(Dataset):
#     def __init__(self, tokenized_docs, word_to_idx, max_len=None):
#         self.data = [[word_to_idx.get(word, word_to_idx["<UNK>"]) for word in doc] for doc in tokenized_docs]
#         if max_len:
#             self.max_len = max_len
#         else:
#             self.max_len = max(len(doc) for doc in self.data)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         doc = self.data[idx]
#         if len(doc) > self.max_len:
#             padded_doc = doc[:self.max_len]
#         else:
#             padded_doc = doc + [word_to_idx["<PAD>"]] * (self.max_len - len(doc))
#         return torch.tensor(padded_doc, dtype=torch.long)

# dataset = TextDataset(documents, word_to_idx)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)