In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import os

DATASET = ["meta_Beauty_and_Personal_Care", "meta_Books", "meta_Home_and_Kitchen"]
LABELS = ["personal_care", "book", "home"]
COLUMN_SELECTIONS = ["main_category", "title", "features"]

In [19]:
documents = []
for label in LABELS:
    # Read the extracted raw data
    df_temp = pd.read_parquet(f"../../data/raw/{label}.parquet")
    df_temp.title = df_temp.title.astype("str")
    df_temp.features = df_temp.features.astype("str")
    # Convert text to lower case
    df_temp = df_temp.drop("main_category", axis=1).apply(lambda x: x.str.lower())
    # Append non-empty string to documents
    documents.extend(df_temp.title[df_temp.title != ""].to_list())
    documents.extend(df_temp.features[df_temp.features != ""].to_list())

In [20]:
def tokenize_string(text):
    # Punctuations are replaced by .
    text = re.sub(r"[,!?;]", ".", text)
    # Tokenize string to words
    text = nltk.word_tokenize(text)
    # Drop non-alphabetical tokens
    text = [t for t in text if t.isalpha() or t == "."]
    return text

In [21]:
# Preprocess and tokenize documents
tokenized_docs = [tokenize_string(doc) for doc in documents[:100]] ### Reduce to test code
# Count frequency of each token
word_count = Counter(token for doc in tokenized_docs for token in doc)
# Select the most common token
vocab = word_count.most_common(32000)
vocab = {word: i for i, (word, _) in enumerate(vocab)}

In [24]:
word_to_idx = {word: idx for idx, (word, _) in enumerate(vocab.items(), start=2)}
word_to_idx["<PAD>"] = 0
word_to_idx["<UNK>"] = 1
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Setup backend device
if torch.cuda.is_available():
    # Check for CUDA (traditional GPUs)
    device = torch.device("cuda")
    print("PyTorch is using CUDA.")
elif torch.backends.mps.is_available():
    # Check for MPS (Apple Silicon GPUs)
    device = torch.device("mps")
    print("PyTorch is using MPS.")
else:
    device = torch.device("cpu")
    print("PyTorch is using CPU.")

PyTorch is using MPS.


In [25]:
# Parameters
embedding_dim = 50
batch_size = 4
epochs = 10
learning_rate = 0.01
context_size = 2

In [26]:
# Generate context-target pairs for CBOW
def create_cbow_pairs(tokenized_docs, context_size):
    pairs = []
    for doc in tokenized_docs:
        if len(doc) < context_size:
            continue
        for i in range(context_size, len(doc) - context_size):
            context = doc[i-context_size : i] + doc[i + 1 : i + context_size + 1]
            target = doc[i]
            pairs.append((context, target))
    return pairs

cbow_pairs = create_cbow_pairs(tokenized_docs, context_size)

# Dataset and Dataloader for CBOW
class CBOWDataset(Dataset):
    def __init__(self, cbow_pairs, word_to_idx, context_size):
        self.data = [(torch.tensor([word_to_idx.get(word, word_to_idx["<UNK>"]) for word in context], dtype=torch.long), 
                      word_to_idx.get(target, word_to_idx["<UNK>"])) for context, target in cbow_pairs]
        self.context_size = context_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        return context, torch.tensor(target, dtype=torch.long)

dataset = CBOWDataset(cbow_pairs, word_to_idx, context_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [27]:
# Model Architecture
class WordEmbeddings(nn.Module):
    def __init__(self):
        super().__init__()

        self.embeddings = nn.Embedding(num_embeddings=len(vocab), embedding_dim=300)
        self.linear_1 = nn.Linear(in_features=300, out_features=len(vocab))

    def forward(self, x):
        x = self.embeddings(x)
        x = x.mean(axis=1)
        x = self.linear_1(x)
        return x

# Training Setup
model = WordEmbeddings()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.95)
loss_fn = nn.CrossEntropyLoss()

# Training Loop
for epoch in range(11):
    train_loss = 0
    model.train()
    for feature, label in dataloader:
        model = model.to(device)
        feature = feature.to(device)
        label = label.to(device)

        y_train_pred = model(feature)

        loss = loss_fn(y_train_pred, label)
        train_loss = train_loss + loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    print(f"Epoch:{epoch} | Training Loss : {train_loss}")


Epoch:0 | Training Loss : 6.220404624938965
Epoch:1 | Training Loss : 3.1281750202178955
Epoch:2 | Training Loss : 1.4023734331130981
Epoch:3 | Training Loss : 0.6749862432479858
Epoch:4 | Training Loss : 0.3959619700908661
Epoch:5 | Training Loss : 0.2872588038444519
Epoch:6 | Training Loss : 0.2239122837781906
Epoch:7 | Training Loss : 0.1773415356874466
Epoch:8 | Training Loss : 0.15400497615337372
Epoch:9 | Training Loss : 0.13397695124149323
Epoch:10 | Training Loss : 0.11774098128080368


In [28]:
embedding_matrix = model.embeddings.state_dict()['weight']

In [33]:
embedding_matrix.shape

torch.Size([782, 300])

In [34]:
vocab

{'.': 0,
 'hair': 1,
 'for': 2,
 'with': 3,
 'women': 4,
 'and': 5,
 'human': 6,
 'oz': 7,
 'nail': 8,
 'black': 9,
 'makeup': 10,
 'face': 11,
 'body': 12,
 'wig': 13,
 'extensions': 14,
 'color': 15,
 'wigs': 16,
 'inch': 17,
 'in': 18,
 'nails': 19,
 'natural': 20,
 'head': 21,
 'professional': 22,
 'soap': 23,
 'girls': 24,
 'of': 25,
 'liquid': 26,
 'eye': 27,
 'set': 28,
 'brush': 29,
 'dry': 30,
 'curly': 31,
 'cover': 32,
 'by': 33,
 'ombre': 34,
 'art': 35,
 'long': 36,
 'travel': 37,
 'clip': 38,
 'colors': 39,
 'kids': 40,
 'comb': 41,
 'brazilian': 42,
 'straight': 43,
 'size': 44,
 'cream': 45,
 'matte': 46,
 'gel': 47,
 'bundles': 48,
 'lipstick': 49,
 'ounce': 50,
 'bag': 51,
 'lip': 52,
 'pack': 53,
 'eyeliner': 54,
 'to': 55,
 'crystal': 56,
 'skin': 57,
 'fl': 58,
 'on': 59,
 'party': 60,
 'lace': 61,
 'baby': 62,
 'wash': 63,
 'brushes': 64,
 'clippers': 65,
 'sunscreen': 66,
 'brown': 67,
 'powder': 68,
 'multicolor': 69,
 'eyebrow': 70,
 'butterfly': 71,
 'acrylic'

In [None]:
# # Dataset and Dataloader
# class TextDataset(Dataset):
#     def __init__(self, tokenized_docs, word_to_idx, max_len=None):
#         self.data = [[word_to_idx.get(word, word_to_idx["<UNK>"]) for word in doc] for doc in tokenized_docs]
#         if max_len:
#             self.max_len = max_len
#         else:
#             self.max_len = max(len(doc) for doc in self.data)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         doc = self.data[idx]
#         if len(doc) > self.max_len:
#             padded_doc = doc[:self.max_len]
#         else:
#             padded_doc = doc + [word_to_idx["<PAD>"]] * (self.max_len - len(doc))
#         return torch.tensor(padded_doc, dtype=torch.long)

# dataset = TextDataset(documents, word_to_idx)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)