In [68]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

### Load data and build the context window

In [69]:
CONTEXT_SIZE=3

In [70]:
def contexted_data(words):
    data = []

    for i in range(CONTEXT_SIZE, len(words) - CONTEXT_SIZE):
        context = (
            [words[i - j - 1] for j in range(CONTEXT_SIZE)]
            + [words[i + j + 1] for j in range(CONTEXT_SIZE)]
        )
        target = words[i]
        data.append((context, target))
    return data



In [71]:
data_set_path="kaggle_data/initial_data_set.csv"

In [72]:
data_set=pd.read_csv(data_set_path)

In [73]:
data_set_list=data_set.apply(lambda row: row.tolist(), axis=1).tolist()

In [74]:
vocab=np.unique(data_set.values).tolist()
vocab_size = len(vocab)
vocab_size
word_to_ix = {word: i for i, word in enumerate(vocab)}
embedding_size=30

vocab_size

119

In [75]:
built_word_data=[]
for customer in data_set_list:
    built_word_data=built_word_data+contexted_data(customer)

In [76]:
built_word_data[:3]

[(['Trousers', 'Sweater', 'Trousers', 'Trousers', 'Trousers', 'Top'],
  'Sweater'),
 (['Sweater', 'Trousers', 'Sweater', 'Trousers', 'Top', 'Trousers'],
  'Trousers'),
 (['Trousers', 'Sweater', 'Trousers', 'Top', 'Trousers', 'Sweater'],
  'Trousers')]

### Split data

In [77]:
train_data, int_data = train_test_split(built_word_data, test_size=0.3, random_state=42)
val_data, test_data  = train_test_split(built_word_data, test_size=0.5, random_state=42)

### Model Buildling

In [78]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(embedding_size, vocab_size)
        

    def forward(self, x):
        embeds = self.embedding(x)
        sum_embeds = torch.sum(embeds, dim=(1))  # sum over embedding dimension and context window size
        out = self.fc1(sum_embeds)
        log_probs = nn.functional.log_softmax(out, dim=1)
        return out



In [None]:
embedding_size = 20
model = CBOW(vocab_size, embedding_size)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [80]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

In [81]:
for i in range(3):
    inputs, target = val_data[i]
    print(make_context_vector(inputs, word_to_ix))


tensor([48, 49, 26, 90, 90, 43])
tensor([106, 106, 106,  90,  90, 101])
tensor([98, 98,  9, 95, 95, 95])


In [82]:
class CBOWDataset(Dataset):
    def __init__(self, data, word_to_ix, CONTEXT_SIZE):
        self.data = data
        self.word_to_ix = word_to_ix
        self.CONTEXT_SIZE = CONTEXT_SIZE

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, target = self.data[idx]
        context = make_context_vector(inputs, self.word_to_ix)
        target = torch.tensor(self.word_to_ix[target], dtype=torch.long)
        return context, target


    



In [83]:
train_dataset = CBOWDataset(train_data, word_to_ix,CONTEXT_SIZE)
val_dataset = CBOWDataset(val_data, word_to_ix,CONTEXT_SIZE)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

cbow = CBOW(vocab_size,embedding_size)

In [None]:
epochs = 10
training_loss=[]
validation_loss=[]
for epoch in range(epochs):
    train_loss = 0.0
    val_loss = 0.0
    cbow.train()
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = cbow(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    training_loss.append(loss.item())

    cbow.eval()
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(val_loader):
            outputs = cbow(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            
    validation_loss.append(val_loss)
    print(f'End of epoch {epoch}')

In [None]:
from matplotlib import pyplot as plt

plt.plot(training_loss)

In [None]:
plt.plot(validation_loss)