In [1]:
# Enable IPython's autoreload extension so that edits to imported local
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import string
import sys

import torch
from torch.utils.data import IterableDataset
from tqdm.auto import tqdm

sys.path.insert(0, "..")
from src.skipgram.dataset import SkipGramDataset

# Mock data

In [3]:
# Five short toy sequences over the letters a-m, written one row per string
# and split on whitespace into lists of single-letter ids.
sequences = [
    row.split()
    for row in (
        "b c d e a",
        "f b b b k",
        "g m k l h",
        "b c k",
        "j i c",
    )
]

# Persist as JSON Lines: one JSON-encoded sequence per line.
with open("sequences.jsonl", "w") as f:
    for sequence in sequences:
        print(json.dumps(sequence), file=f)

# Test Iterable Dataset

In [4]:
class LargeSequenceDataset(IterableDataset):
    """Stream individual items from a JSON Lines file of sequences.

    Each non-blank line of the file is parsed as one JSON array and the
    array's elements are yielded one at a time, so the file is read lazily
    rather than loaded as a whole.

    Args:
        file_path: Path to the JSON Lines file to read.
    """

    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path

    def __iter__(self):
        """Yield every item of every sequence, in file order.

        Under a multi-worker ``DataLoader`` each worker process iterates its
        own copy of this dataset, so lines are split round-robin between
        workers; otherwise every item would be emitted once per worker.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        worker_info = torch.utils.data.get_worker_info()
        # ``None`` means single-process loading: this iterator owns every line.
        num_workers = 1 if worker_info is None else worker_info.num_workers
        worker_id = 0 if worker_info is None else worker_info.id

        # Read as UTF-8 explicitly instead of depending on the locale default.
        with open(self.file_path, "r", encoding="utf-8") as f:
            # Wrap the file with tqdm to show progress
            for line_number, line in enumerate(tqdm(f, desc="Processing lines")):
                # Lines that belong to another worker's shard are skipped.
                if line_number % num_workers != worker_id:
                    continue
                # A blank line (e.g. a trailing empty line) is not a record
                # and would make json.loads raise.
                if not line.strip():
                    continue
                # Parse the line into a Python sequence and emit its items.
                yield from json.loads(line)


# Point the streaming dataset at the mock JSON Lines file
train_dataset = LargeSequenceDataset(file_path="sequences.jsonl")

# Pull items from the stream one by one and echo each of them
item_stream = iter(train_dataset)
for data in item_stream:
    print(data)

Processing lines: 0it [00:00, ?it/s]

b
c
d
e
a
f
b
b
b
k
g
m
k
l
h
b
c
k
j
i
c


In [5]:
# Group the streamed items into batches of two
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=2,
)

# Print every batch as the loader produces it
for batch in train_loader:
    print(batch)

Processing lines: 0it [00:00, ?it/s]

['b', 'c']
['d', 'e']
['a', 'f']
['b', 'b']
['b', 'k']
['g', 'm']
['k', 'l']
['h', 'b']
['c', 'k']
['j', 'i']
['c']


# Test SkipGram implementation

In [6]:
# Stand-in for a pre-configured id -> index mapper: the first 13 ASCII
# letters (a..m) numbered in order, after which "a" and "b" trade indices so
# the mapping is not simply alphabetical.
id_to_idx = {
    letter: position for position, letter in enumerate(string.ascii_letters[:13])
}
id_to_idx["a"], id_to_idx["b"] = id_to_idx["b"], id_to_idx["a"]

# Build the skip-gram dataset over the mock file. The original note describes
# its negative sampling as frequency-based; that logic lives in
# src.skipgram.dataset and is not visible from this notebook.
dataset = SkipGramDataset(
    "sequences.jsonl",
    window_size=1,
    negative_samples=2,
    id_to_idx=id_to_idx,
)

[32m2024-09-29 15:44:10.102[0m | [1mINFO    [0m | [36msrc.skipgram.dataset[0m:[36m__init__[0m:[36m56[0m - [1mProcessing sequences to build interaction data...[0m


Building interactions: 0it [00:00, ?it/s]

In [12]:
# Batch two dataset entries at a time, merged by the dataset's own collate_fn
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=2,
    collate_fn=dataset.collate_fn,
)

# Make two full passes over the loader, printing a separator after each pass
for _ in range(2):
    for batch in dataloader:
        print(batch)
    print("___")

{'target_items': tensor([0, 0, 0, 2, 2, 2, 2, 2, 2]), 'context_items': tensor([ 2,  6,  7,  0,  3,  6,  5, 11,  5]), 'labels': tensor([1., 0., 0., 1., 1., 0., 0., 0., 0.])}
{'target_items': tensor([3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]), 'context_items': tensor([ 2,  4,  9, 10,  5, 10,  3,  1,  6,  8,  6,  5]), 'labels': tensor([1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.])}
{'target_items': tensor([1, 1, 1, 5, 5, 5]), 'context_items': tensor([ 4, 12,  9,  0,  1,  2]), 'labels': tensor([1., 0., 0., 1., 0., 0.])}
{'target_items': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'context_items': tensor([ 5,  0,  9, 11,  9,  6,  0,  0,  9,  7,  6,  9]), 'labels': tensor([1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.])}
{'target_items': tensor([ 0,  0,  0,  0,  0,  0, 10, 10, 10]), 'context_items': tensor([ 0, 10,  8,  6,  8, 12,  0,  1,  8]), 'labels': tensor([1., 1., 0., 0., 0., 0., 1., 0., 0.])}
{'target_items': tensor([ 6,  6,  6, 12, 12, 12, 12, 12, 12]), 'context_items': tensor([12,  

# Try SkipGram implementation on all data