# Synthetic Benchmark EDA
The goal of this notebook is to explore the datasets used for the synthetic benchmarks.

In [11]:
import os
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
from typing import Dict
import numpy as np
from tqdm import tqdm
from collections import Counter

# Absolute path of the parent of the current working directory; the dataset
# folders used below are resolved relative to it.
root_path = os.path.abspath(os.pardir)
print(root_path)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

/home/gkoren/scratch/code/github/guyk1971/safari
cuda


## Associative recall

In [12]:
# Folder that holds the serialized associative-recall datasets.
data_root_path = os.path.join(root_path, 'datasets', 'assoc_recall')

# Keep only the '.pt' files, in whatever order os.listdir reports them.
ds_files = list(filter(lambda fname: fname.endswith('.pt'), os.listdir(data_root_path)))
ds_files

['save_test_assoc_recall_4000_30_131072.pt',
 'train_assoc_recall_4000_60_256.pt',
 'test_assoc_recall_4000_60_256.pt',
 'train_assoc_recall_4000_100_256.pt',
 'test_assoc_recall_4000_100_256.pt',
 'train_assoc_recall_4000_100_4096.pt',
 'test_assoc_recall_4000_100_4096.pt',
 'train_assoc_recall_4000_80_1024.pt',
 'test_assoc_recall_4000_80_1024.pt',
 'train_assoc_recall_4000_80_4096.pt',
 'test_assoc_recall_4000_80_4096.pt',
 'train_assoc_recall_4000_30_1024.pt',
 'test_assoc_recall_4000_30_1024.pt',
 'train_assoc_recall_4000_70_1024.pt',
 'test_assoc_recall_4000_70_1024.pt',
 'train_assoc_recall_4000_70_4096.pt',
 'test_assoc_recall_4000_70_4096.pt',
 'train_assoc_recall_4000_30_4096.pt',
 'test_assoc_recall_4000_30_4096.pt',
 'train_assoc_recall_4000_30_512.pt',
 'test_assoc_recall_4000_30_512.pt',
 'train_assoc_recall_4000_60_4096.pt',
 'test_assoc_recall_4000_60_4096.pt',
 'save_train_assoc_recall_4000_30_131072.pt']

In [13]:
# Pick one dataset configuration; the three values below select the file names.
num_examples, vocab_size, input_seq_len = 4000, 60, 4096

# Build the train/test file paths for that configuration.
train_path = os.path.join(
    data_root_path,
    f"train_assoc_recall_{num_examples}_{vocab_size}_{input_seq_len}.pt",
)
test_path = os.path.join(
    data_root_path,
    f"test_assoc_recall_{num_examples}_{vocab_size}_{input_seq_len}.pt",
)

# Deserialize both splits (train first, then test).
train_tensor = torch.load(train_path)
test_tensor = torch.load(test_path)

# Report the shape of each split, one per line.
print(train_tensor.shape, test_tensor.shape, sep='\n')

torch.Size([4000, 2, 4098])
torch.Size([500, 2, 4098])


In [14]:
# Check whether there is any sample whose query token (the last element of
# row 0) never occurs earlier in that row, i.e. shows up for the first time
# as the query.
def query_seen_in_prompt(samples):
    """Tell, per sample, whether the last token of row 0 also occurs earlier in row 0.

    Args:
        samples: tensor indexed as [sample, row, position]; only row 0 is read.
            NOTE(review): row 0 presumably holds the input tokens - confirm.

    Returns:
        A list with one bool per sample (True when the last token was seen before).
    """
    tokens = samples[:, 0, :]
    # Compare each sample's final token against all of its preceding tokens;
    # this is the batched form of `t[0, -1] in t[0, :-1]`.
    return (tokens[:, :-1] == tokens[:, -1:]).any(dim=1).tolist()


cond_trn = query_seen_in_prompt(train_tensor)
print(f'the portion of training set samples in which the query appeared in the prompt is {np.sum(cond_trn)/len(cond_trn)}')

# The second report is about the test split, so it is labelled as such.
cond_tst = query_seen_in_prompt(test_tensor)
print(f'the portion of test set samples in which the query appeared in the prompt is {np.sum(cond_tst)/len(cond_tst)}')


the portion of training set samples in which the query appeared in the prompt is 1.0
the portion of training set samples in which the query appeared in the prompt is 1.0


In [5]:
# Display row 0 of the first training sample.
# NOTE(review): this cell's execution count is lower than the load cell's, so the
# saved output may come from a previously loaded dataset - re-run to refresh.
train_tensor[0, 0]

tensor([23, 49,  8, 42,  7, 34, 26, 47,  4, 44, 20, 44,  3, 57,  7, 34, 10, 36,
        18, 52,  4, 44, 28, 57, 21, 42, 21, 42, 29, 51, 29, 51, 19, 40, 25, 51,
         5, 50, 17, 34, 15, 41, 28, 57, 23, 49,  3, 57, 14, 33, 19, 40, 24, 49,
        24, 49, 11, 54, 12, 56,  6, 54, 28, 57, 27, 30, 15, 41, 13, 30, 20, 44,
        29, 51,  3, 57, 29, 51,  2, 35,  5, 50,  2, 35, 15, 41, 16, 34, 27, 30,
         9, 51, 11, 54, 25, 51, 10, 36, 21, 42,  9, 51, 22, 35, 26, 47,  4, 44,
        12, 56, 20, 44, 16, 34,  3, 57, 21, 42, 22, 35, 25, 51, 24, 49,  1, 35,
        14, 33,  8, 42, 16, 34, 20, 44,  7, 34, 17, 34, 15, 41, 12, 56, 18, 52,
         5, 50,  7, 34,  2, 35, 10, 36, 20, 44,  5, 50, 10, 36, 27, 30, 14, 33,
        23, 49,  1, 35, 19, 40,  7, 34,  1, 35,  7, 34, 28, 57, 22, 35, 16, 34,
        10, 36, 22, 35, 26, 47,  7, 34, 24, 49, 16, 34,  5, 50, 27, 30,  1, 35,
        13, 30, 15, 41,  1, 35, 16, 34,  8, 42, 25, 51, 24, 49, 13, 30, 21, 42,
         6, 54, 13, 30, 18, 52,  3, 57, 