In [5]:
import os

import torch
from torch.utils.data import ConcatDataset, DataLoader, Dataset, TensorDataset

In [6]:
# Directory that the dataset classes in this notebook scan for paired
# input<N>.pt / target<N>.pt tensor files.
# NOTE(review): absolute path, presumably specific to the author's machine —
# adjust before running elsewhere.
data_dir = "/projectnb/textconv/llama/tensor_dataset_2/"


In [24]:

class _PairedTensorFileDataset(Dataset):
    """Shared machinery for datasets built from ``input<N>.pt`` / ``target<N>.pt`` pairs.

    Every ``input<N>.pt`` in ``data_dir`` that has a sibling ``target<N>.pt``
    (same ``<N>``, where ``<N>`` parses as an integer) is loaded with
    ``torch.load``. The tensors of all pairs are concatenated along their first
    axis, in ascending order of ``<N>``, so sample indices are reproducible
    regardless of the order in which the filesystem lists the directory.

    Attributes:
        data_dir: The directory that was scanned.
        file_ids: Integer ids of the loaded pairs, ascending.
        input_datasets / target_datasets: One ``TensorDataset`` per file.
        combined_input_dataset / combined_target_dataset: ``ConcatDataset``
            views over the per-file datasets.

    Raises:
        ValueError: If no pair is found, or if an input file and its target
            file hold a different number of samples.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.input_datasets = []
        self.target_datasets = []

        pairs = self._find_file_pairs(data_dir)
        if not pairs:
            # Fail here with a readable message instead of letting
            # ConcatDataset trip its internal assertion on an empty list.
            raise ValueError(
                f"no input<N>.pt / target<N>.pt pairs found in {data_dir!r}"
            )
        self.file_ids = [file_id for file_id, _, _ in pairs]

        for _, input_name, target_name in pairs:
            # NOTE: torch.load may unpickle arbitrary objects; only point
            # data_dir at files you trust.
            input_tensor = torch.load(os.path.join(data_dir, input_name))
            target_tensor = torch.load(os.path.join(data_dir, target_name))
            input_tensor, target_tensor = self._prepare(input_tensor, target_tensor)

            # __getitem__ pairs samples by position, so a length mismatch
            # would silently misalign inputs and targets.
            if input_tensor.size(0) != target_tensor.size(0):
                raise ValueError(
                    f"{input_name} holds {input_tensor.size(0)} samples but "
                    f"{target_name} holds {target_tensor.size(0)}"
                )

            self.input_datasets.append(TensorDataset(input_tensor))
            self.target_datasets.append(TensorDataset(target_tensor))

        # Combine the individual datasets into a single dataset using ConcatDataset
        self.combined_input_dataset = ConcatDataset(self.input_datasets)
        self.combined_target_dataset = ConcatDataset(self.target_datasets)

    @staticmethod
    def _find_file_pairs(data_dir):
        """Return sorted ``(file_id, input_name, target_name)`` tuples for ``data_dir``."""
        names = set(os.listdir(data_dir))  # set: O(1) lookup of the target name
        pairs = []
        for name in names:
            if not (name.startswith("input") and name.endswith(".pt")):
                continue
            suffix = name[len("input"):-len(".pt")]
            target_name = f"target{suffix}.pt"
            if target_name not in names:
                continue
            try:
                file_id = int(suffix)
            except ValueError:
                # e.g. "input_notes.pt": not a numbered shard, ignore it.
                continue
            # Keep the on-disk names: rebuilding them from the integer id
            # would lose zero padding ("input002.pt" -> "input2.pt").
            pairs.append((file_id, name, target_name))
        pairs.sort()
        return pairs

    def _prepare(self, input_tensor, target_tensor):
        """Hook applied to each loaded pair before wrapping; identity by default."""
        return input_tensor, target_tensor

    def __len__(self):
        """Total number of samples across all loaded files."""
        return len(self.combined_input_dataset)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors at position ``idx``."""
        return (
            self.combined_input_dataset[idx][0],  # unwrap TensorDataset's 1-tuple
            self.combined_target_dataset[idx][0],
        )


class CustomConcatDataset(_PairedTensorFileDataset):
    """All ``input<N>.pt`` / ``target<N>.pt`` pairs of a directory as one dataset.

    Args:
        data_dir: Directory containing the paired ``.pt`` files.

    Indexing yields ``(input_tensor, target_tensor)`` for one sample, with the
    tensors exactly as stored on disk.
    """


def create_mapping(vocab_mask):
    """Map each value in ``vocab_mask`` to its position in ``vocab_mask``.

    Args:
        vocab_mask: Sequence of hashable values (the kept indices).

    Returns:
        dict: ``{value: position}``; if a value repeats, its last position wins.
    """
    return {vocab_id: position for position, vocab_id in enumerate(vocab_mask)}


class MaskedConcatDataset(_PairedTensorFileDataset):
    """Like ``CustomConcatDataset``, but keeps only selected indices of each tensor.

    For every loaded pair, the input is indexed as ``input[:, :, vocab_mask]``
    (axis 2) and the target as ``target[:, vocab_mask]`` (axis 1) before the
    files are concatenated.

    Args:
        data_dir: Directory containing the paired ``.pt`` files.
        mask: A ``set`` (or ``frozenset``) of integer indices to keep.

    Attributes (in addition to the shared ones):
        vocab_mask: The kept indices, sorted ascending.
        mask_mapping: ``{original_index: position_after_masking}``.

    Raises:
        ValueError: If ``mask`` is not a set, or for the shared reasons
            (no file pairs found, input/target length mismatch).
    """

    def __init__(self, data_dir, mask):
        if not isinstance(mask, (set, frozenset)):
            raise ValueError("mask should be a set")
        # Must be set before the base initializer runs: _prepare reads it.
        self.vocab_mask = sorted(mask)
        self.mask_mapping = create_mapping(self.vocab_mask)
        super().__init__(data_dir)

    def _prepare(self, input_tensor, target_tensor):
        """Select the ``vocab_mask`` indices from both tensors of a pair."""
        return (
            input_tensor[:, :, self.vocab_mask],
            target_tensor[:, self.vocab_mask],
        )




In [26]:
# Unmasked view: every tensor exactly as stored under data_dir.
custom_dataset = CustomConcatDataset(data_dir)

# Masked view: same files, restricted to the 25 indices listed here.
masked_dataset = MaskedConcatDataset(
    data_dir,
    {
        0, 1, 2, 3, 4, 5,
        10, 11, 12, 13, 14, 15, 16, 17, 18,
        23, 24, 25, 26,
        46, 57, 68, 79,
        133, 144,
    },
)

# One shuffling loader per view, three samples per batch.
dataloader = DataLoader(custom_dataset, shuffle=True, batch_size=3)
masked_dataloader = DataLoader(masked_dataset, shuffle=True, batch_size=3)

In [20]:
# Smoke-test the unmasked loader: report the batch count, then the shape and
# contents of every batch it yields.
# The batch count does not change while iterating, so print it once up front
# instead of repeating it before every batch.
print(len(dataloader))
for batch_number, (input_batch, target_batch) in enumerate(dataloader, start=1):
    print(f"Batch {batch_number}:")
    print("Input batch:")
    print(input_batch.shape, input_batch)
    print("Target batch:")
    print(target_batch.shape, target_batch)

9
Batch 1:
Input batch:
torch.Size([3, 512, 32000]) tensor([[[0.8867, 0.7114, 0.8491,  ..., 0.4878, 0.4133, 0.2976],
         [0.4265, 0.0823, 0.6201,  ..., 0.7935, 0.6177, 0.7500],
         [0.0869, 0.6675, 0.8687,  ..., 0.2700, 0.4075, 0.7529],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.0573, 0.7607, 0.7856,  ..., 0.6851, 0.1888, 0.7559],
         [0.5112, 0.7085, 0.1101,  ..., 0.1802, 0.4006, 0.8838],
         [0.5107, 0.2400, 0.8008,  ..., 0.0255, 0.4253, 0.8247],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.9985, 0.6440, 0.4163,  ..., 0.5381, 0.7275, 0.0883],
         [0.6777, 0.4272, 0.5869,  ..., 0.9263, 0.1285, 0.7510],
      

In [27]:
# Smoke-test the masked loader: report the batch count, then the shape and
# contents of every batch it yields.
# Report the length of the loader actually being iterated (this previously
# printed len(dataloader), i.e. the unmasked loader), and do it once rather
# than before every batch.
print(len(masked_dataloader))
for batch_number, (input_batch, target_batch) in enumerate(masked_dataloader, start=1):
    print(f"Batch {batch_number}:")
    print("Input batch:")
    print(input_batch.shape, input_batch)
    print("Target batch:")
    print(target_batch.shape, target_batch)

9
Batch 1:
Input batch:
torch.Size([3, 512, 25]) tensor([[[0.9722, 0.4280, 0.2979,  ..., 0.9360, 0.6064, 0.9531],
         [0.8521, 0.6016, 0.7222,  ..., 0.5532, 0.8638, 0.6553],
         [0.2754, 0.5688, 0.1438,  ..., 0.9702, 0.4780, 0.3281],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.5879, 0.3330, 0.9839,  ..., 0.6460, 0.2025, 0.5830],
         [0.8115, 0.8628, 0.7817,  ..., 0.5820, 0.0269, 0.3716],
         [0.6631, 0.3469, 0.9180,  ..., 0.7759, 0.0476, 0.0280],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.8169, 0.8115, 0.7271,  ..., 0.1464, 0.2595, 0.4863],
         [0.6216, 0.7661, 0.9849,  ..., 0.8452, 0.1293, 0.5034],
         

In [None]:
# TODO: add the models (one with sentence-level embeddings, one standard), and make it possible to set hyperparameters and save from the command line.
