In [1]:
import torch
from torch.utils.data import Dataset
import json
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# class defenition for dataset
class CheeseDescriptionsDataset(Dataset):
    def __init__(self, annotation_file):
        self.annot_file = annotation_file
        self.df = self.load_data(self.annot_file)

    
    def load_data(self, annot_file):
        with open(annot_file, 'r', encoding='utf-8') as f:
            lines = json.load(f)
            task_input, task_output = [], []
            for file in lines:
                line = lines[file]
                for rhet_tag in line:
                    text = line[rhet_tag]['text']
                    slots = line[rhet_tag]['slots']
                    formatted_slot = ''
                    for slot_key, slot_value in slots.items():
                        formatted_slot+= '<'+slot_key+':'+slot_value+'>'
                    task_input.append(text)
                    task_output.append(formatted_slot)
        
        data = {'input':task_input, 'output':task_output}
        df = pd.DataFrame(data=data, columns=['input','output'])
        return df

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        # return specific items
        row = self.df.iloc[index]
        print(row['input'])
        print(row['output'])
        return row
    


In [10]:
class BertPipeline:
    def __init__(self,
                 tokenizer,
                 max_len_encoder,
                 max_len_decoder,
                 **kwargs):
        self.tokenizer = tokenizer
        self.max_len_encoder = max_len_encoder
        self.max_len_decoder = max_len_decoder

    def __call__(self, row):
        decoder_text = row['output']
        encoder_text = row['input']

        # prepare encoder inputs
        enc_tokens = self.tokenizer(encoder_text,
                                    max_length = self.max_len_encoder,
                                    padding = 'max_length',
                                    truncation = True)
        encoder_input_ids = enc_tokens['input_ids']
        encoder_attention_mask = [1 if x!=0 else 0 for x in encoder_input_ids]
        encoder_cross_attention_mask = [1 if x!=0 else 0 for x in encoder_attention_mask]

        # prepare decoder inputs
        dec_tokens = self.tokenizer(decoder_text,
                                    max_length = self.max_len_decoder,
                                    padding = 'max_length',
                                    truncation = True)
        
        decoder_input_ids = dec_tokens['input_ids']
        decoder_attention_mask = [1 if x!=0 else 0 for x in decoder_input_ids]

        # prepare the labels and target ids are shifted inside the decoder model forward pass
        decoder_target_ids = [x for x in decoder_input_ids]
        
        ds = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            'cross_attention_mask': encoder_cross_attention_mask,
            'decoder_input_ids': decoder_input_ids,
            'decoder_attention_mask': decoder_attention_mask,
            'labels': decoder_target_ids
        }

        return ds

In [13]:
annot_file = '../Data/slots_data/rhet_data_slots_cleaned.json'
ds = CheeseDescriptionsDataset(annotation_file=annot_file)
ds.__getitem__(0)
ds.__getitem__(6)
ds.__getitem__(12)

Grandma Singletons Beacon Fell PDO Traditional Creamy Lancashire is a true gem of Lancashire, produced by the renowned cheese manufacturer Grandma Singletons. This cheese has a rich history and geographical provenance, as it is made using locally sourced milk from farms within a 12-mile radius.
<NameOfCheese:Grandma Singletons Beacon Fell PDO Traditional Creamy Lancashire><QualityOfRind:Creamy><AnimalSource:None><MainIngredient:None><MainIngredientWeight:None><NameOfCheesery:Grandma Singletons><NamesOfFacilities:None><ManufacturePlace:Lancashire><ManufacturedSinceDate:None><CheeseOriginatedDate:None><CheeseCreatorMovedFromPlace:None><CheeseCreatorMovedToPlace:None><CheeseNameFromObject:Beacon Fell>
Introducing Bonchester, a delectable British cheese that has been awarded with a Protected Designation of Origin (PDO). This artisan cheese is made in the border lands of England and Scotland, specifically in Bonchester Bridge, Roxburghshire. It was first developed in 1980 on the Easter Ween

input     Buxton Blue, a modern creamery blue cheese, is...
output    <NameOfCheese:Buxton Blue><QualityOfRind:moder...
Name: 12, dtype: object

In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-1.3B',
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              pad_token='<|pad|>',
                                              sep_token='<|sep|>')
tokenizer.all_special_tokens

  from .autonotebook import tqdm as notebook_tqdm


['<|startoftext|>', '<|endoftext|>', '<|sep|>', '<|pad|>']

In [19]:
# Rough Work
encoder_text = 'hello world'
enc_tokens = tokenizer(encoder_text,
                    max_length = 1024,
                    padding = 'max_length',
                    truncation = True)
print(enc_tokens)
print(tokenizer.decode(enc_tokens['input_ids']))


{'input_ids': [31373, 995, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 

In [4]:
print(tokenizer.bos_token)

<|startoftext|>


In [3]:
tokenizer.tokenize('hello world jjj')
test_tokens = [[50257,    27,  5376,  5189,  7376,  2771,    25, 10462,  3021,  1000,
         27601,  6927, 35013,  5189,    49,   521,    25, 36380,    11, 40973,
          6927, 40002,  7416,    25, 14202,  6927, 13383, 27682,   445,  1153,
            25, 14202,  6927, 13383, 27682,   445,  1153, 25844,    25, 14202,
          6927,  5376,  5189,  7376,   274,  1924,    25,    49,  2308,  1641,
          6927, 36690,  5189, 47522,  2410,    25, 14202,  6927, 44445,   495,
         27271,    25, 14868,  6327,    11, 31157,  6927, 44445,  1522,  6385,
         10430,    25,  1157,   400,  4289,  6927,  7376,  2771, 11610,  3898,
         10430,    25,  1157,   400,  4289,  6927,  7376,  2771, 16719,   273,
            44,  2668,  4863, 27271,    25, 14202,  6927,  7376,  2771, 16719,
           273,    44,  2668,  2514, 27271,    25, 10462,  3021,  1000,  6927,
          7376,  2771,  5376,  4863, 10267,    25,    34,  1694,    66,   666,
         27646,    29, 50258]]
tokenizer.batch_decode(test_tokens)

['<|startoftext|><NameOfCheese:Swaledale Cheese><QualityOfRind:traditional, handmade><AnimalSource:None><MainIngredient:None><MainIngredientWeight:None><NameOfCheesery:Reed family><NamesOfFacilities:None><ManufacturePlace:Richmond, Yorkshire><ManufacturedSinceDate:11th century><CheeseOriginatedDate:11th century><CheeseCreatorMovedFromPlace:None><CheeseCreatorMovedToPlace:Swaledale><CheeseNameFromObject:Cistercian monks><|sep|>']

In [21]:
print(tokenizer.pad_token)
from transformers import GPTNeoForCausalLM
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
print(model.config)

<|pad|>


In [2]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.vocab_size)

30522


In [48]:
from transformers import BertTokenizer

txt= 'Hello world'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',lowercase=True)
pipeline = BertPipeline(tokenizer, 512, 512)
row = ds.__getitem__(0)
pipeline.bert_pipeline(row)



{'input_ids': [101, 13055, 28159, 2015, 14400, 3062, 22851, 2080, 3151, 24519, 9638, 2003, 1037, 2995, 17070, 1997, 9638, 1010, 2550, 2011, 1996, 8228, 8808, 7751, 13055, 28159, 2015, 1012, 2023, 8808, 2038, 1037, 4138, 2381, 1998, 10056, 10003, 6651, 1010, 2004, 2009, 2003, 2081, 2478, 7246, 23184, 6501, 2013, 8623, 2306, 1037, 2260, 1011, 3542, 12177, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 