In [1]:
import json
import random
import torch
import torchtext
import tqdm

In [None]:
# load the json file
with open('data/sarcasm.json', 'r') as f:
    file = json.load(f)
file

In [3]:
print(type(file))
print(len(file))
print(file[0].keys())

rand_ind = random.randint(0, len(file))

label_mapping = {0: 'non-sarcastic',
                 1: 'sarcastic'}
# print first example
print(f'\nRunning {rand_ind} example in the datatset')
print(f'The headline: {file[rand_ind]["headline"]}')
print(f'The label of fist example: {label_mapping[file[rand_ind]["is_sarcastic"]]}')
print(f'article link: {file[rand_ind]["article_link"]}')

<class 'list'>
26709
dict_keys(['article_link', 'headline', 'is_sarcastic'])

Running 1135 example in the datatset
The headline: things come apart so easily: asghar farhadi's about elly
The label of fist example: non-sarcastic
article link: https://www.huffingtonpost.com/entry/things-come-apart-so-easi_b_7023922.html


In [4]:
# randomize the contents in teh list
random.shuffle(file)

In [5]:
# split the dataset into to parts i.e. test and train
train_length = int(len(file) * 0.8)
training_examples = file[:train_length]
testing_examples = file[train_length:] 

In [18]:
class sarcasticDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)
    
    def map(self, tokenization_function, fn_kwargs = {}):
        # Apply the provided tokenization function to each example in the dataset
        tokenized_data = [tokenization_function(example, **fn_kwargs) for example in self.dataset]
        return tokenized_data
    
    def __getitem__(self, indx):  
        if indx in range(0, len(self.dataset)):
            sample = self.dataset[indx]
            headline = sample["headline"]
            label = sample["is_sarcastic"]
            return (headline, label)
        else:
            raise  ValueError(f"index {indx} is not in the range [{0}, {len(self.dataset)}]")




In [19]:
test_data = sarcasticDataset(testing_examples)
train_data = sarcasticDataset(training_examples)

In [20]:
test_loader = torch.utils.data.DataLoader(test_data, batch_size= 4, shuffle= False, 
                                     pin_memory = True, drop_last = True)


train_loader = torch.utils.data.DataLoader(train_data, batch_size= 4, shuffle= True, 
                                     pin_memory = True, drop_last = True)

In [21]:
print(len(train_loader))
print(len(test_loader))

5341
1335


In [22]:
headlines,labels = next(iter(test_loader))

In [23]:
headlines, labels

(("why south sudan's leaders are fueling the implosion of their own country",
  'bernie sanders just tweeted the most evergreen response to cbo score',
  'drug addict looking for more enabling girlfriend',
  'spanish government threatens to impose direct rule in catalonia'),
 tensor([0, 0, 1, 0]))

In [24]:
# Next Step: convert the text to the numerical representation 

seed = 0

torch.manual_seed(seed)

<torch._C.Generator at 0x26afff59310>

In [25]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [26]:

def tokenize_data(example, tokenizer, max_length):
    tokens = tokenizer(example['headline'])[:max_length]
    label = example["is_sarcastic"]
    length = len(tokens)
    return {'tokens': tokens, 'label': label, 'length': length}

In [27]:
# tokenization illustration
max_length = 256
example = file[0]
print(f'original text: {file[0]["headline"]}')
output = tokenize_data(example, tokenizer, max_length)
print(output)

original text: 10 eating habits linked to dying from cardiovascular disease and diabetes
{'tokens': ['10', 'eating', 'habits', 'linked', 'to', 'dying', 'from', 'cardiovascular', 'disease', 'and', 'diabetes'], 'label': 0, 'length': 11}


In [28]:

train_data = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
test_data = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})

In [29]:
test_loader = torch.utils.data.DataLoader(test_data, batch_size= 4, shuffle= False, 
                                     pin_memory = True, drop_last = True)


train_loader = torch.utils.data.DataLoader(train_data, batch_size= 4, shuffle= True, 
                                     pin_memory = True, drop_last = True)

In [33]:
output = next(iter(test_loader))

RuntimeError: each element in list of batch should be of equal size

In [35]:
test_data

[{'tokens': ['why',
   'south',
   'sudan',
   "'",
   's',
   'leaders',
   'are',
   'fueling',
   'the',
   'implosion',
   'of',
   'their',
   'own',
   'country'],
  'label': 0,
  'length': 14},
 {'tokens': ['bernie',
   'sanders',
   'just',
   'tweeted',
   'the',
   'most',
   'evergreen',
   'response',
   'to',
   'cbo',
   'score'],
  'label': 0,
  'length': 11},
 {'tokens': ['drug',
   'addict',
   'looking',
   'for',
   'more',
   'enabling',
   'girlfriend'],
  'label': 1,
  'length': 7},
 {'tokens': ['spanish',
   'government',
   'threatens',
   'to',
   'impose',
   'direct',
   'rule',
   'in',
   'catalonia'],
  'label': 0,
  'length': 9},
 {'tokens': ['sea',
   'lion',
   'yanks',
   'man',
   'off',
   'boat',
   'in',
   'effort',
   'to',
   'snatch',
   'fish'],
  'label': 0,
  'length': 11},
 {'tokens': ['report',
   'getting',
   'out',
   'of',
   'bed',
   'in',
   'morning',
   'sharply',
   'increases',
   'risk',
   'of',
   'things',
   'getting',
   '