In [24]:
#Dataset link
import numpy as np
import pandas as pd

from datasets import load_dataset, Dataset, DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd

import os
#for dirname, _, filenames in os.walk('../Datasets'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))


In [34]:
dataset_v2_path = "../Datasets/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json"
df = pd.read_json(dataset_v2_path, lines=True)
print(df.head())

dataset_hf=load_dataset("json", data_files=dataset_v2_path)
dataset_hf=dataset_hf.remove_columns(['article_link'])
dataset_hf.set_format('pandas')
dataset_hf=dataset_hf['train'][:]
dataset_hf.drop_duplicates(subset=['headline'],inplace=True)
dataset_hf=dataset_hf.reset_index()[['headline','label']]
dataset_hf=Dataset.from_pandas(dataset_hf)

   label                                           headline  \
0      1  thirtysomething scientists unveil doomsday clo...   
1      0  dem rep. totally nails why congress is falling...   
2      0  eat your veggies: 9 deliciously different recipes   
3      1  inclement weather prevents liar from getting t...   
4      1  mother comes pretty close to using word 'strea...   

                                        article_link  
0  https://www.theonion.com/thirtysomething-scien...  
1  https://www.huffingtonpost.com/entry/donna-edw...  
2  https://www.huffingtonpost.com/entry/eat-your-...  
3  https://local.theonion.com/inclement-weather-p...  
4  https://www.theonion.com/mother-comes-pretty-c...  


Using custom data configuration default-026d8913a6176efd


Downloading and preparing dataset json/default to /Users/test/.cache/huggingface/datasets/json/default-026d8913a6176efd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/test/.cache/huggingface/datasets/json/default-026d8913a6176efd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
# Train Test Valid Split
train_testvalid = dataset_hf.train_test_split(test_size=0.2,seed=15)


test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

dataset_hf = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

print(dataset_hf)

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})


In [43]:
#Checkpoint
checkpoint = "distilbert-base-uncased"
#In the model distilbert-base-uncased
# #each token is embedded into a vector of size 768. 
# The shape of the output from the base model is
# (batch_size, max_sequence_length, embedding_vector_size=768)

#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len = 512

def tokenize(batch):
  return tokenizer(batch["headline"], truncation=True, max_length=512)

tokenized_dataset = dataset_hf.map(tokenize, batched=True)
print(tokenized_dataset)
tokenized_dataset.set_format('torch', columns=["input_ids", "attention_mask", "label"] )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# https://huggingface.co/docs/transformers/main_classes/data_collator
# Data collators are objects that will form a batch by using a list of dataset elements as input. 
# These elements are of the same type as the elements of train_dataset or eval_dataset.
# To be able to build batches, data collators may apply some processing (like padding). 
#  Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation 
# (like random masking) on the formed batch.
# data_collator automatically pads the model inputs in a batch to the length of the longest example.
# This bypasses the need to set a global maximum sequence length, 
# and in practice leads to faster training since we perform fewer redundant computations 
# on the padded tokens and attention masks.

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
})


In [47]:
class MyTaskSpecificCustomModel(nn.Module):
    def __init__(self, checkpoint, num_labels ):
        super(MyTaskSpecificCustomModel, self).__init__()
        self.num_labels = num_labels
        
        self.model = model = AutoModel.from_pretrained(checkpoint, config = AutoConfig.from_pretrained(checkpoint, 
                                                                                                       output_attention = True, 
                                                                                                       output_hidden_state = True ) )
        # New Layer
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_labels )
        
    def forward(self, input_ids = None, attention_mask=None, labels = None ):
        #If the attention_mask is 0, the token id is ignored. 
        # For instance if a sequence is padded to adjust the sequence length, 
        # the padded words should be ignored hence their attention_mask are 0.
        outputs = self.model(input_ids = input_ids, attention_mask = attention_mask  )
        
        last_hidden_state = outputs[0]
        
        sequence_outputs = self.dropout(last_hidden_state)
        
        logits = self.classifier(sequence_outputs[:, 0, : ].view(-1, 768 ))
        
        loss = None
        if labels is not None:
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))
            
            return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

Making sense of nn.Linear
In your Neural Network, the self.hidden = nn.Linear(784, 256) defines a hidden (meaning that it is in between of the input and output layers), fully connected linear layer, 
which takes input x of shape (batch_size, 784), where batch size is the number of inputs (each of size 784) which are passed to the network at once (as a single tensor), 
and transforms it by the linear equation y = x*W^T + b into a tensor y of shape (batch_size, 256)

In [48]:
#PyTorch Data Loader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle = True, batch_size = 32, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['valid'], shuffle = True, collate_fn = data_collator
)

In [51]:
#training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_task_specific = MyTaskSpecificCustomModel(checkpoint=checkpoint, num_labels=2 ).to(device)
from transformers import AdamW, get_scheduler

optimizer = AdamW(model_task_specific.parameters(), lr = 5e-5 )

num_epoch = 3

num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
    
)
from datasets import load_metric
metric = load_metric("f1")

from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader) ))


for epoch in range(num_epoch):
    model_task_specific.train()
    for batch in train_dataloader:
        batch = { k: v.to(device) for k, v in batch.items() }
        #print(batch)
        outputs = model_task_specific(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)
        
    model_task_specific.eval()
    for batch in eval_dataloader:
        batch = { k: v.to(device) for k, v in batch.items() }
        with torch.no_grad():
            outputs = model_task_specific(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim = -1 )
        metric.add_batch(predictions = predictions, references = batch['labels'] )
        progress_bar_eval.update(1)
        
    print(metric.compute()) 
       

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2139 [00:00<?, ?it/s]

  0%|          | 0/8550 [00:00<?, ?it/s]

{'f1': 0.909952606635071}
{'f1': 0.9244274809160304}
{'f1': 0.9266975308641977}


In [63]:
#Post training evaluation
model_task_specific.eval()

test_dataloader = DataLoader(
    tokenized_dataset['test'], batch_size = 32, collate_fn = data_collator
)


for batch in test_dataloader:
    batch = { k: v.to(device) for k, v in batch.items() }
    with torch.no_grad():
        outputs = model_task_specific(**batch)
        
    logits = outputs.logits
    predictions = torch.argmax(logits, dim = -1)
    #metric.add_batch(predictons = predictions, references=batch['labels'] )
    metric.add_batch(predictons = predictions, references=batch['labels'] )
    
metric.compute()  

ValueError: Bad inputs for metric: ['predictons']. All required inputs are ['predictions', 'references']