**Installing Libraries**

In [0]:
# Install the third-party packages imported below (IPython shell escapes).
# NOTE(review): versions are not pinned; a later cell reads
# `model_class.pretrained_model_archive_map`, which presumably exists only in
# older transformers releases — consider pinning the versions. TODO confirm.
!pip install transformers
!pip install jsonlines

**Importing Necessary Libraries**

In [0]:
import jsonlines  
import pandas as pd
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

import numpy as np # linear algebra
from pathlib import Path 
import os
import torch
import torch.optim as optim
import random 

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *
from transformers import AdamW
from functools import partial

**Reading data from .jsonl files**

In [0]:
# Training split: keep the label, the response and its conversation context.
path = '/content/drive/My Drive/Sarcasm shared task/sarcasm/twitter/sarcasm_detection_shared_task_twitter_training.jsonl'
train_columns = ['label', 'response', 'context']
with jsonlines.open(path) as reader:
    # One row per JSON line, fields taken in `train_columns` order.
    train_rows = [[record[column] for column in train_columns] for record in reader]

data_twitter = pd.DataFrame(train_rows, columns=train_columns)
data_twitter.head()

In [0]:
# Test split: there is no label here, each record carries an id instead.
path = '/content/drive/My Drive/Sarcasm shared task/sarcasm/twitter/sarcasm_detection_shared_task_twitter_testing.jsonl'
test_columns = ['id', 'response', 'context']
with jsonlines.open(path) as reader:
    # One row per JSON line, fields taken in `test_columns` order.
    test_rows = [[record[column] for column in test_columns] for record in reader]

data_twitter_test = pd.DataFrame(test_rows, columns=test_columns)
data_twitter_test.head()

**Storing the second-last (`d1`) and last (`d2`) sentences of each context set in separate columns**

In [0]:
# d1 -> second-to-last context utterance, d2 -> last context utterance.
# Assumes every context list holds at least two entries (IndexError otherwise).
d1 = [context[-2] for context in data_twitter['context']]
d2 = [context[-1] for context in data_twitter['context']]

data_twitter['d1'] = d1
data_twitter['d2'] = d2

data_twitter.head()

In [0]:
# Same split for the test set:
# d1 -> second-to-last context utterance, d2 -> last context utterance.
# Assumes every context list holds at least two entries (IndexError otherwise).
d1 = [context[-2] for context in data_twitter_test['context']]
d2 = [context[-1] for context in data_twitter_test['context']]

data_twitter_test['d1'] = d1
data_twitter_test['d2'] = d2

data_twitter_test.head()

**Choosing model architecture**

In [0]:
# Registry of supported architectures:
# name -> (sequence-classification model class, tokenizer class, config class).
MODEL_CLASSES = dict(
    bert=(BertForSequenceClassification, BertTokenizer, BertConfig),
    xlnet=(XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    xlm=(XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    roberta=(RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    distilbert=(DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
)

# Architecture used for the rest of the notebook.
model_type = 'bert'
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [0]:
# Display the keys of the selected model class's pretrained archive map
# (presumably the checkpoint names that can be used as `pretrained_model_name`).
# NOTE(review): this attribute may not exist in newer transformers releases — confirm
# against the installed version.
model_class.pretrained_model_archive_map.keys()

In [0]:
# Checkpoint name passed to `from_pretrained` for the tokenizer, config and model below.
pretrained_model_name='bert-base-uncased'

**Fixing seeds for reproducibility**

In [0]:
seed = 42        # shared seed for the RNGs and the train/validation split
use_fp16 = True  # half-precision flag; NOTE(review): not referenced by any other cell visible here
bs = 16          # batch size handed to the databunch


def seed_all(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available the GPU generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.
    """
    # CPU-side generators: Python, NumPy, PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU-side generators plus cuDNN determinism settings.
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_all(seed)

**Defining Tokenizer**

In [0]:
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai.

    Args:
        pretrained_tokenizer: transformers tokenizer that does the actual tokenization.
        model_type: architecture name; selects where the special tokens are placed
            ('roberta' and 'xlnet' are handled specially, everything else gets
            ``[CLS] tokens [SEP]``).
        **kwargs: accepted for signature compatibility and ignored.
    """
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        # Newer transformers releases expose the length limit as `model_max_length`
        # and deprecate/remove `max_len`; prefer the new name, fall back to the old one.
        max_seq_len = getattr(pretrained_tokenizer, 'model_max_length', None)
        if max_seq_len is None:
            max_seq_len = pretrained_tokenizer.max_len
        self.max_seq_len = max_seq_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs):
        # Calling the instance returns the instance itself, so it can be passed
        # wherever a tokenizer factory (presumably fastai's `tok_func`) is expected.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limit the maximum sequence length and add the special tokens.

        The text is tokenized, truncated to ``max_seq_len - 2`` tokens (room for the
        two special tokens) and returned with CLS/SEP attached.
        """
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        max_tokens = self.max_seq_len - 2
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:max_tokens]
            tokens = [CLS] + tokens + [SEP]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:max_tokens]
            if self.model_type in ['xlnet']:
                # XLNet branch: special tokens go at the end of the sequence.
                tokens = tokens + [SEP] +  [CLS]
            else:
                tokens = [CLS] + tokens + [SEP]
        return tokens

In [0]:
# Load the pretrained tokenizer for the chosen checkpoint, wrap it in the
# fastai-compatible adapter, and build a fastai Tokenizer around that adapter
# with no pre- or post-processing rules.
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[]) 

**Defining Vocabulary making function. Required for Fastai.**

In [0]:
class TransformersVocab(Vocab):
    """fastai `Vocab` that delegates token/id conversion to a transformers tokenizer."""
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super().__init__(itos = [])
        self.tokenizer = tokenizer

    def numericalize(self, t:Collection[str]) -> List[int]:
        "Map the tokens in `t` to ids using the wrapped tokenizer."
        token_ids = self.tokenizer.convert_tokens_to_ids(t)
        return token_ids

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Map the ids in `nums` back to tokens; joined with `sep` into one string unless `sep` is None."
        tokens = self.tokenizer.convert_ids_to_tokens(np.array(nums).tolist())
        if sep is None:
            return tokens
        return sep.join(tokens)

    def __getstate__(self):
        # State used for pickling: the (empty) itos list and the tokenizer.
        return dict(itos=self.itos, tokenizer=self.tokenizer)

    def __setstate__(self, state:dict):
        # Restore the pickled fields and rebuild the token -> index lookup from itos.
        self.itos, self.tokenizer = state['itos'], state['tokenizer']
        self.stoi = collections.defaultdict(int, {token: index for index, token in enumerate(self.itos)})

In [0]:
# fastai processors applied to every text, in order:
#   1. tokenize with the wrapped transformers tokenizer (no extra BOS/EOS markers;
#      the tokenizer wrapper already attaches the special tokens),
#   2. map tokens to ids through the transformers-backed vocabulary.
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)
tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)
transformer_processor = [tokenize_processor, numericalize_processor]

**Defining how to pad the text (depends on architecture to be used)**

In [0]:
# Id of the tokenizer's padding token; used when batching and when building the attention mask.
pad_idx = transformer_tokenizer.pad_token_id
# Pad at the start of the sequence only for XLNet, at the end otherwise.
pad_first = model_type == 'xlnet'

**Creating databunch for the response sentences**

In [0]:
# Build the fastai databunch for the `response` column:
#   - texts are tokenized/numericalized by `transformer_processor`
#   - 10% of the rows are held out at random for validation (seeded split)
#   - targets come from the `label` column
#   - batches of `bs` items, padded with `pad_idx` on the side chosen by `pad_first`
databunch = (TextList.from_df(data_twitter, cols='response', processor=transformer_processor)
             .split_by_rand_pct(0.1,seed=seed)
             .label_from_df(cols= 'label')
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

In [0]:
# Sanity check: print the tokenizer's special tokens and display a sample batch.
print('[CLS] token :', transformer_tokenizer.cls_token)
print('[SEP] token :', transformer_tokenizer.sep_token)
print('[PAD] token :', transformer_tokenizer.pad_token)
databunch.show_batch()

**Defining the model architecture**

In [0]:
class CustomTransformerModel(nn.Module):
    """Thin wrapper that calls a transformers model and returns only its logits.

    Args:
        transformer_model: model called as ``model(input_ids, attention_mask=...)``
            whose first output element is the logits.
        pad_token_id: id of the padding token used to derive the attention mask.
            When None (the default) the module-level ``pad_idx`` is used, which is
            the original behaviour.
    """
    def __init__(self, transformer_model: "PreTrainedModel", pad_token_id=None):
        super().__init__()
        self.transformer = transformer_model
        self.pad_token_id = pad_token_id

    def forward(self, input_ids, attention_mask=None):
        """Return the logits for `input_ids`.

        attention_mask: mask to avoid performing attention on padding token indices,
        with ``1`` for tokens that are NOT MASKED and ``0`` for MASKED tokens.
        If omitted it is derived from `input_ids` (every non-padding position is 1);
        an explicitly supplied mask is used as given.
        """
        if attention_mask is None:
            pad_id = pad_idx if self.pad_token_id is None else self.pad_token_id
            # Cast the boolean comparison to the same tensor type as the ids.
            attention_mask = (input_ids != pad_id).type(input_ids.type())

        logits = self.transformer(input_ids,
                                  attention_mask = attention_mask)[0]
        return logits

In [0]:
# Load the pretrained configuration and model for the chosen checkpoint,
# then wrap the model in CustomTransformerModel for use with the fastai Learner.
config = config_class.from_pretrained(pretrained_model_name)
transformer_model = model_class.from_pretrained(pretrained_model_name, config = config)

custom_transformer_model = CustomTransformerModel(transformer_model = transformer_model)

**Defining Learner**

In [0]:
# AdamW with bias correction switched off (`correct_bias=False`).
CustomAdamW = partial(AdamW, correct_bias=False)

# fastai Learner over `databunch` and the wrapped transformer model,
# reporting accuracy and error rate as metrics.
learner = Learner(databunch, 
                  custom_transformer_model, 
                  opt_func = CustomAdamW, 
                  metrics=[accuracy, error_rate])

# Show graph of learner stats and metrics after each epoch.
learner.callbacks.append(ShowGraph(learner))

**Splitting Model into smaller groups**

In [0]:
# Modules at which the model is split into layer groups (passed to
# `learner.split` in the next cell): the embeddings, every encoder layer, then
# the pooler. The encoder layers are read from the model itself instead of
# hard-coded indices 0-11, so the list stays correct for BERT checkpoints with a
# different number of layers; for a 12-layer encoder the result is unchanged.
bert_backbone = learner.model.transformer.bert
list_layers = [bert_backbone.embeddings,
               *bert_backbone.encoder.layer,
               bert_backbone.pooler]

In [0]:
# Split the model into layer groups at the modules in `list_layers`, then
# report how many groups were created and what they contain.
learner.split(list_layers)
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')
print(learner.layer_groups)

**Training the Model by gradually unfreezing end layers (groups)**

In [0]:
# Stage 1 setup: re-seed, then freeze up to the last layer group
# (in fastai v1, `freeze_to(-1)` presumably leaves only the final group trainable)
# and print the model summary.
seed_all(seed)
learner.freeze_to(-1)
learner.summary()

In [0]:
# Find a suitable learning rate for the slanted triangular learning rate policy:
# run the learning-rate range test, then plot it without the last 10 points and
# with the suggested rate marked.
learner.lr_find()
learner.recorder.plot(skip_end=10,suggestion=True)

In [0]:
# Stage 1 training: 2 epochs of one-cycle training with max_lr=1e-4.
# SaveModelCallback monitors 'accuracy'; the next cell reloads the checkpoint named 'bestmodel'.
learner.fit_one_cycle(2,max_lr=1e-04,moms=(0.8,0.7), callbacks=[SaveModelCallback(learner, monitor='accuracy')])

In [0]:
# Reload the 'bestmodel' checkpoint (presumably written by SaveModelCallback in the
# previous fit), store it under a stable name, and re-seed before the next stage.
learner.load('bestmodel')
learner.save('twitter-response-bert')
seed_all(seed)

In [0]:
# Stage 2 setup: move the freeze boundary one layer group earlier (`freeze_to(-2)`),
# then rerun the learning-rate range test and plot it as before.
learner.freeze_to(-2)
learner.lr_find()
learner.recorder.plot(skip_end=10,suggestion=True)

In [0]:
# Stage 2 training: 2 epochs with a learning-rate slice running from
# lr * 0.95**num_groups up to lr (fastai presumably spreads the slice across the
# layer groups). The best checkpoint is again tracked by 'accuracy'.
lr = 1e-5
learner.fit_one_cycle(2, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9), callbacks=[SaveModelCallback(learner, monitor='accuracy')])

In [0]:
# Stage 3 setup: re-seed, reload the 'bestmodel' checkpoint and move the freeze
# boundary one more layer group earlier (`freeze_to(-3)`).
seed_all(seed)
learner.load('bestmodel');
learner.freeze_to(-3)

In [0]:
# Stage 3 training: same schedule as stage 2 (2 epochs, same learning-rate slice;
# `lr` is the value set in the stage 2 cell), tracking the best 'accuracy'.
learner.fit_one_cycle(2, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9), callbacks=[SaveModelCallback(learner, monitor='accuracy')])

In [0]:
# Reload the 'bestmodel' checkpoint, overwrite the 'twitter-response-bert'
# snapshot with it, and re-seed before the final stage.
learner.load('bestmodel');
learner.save('twitter-response-bert')
seed_all(seed)

In [0]:
# Final stage: unfreeze the whole model and train 4 epochs with the same
# learning-rate slice. Unlike the earlier stages, the checkpoint callback
# monitors 'valid_loss' here rather than 'accuracy'.
lr = 1e-5
learner.unfreeze()
learner.fit_one_cycle(4, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9), callbacks=[SaveModelCallback(learner, monitor='valid_loss')])

In [0]:
# Reload the 'bestmodel' checkpoint and save it as the final response model;
# the prediction cell below loads it by this name.
learner.load('bestmodel')
learner.save('twitter best yet response')

**Saving probability values for train and test responses**

In [0]:
# Score every training response with the checkpoint saved as 'twitter best yet response'.
learner.load('twitter best yet response')
# For each text keep element [2][1] of the prediction as a float
# (presumably the probability assigned to class index 1).
pred_response = [
    float(learner.predict(response)[2][1])
    for response in tqdm(data_twitter['response'])
]

In [0]:
# Same scoring for the test responses: element [2][1] of each prediction as a float.
pred_response_test = [
    float(learner.predict(response)[2][1])
    for response in tqdm(data_twitter_test['response'])
]

**Creating databunch for last sentence of context sets** 

In [0]:
# Same data pipeline as for the responses, but on `d2` (the last context utterance):
# seeded 10% random validation split, targets from `label`, same batching/padding.
# NOTE(review): the `learner` created earlier still holds the databunch it was
# built with; rebinding `databunch` here does not change it. The model and learner
# must be re-created on this databunch before training — those cells are not shown.
databunch = (TextList.from_df(data_twitter, cols='d2', processor=transformer_processor)
             .split_by_rand_pct(0.1,seed=seed)
             .label_from_df(cols= 'label')
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

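**Perform the same model training procedure as was done for the response sentences (re-create the learner on this databunch, train, and save the best checkpoint as `twitter best yet d2`). Then record the probability values for the last context sentence of the train and test data.**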

In [0]:
# Score the last context utterance of every training row with the checkpoint
# saved as 'twitter best yet d2'.
learner.load('twitter best yet d2')
# Element [2][1] of each prediction as a float (presumably the probability of class index 1).
pred_d2 = [
    float(learner.predict(utterance)[2][1])
    for utterance in tqdm(data_twitter['d2'])
]

In [0]:
# Same scoring for the last context utterance of every test row (no progress bar here).
pred_d2_test = [
    float(learner.predict(utterance)[2][1])
    for utterance in data_twitter_test['d2']
]

**Creating databunch for second last sentence of context sets** 

In [0]:
# Same data pipeline again, this time on `d1` (the second-to-last context utterance):
# seeded 10% random validation split, targets from `label`, same batching/padding.
# NOTE(review): as with the other columns, rebinding `databunch` does not affect an
# existing `learner`; the model and learner must be re-created on this databunch
# before training — those cells are not shown.
databunch = (TextList.from_df(data_twitter, cols='d1', processor=transformer_processor)
             .split_by_rand_pct(0.1,seed=seed)
             .label_from_df(cols= 'label')
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

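**Perform the same model training procedure as was done for the response sentences (re-create the learner on this databunch, train, and save the best checkpoint as `twitter best yet d1`). Then record the probability values for the second-last context sentence of the train and test data.**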

In [0]:
# Score the second-to-last context utterance of every training row with the
# checkpoint saved as 'twitter best yet d1'.
learner.load('twitter best yet d1')
# Element [2][1] of each prediction as a float (presumably the probability of class index 1).
pred_d1 = [
    float(learner.predict(utterance)[2][1])
    for utterance in tqdm(data_twitter['d1'])
]

In [0]:
# Same scoring for the second-to-last context utterance of every test row (no progress bar here).
pred_d1_test = [
    float(learner.predict(utterance)[2][1])
    for utterance in data_twitter_test['d1']
]

**Saving all train probability values together with their labels in a csv file**

In [0]:
# One column of predicted values per model (d1, d2, response), computed on the training set.
df = pd.DataFrame({'d1': pred_d1, 'd2': pred_d2, 'response': pred_response})

# Binary target: 0 for 'NOT_SARCASM', 1 for any other label.
y = [0 if label == 'NOT_SARCASM' else 1 for label in data_twitter['label']]
df['y'] = y

df.head()

In [0]:
# Write the training-set predictions and targets to disk, without the index column.
df.to_csv('pred_values.csv', index=False)

**Saving all test probability values together with their ids in a csv file**

In [0]:
# Test-set predictions keyed by example id; there is no target column here.
df = pd.DataFrame({
    'id': data_twitter_test['id'],
    'd1': pred_d1_test,
    'd2': pred_d2_test,
    'response': pred_response_test,
})
df.head()

In [0]:
# Write the test-set predictions (with ids) to disk, without the index column.
df.to_csv('pred_values_test.csv', index=False)