#### Clickbait Detection on Headline's text & Article's text using Deberta and Electra pre-trained Models and their Tokenizers

In [1]:
from sklearn.metrics import f1_score, balanced_accuracy_score, mean_squared_error, confusion_matrix, recall_score, precision_score, accuracy_score, log_loss
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments, BertweetTokenizer, BertConfig
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import random
import torch
import json
import os
import re
import gc

In [2]:
path = './Webis17HeadlineArticleClsData/'  # Webis17HeadlineClsData or Webis17HeadlineArticleClsData
root = "./"

In [3]:
df_train = pd.read_csv(path+'train.csv')
df_validation = pd.read_csv(path+'validation.csv')
df_test = pd.read_csv(path+'test.csv')

In [4]:
df_train.head()

Unnamed: 0,postText,targetParagraphs,truthClass
0,UK’s response to modern slavery leaving victim...,Thousands of modern slavery victims have not c...,no-clickbait
1,this is good,President Donald Trump has appointed the pro-l...,clickbait
2,"The ""forgotten"" Trump roast: Relive his brutal...",When the White House correspondents’ dinner is...,no-clickbait
3,Meet the happiest #dog in the world!,Adorable is probably an understatement. This a...,clickbait
4,Tokyo's subway is shut down amid fears over an...,One of Tokyo's major subways systems says it s...,no-clickbait


In [5]:
len(df_train), len(df_validation), len(df_test)

(19538, 2459, 18979)

In [6]:
def normalize_sentence(sentence, replace_url=False, replace_usr=False):
    if not replace_url and not replace_usr:
        #print(replace_url,replace_usr)
        return sentence
    new_sentence = []
    for word in sentence.split(" "):
        # print(word)
        # @mentions, @users
        if (word.startswith("@") or word.startswith('"@')) and replace_usr:
            new_sentence.append('@user')
        # URL: https, http
        elif (word.startswith("http:") or word.startswith("https:")) and replace_url:
            new_sentence.append('internet_site')
        else:
            new_sentence.append(word)
    # remove extra " "
    new_sentence = re.sub(' +', ' ', " ".join(new_sentence))
    return new_sentence.strip()

In [7]:
def get_dataset(df_train,df_dev,df_test,root,replace_url=False,replace_usr=False):
    trainDataset = df_train[['postText', 'targetParagraphs', 'truthClass']].copy()
    trainDataset['x'] = trainDataset.postText.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    trainDataset['x1'] = trainDataset.targetParagraphs.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    trainDataset['y'] = trainDataset.truthClass.str.strip()
    print('Total amount of train',len(trainDataset.index))

    validationDataset = df_dev[['postText', 'targetParagraphs', 'truthClass']].copy()
    validationDataset['x'] = validationDataset.postText.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    validationDataset['x1'] = trainDataset.targetParagraphs.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    validationDataset['y'] = validationDataset.truthClass.str.strip()
    print('Total amount of validation',len(validationDataset.index))

    testDataset = df_test[['postText', 'targetParagraphs', 'truthClass']].copy()
    testDataset['x'] = testDataset.postText.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    testDataset['x1'] = trainDataset.targetParagraphs.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    testDataset['y'] = testDataset.truthClass.str.strip()
    print('Total amount of test',len(testDataset.index))

    return trainDataset, validationDataset, testDataset

trainDataset, validationDataset, testDataset = get_dataset(df_train, df_validation, df_test, ".", True, True)

labels = list(set(trainDataset.truthClass.tolist()))
labels.sort(key=lambda item: (-len(item), item), reverse=True)
nLabels = len(labels)
print('Labels:', labels)

trainDataset = trainDataset.drop(['postText', 'targetParagraphs', 'truthClass'], axis=1)
validationDataset = validationDataset.drop(['postText', 'targetParagraphs', 'truthClass'], axis=1)
testDataset = testDataset.drop(['postText', 'targetParagraphs', 'truthClass'], axis=1)

Total amount of train 19538
Total amount of validation 2459
Total amount of test 18979
Labels: ['clickbait', 'no-clickbait']


In [8]:
print(testDataset.head())

                                                   x  \
0  Johnny Manziel on Browns' No. 1 pick Myles Gar...   
1  Fabio: California Is a 'Mess' Because of Liber...   
2            "He's been huge for us this year, man."   
3  New Bears quarterback Mitchell Trubisky was gr...   
4  It's not enough to let employees work flexible...   

                                                  x1             y  
0  Thousands of modern slavery victims have not c...  no-clickbait  
1  President Donald Trump has appointed the pro-l...  no-clickbait  
2  When the White House correspondents’ dinner is...     clickbait  
3  Adorable is probably an understatement. This a...  no-clickbait  
4  One of Tokyo's major subways systems says it s...  no-clickbait  


In [9]:
class Trainer(Trainer):
    def evaluate(self, eval_dataset= None, ignore_keys=None):
        outputMetrics = super().evaluate(eval_dataset, ignore_keys)
#         print('outputMetrics: ', outputMetrics)
        return outputMetrics

In [10]:
class gn_dataset(Dataset):
    
    def __init__(self,data,labels,tokenizer):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer

    def processText(self, text, text_pair):
        tokenized = self.tokenizer(text, text_pair, truncation=True)

        return tokenized

    def __len__(self):
        return len(self.data.index)

    def __getitem__(self,i):
        row = self.data.iloc[i]
        x = self.processText(row['x'], row['x1']).data
        
        try:
            y = self.labels.index(row['y'])
        except:
            y = len(self.labels) - 1 

        x['labels'] = y
        return x

    def randomItem(self):
        return self.__getitem__(random.randint(0,self.__len__()))

In [11]:
all_computeMetrics = []
test_metrics = []

In [12]:
class Model:
    def __init__(self,modelPath = '',nLabels = 3, labels=None):
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") # put cpu for more extensive error desc...
        print('device: ', self.device)
        self.maxLength = 128  # max 512

        self.nLabels = nLabels

        self.loadModel(modelPath) 
        print('** Model: ', self.loadModel)

        self.labels = labels

    def model_init(self, dropout = 0.1):
        config = AutoConfig.from_pretrained(
                    self.MODEL_PATH,
                    num_labels=self.nLabels,
                    return_dict = True,
                    hidden_dropout_prob = dropout
                )

        print('configconfig', config)
        return AutoModelForSequenceClassification.from_pretrained(self.MODEL_PATH,config=config).to(self.device)

    def computeMetrics(self,evalPrediction):
        yPred = evalPrediction.predictions.argmax(1)
        yTrue = evalPrediction.label_ids

        metrics = {}
        
        metrics['val accuracy'] = accuracy_score(yTrue, yPred)
        metrics['f1'] = f1_score(yTrue, yPred)

        # original paper: https://link.springer.com/chapter/10.1007/978-3-319-30671-1_72/tables/2 / ROC-AUC, Precision, Recall
#         metrics['balanced_accuracy'] = balanced_accuracy_score(yTrue, yPred) # deal with imbalanced datasets
#         metrics['f1_macro'] = f1_score(yTrue, yPred, average='macro') # Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
#         metrics['mean_squared_error'] = mean_squared_error(yTrue, yPred)
#         if 'click' in self.labels[0]:
#             indexes = [i for i,x in enumerate(self.labels) if x == 'clickbait']
#             metrics['f1_binary'] = f1_score(yTrue, yPred, average='binary',pos_label=indexes[0])
#         metrics['confusion_matrix'] = str(confusion_matrix(yTrue, yPred))
#         tn, fp, fn, tp = confusion_matrix(yTrue, yPred).ravel()
#         recall = recall_score(yTrue, yPred) 
#         precision = precision_score(yTrue, yPred)
#         metrics['precision'] = precision
#         metrics['recall'] = recall
#         metrics['log_loss'] = log_loss(yTrue, yPred)
        
        all_computeMetrics.append(metrics)
        
        return metrics

    def loadModel(self,modelPath):
        self.MODEL_PATH = modelPath
        self.MODEL = self.model_init()
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_PATH,model_max_length = self.maxLength, use_fast=True)
        
    def saveModel(self,saveDir):
        print('saving model at: ', saveDir)
        self.MODEL.save_pretrained(saveDir)
        self.TOKENIZER.save_pretrained(saveDir)

    def train_loop(self,saveDir,checkpointDir,trainingData,validationData,testData,labels):
        trainDataset = gn_dataset(trainingData,labels,self.TOKENIZER)
        validationDataset = gn_dataset(validationData,labels,self.TOKENIZER)
        testDataset = gn_dataset(testData,labels,self.TOKENIZER)
        checkpoints = {}

        # TRAIN FUNCTION
        runName = experiment + str(len(checkpoints))
        print('checkpointDir: ', checkpointDir)

        args = TrainingArguments(
                                output_dir=checkpointDir,
                                save_strategy='no',
                                do_train=True, # DEFAULT: False
                                do_eval=True, # DEFAULT: False
                                do_predict=True, # DEFAULT: False
#                                 save_steps = 1200, # DEFAULT: 500
#                                 eval_steps = 400, #200, # DEFAULT: 500
                                evaluation_strategy = 'epoch', # DEFAULT: "no"
                                logging_first_step = True, # DEFAULT: False
#                                 dataloader_num_workers = 12, # 6, DEFAULT: 0
                                learning_rate = 2e-5,
                                num_train_epochs = 3,
                                per_device_train_batch_size = 16,
                                per_device_eval_batch_size = 16, #16,
                                weight_decay = 0.05 ,
                                warmup_steps = 0, # DEFAULT: 0
                                logging_dir=checkpointDir+"/logs",
                                metric_for_best_model = 'f1_binary', # DEFAULT: None
                                greater_is_better = True, # DEFAULT: None
                                
        )
        model = self.model_init(dropout=0.3)

        trainer = Trainer(
                    model,
                    args = args,
                    train_dataset = trainDataset,#train
                    tokenizer = self.TOKENIZER,
                    eval_dataset = validationDataset,#dev
                    compute_metrics = self.computeMetrics,
                )

        trainer.train()

        cp = checkpointDir + '/CheckPointModel_'+ runName

        checkpoints[cp] = trainer.evaluate(testDataset) # on test set
        print('on test set:', checkpoints[cp]) 
        test_metrics.append(checkpoints[cp])
#         trainer.predict(testDataset)
#         trainer.save_model(cp)
        
    def predict(self,text):
        dataset = gn_dataset(None,None,self.TOKENIZER)
        batchEncoding = dataset.processText(text).to(self.device)

        self.MODEL.eval()
        out = self.MODEL(batchEncoding.input_ids,attention_mask = batchEncoding.attention_mask,token_type_ids = batchEncoding.token_type_ids,return_dict = True)

        return out.logits.argmax().item()

In [13]:
def train(name,root,nLabels,labels,experiment,trainDataset,validationDataset,testDataset): 
    print('name experiment: ', name, experiment)
    model = Model(name,nLabels,labels)
    print('model model: ', model)
    analysis = model.train_loop(root+"SavedModel/"+experiment,root+name.split('/')[-1]+"TrainingCheckpoints",trainDataset,validationDataset,testDataset,labels)
    return True

In [14]:
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()

launch = {
'distilbert': 'distilbert-base-uncased',
'deberta': 'microsoft/deberta-base',
'electra': 'google/electra-base-discriminator',
}
 
experiment = list(launch.keys())[1]+"_clickbait"
model = launch['deberta']

train(model,root,nLabels,labels,experiment,trainDataset,validationDataset,testDataset) 

name experiment:  microsoft/deberta-base deberta_clickbait
device:  cuda
configconfig DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.24.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

** Model:  <bound method Model.loadModel of <__main__.Model object at 0x7fb84e969f98>>
model model:  <__main__.Model object at 0x7fb84e969f98>
checkpointDir:  ./deberta-baseTrainingCheckpoints
configconfig DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.3,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.24.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

Epoch,Training Loss,Validation Loss,Val accuracy,F1
1,0.3537,0.48956,0.789345,0.856985
2,0.2997,0.5104,0.803579,0.866316
3,0.264,0.626502,0.793005,0.863722


***** Running Evaluation *****
  Num examples = 2459
  Batch size = 16
***** Running Evaluation *****
  Num examples = 2459
  Batch size = 16
***** Running Evaluation *****
  Num examples = 2459
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 18979
  Batch size = 16


on test set: {'eval_loss': 0.31432703137397766, 'eval_val accuracy': 0.8746509299752357, 'eval_f1': 0.9180756913116844, 'eval_runtime': 116.5062, 'eval_samples_per_second': 162.901, 'eval_steps_per_second': 10.188, 'epoch': 3.0}


True

In [15]:
# print(all_computeMetrics)

In [16]:
print(test_metrics[0]['eval_val accuracy'], '\n', test_metrics[0]['eval_f1'])

0.8746509299752357 
 0.9180756913116844
