#### Clickbait Detection by usiing Headline's text using Deberta and Electra models

In [1]:
from sklearn.metrics import f1_score, balanced_accuracy_score, mean_squared_error, confusion_matrix, recall_score, precision_score, accuracy_score, log_loss
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments, BertweetTokenizer, BertConfig
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import random
import torch
import json
import os
import re
import gc

In [2]:
path = './Webis17HeadlineClsData/'  # Webis17HeadlineClsData or Webis17HeadlineArticleClsData
root = "./"

In [3]:
df_train = pd.read_csv(path+'train.csv')
df_validation = pd.read_csv(path+'validation.csv')
df_test = pd.read_csv(path+'test.csv')

In [4]:
df_train.head()

Unnamed: 0,postText,truthClass
0,UK’s response to modern slavery leaving victim...,no-clickbait
1,this is good,clickbait
2,"The ""forgotten"" Trump roast: Relive his brutal...",no-clickbait
3,Meet the happiest #dog in the world!,clickbait
4,Tokyo's subway is shut down amid fears over an...,no-clickbait


In [5]:
len(df_train), len(df_validation), len(df_test)

(19538, 2459, 18979)

In [6]:
def normalize_sentence(sentence, replace_url=False, replace_usr=False):
    if not replace_url and not replace_usr:
        #print(replace_url,replace_usr)
        return sentence
    new_sentence = []
    for word in sentence.split(" "):
        # print(word)
        # @mentions, @users
        if (word.startswith("@") or word.startswith('"@')) and replace_usr:
            new_sentence.append('@user')
        # URL: https, http
        elif (word.startswith("http:") or word.startswith("https:")) and replace_url:
            new_sentence.append('internet_site')
        else:
            new_sentence.append(word)
    # remove extra " "
    new_sentence = re.sub(' +', ' ', " ".join(new_sentence))
    return new_sentence.strip()

In [7]:
def get_dataset(df_train,df_dev,df_test,root,replace_url=False,replace_usr=False):
    trainDataset = df_train[['postText', 'truthClass']].copy()
    trainDataset['x'] = trainDataset.postText.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    trainDataset['y'] = trainDataset.truthClass.str.strip()
    print('Total amount of train',len(trainDataset.index))

    validationDataset = df_dev[['postText', 'truthClass']].copy()
    validationDataset['x'] = validationDataset.postText.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    validationDataset['y'] = validationDataset.truthClass.str.strip()
    print('Total amount of validation',len(validationDataset.index))

    testDataset = df_test[['postText', 'truthClass']].copy()
    testDataset['x'] = testDataset.postText.astype(str).apply(lambda a: normalize_sentence(a,replace_url,replace_usr))
    testDataset['y'] = testDataset.truthClass.str.strip()
    print('Total amount of test',len(testDataset.index))

    return trainDataset, validationDataset, testDataset

trainDataset, validationDataset, testDataset = get_dataset(df_train, df_validation, df_test, ".", True, True)

labels = list(set(trainDataset.truthClass.tolist()))
labels.sort(key=lambda item: (-len(item), item), reverse=True)
nLabels = len(labels)
print('Labels:', labels)

trainDataset = trainDataset.drop(['postText', 'truthClass'], axis=1)
validationDataset = validationDataset.drop(['postText', 'truthClass'], axis=1)
testDataset = testDataset.drop(['postText', 'truthClass'], axis=1)

Total amount of train 19538
Total amount of validation 2459
Total amount of test 18979
Labels: ['clickbait', 'no-clickbait']


In [8]:
print(testDataset.head())

                                                   x             y
0  Johnny Manziel on Browns' No. 1 pick Myles Gar...  no-clickbait
1  Fabio: California Is a 'Mess' Because of Liber...  no-clickbait
2            "He's been huge for us this year, man."     clickbait
3  New Bears quarterback Mitchell Trubisky was gr...  no-clickbait
4  It's not enough to let employees work flexible...  no-clickbait


In [9]:
class Trainer(Trainer):
    def evaluate(self, eval_dataset= None, ignore_keys=None):
        outputMetrics = super().evaluate(eval_dataset, ignore_keys)
#         print('outputMetrics: ', outputMetrics)
        return outputMetrics

In [10]:
class gn_dataset(Dataset):
    def __init__(self,data,labels,tokenizer):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer

    def processText(self, text):
        tokenized = self.tokenizer(text, truncation=True)

        return tokenized

    def __len__(self):
        return len(self.data.index)

    def __getitem__(self,i):
        row = self.data.iloc[i]
        x = self.processText(row['x']).data

        try:
            y = self.labels.index(row['y'])
        except:
            y = len(self.labels) - 1 

        x['labels'] = y
        return x

    def randomItem(self):
        return self.__getitem__(random.randint(0,self.__len__()))


In [11]:
all_computeMetrics = []
test_metrics = []

In [12]:
class Model:
    def __init__(self,modelPath = '',nLabels = 3, labels=None):
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") # put cpu for more extensive error desc...
        print('device: ', self.device)
        self.maxLength = 1024  # max len we have is 124 for postext

        self.nLabels = nLabels

        self.loadModel(modelPath) 
        print('** Model: ', self.loadModel)

        self.labels = labels

    def model_init(self, dropout = 0.1):
        config = AutoConfig.from_pretrained(
                    self.MODEL_PATH,
                    num_labels=self.nLabels,
                    return_dict = True,
                    hidden_dropout_prob = dropout
                )

        print('configconfig', config)

        #return AutoModel.from_pretrained(self.MODEL_PATH,config=config).to(self.device)
        return AutoModelForSequenceClassification.from_pretrained(self.MODEL_PATH,config=config).to(self.device)

    def computeMetrics(self,evalPrediction):
        yPred = evalPrediction.predictions.argmax(1)
        yTrue = evalPrediction.label_ids

        metrics = {}
        
        metrics['accuracy'] = accuracy_score(yTrue, yPred)
        metrics['f1'] = f1_score(yTrue, yPred)

        # original paper: https://link.springer.com/chapter/10.1007/978-3-319-30671-1_72/tables/2 / ROC-AUC, Precision, Recall
        metrics['balanced_accuracy'] = balanced_accuracy_score(yTrue, yPred) # deal with imbalanced datasets
        metrics['f1_macro'] = f1_score(yTrue, yPred, average='macro') # Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
        metrics['mean_squared_error'] = mean_squared_error(yTrue, yPred)
        if 'click' in self.labels[0]:
            indexes = [i for i,x in enumerate(self.labels) if x == 'clickbait']
            metrics['f1_binary'] = f1_score(yTrue, yPred, average='binary',pos_label=indexes[0])
        metrics['confusion_matrix'] = str(confusion_matrix(yTrue, yPred))
        
        tn, fp, fn, tp = confusion_matrix(yTrue, yPred).ravel()
        recall = recall_score(yTrue, yPred) 
        precision = precision_score(yTrue, yPred)
        metrics['precision'] = precision
        metrics['recall'] = recall
        metrics['log_loss'] = log_loss(yTrue, yPred)
        
        all_computeMetrics.append(metrics)
        
        return metrics

    def loadModel(self,modelPath):
        self.MODEL_PATH = modelPath
        self.MODEL = self.model_init()
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_PATH,model_max_length = self.maxLength, use_fast=True)
        
    def saveModel(self,saveDir):
        print('saving model at: ', saveDir)
        self.MODEL.save_pretrained(saveDir)
        self.TOKENIZER.save_pretrained(saveDir)

    def train_loop(self,saveDir,checkpointDir,trainingData,validationData,testData,labels):
        trainDataset = gn_dataset(trainingData,labels,self.TOKENIZER)
        validationDataset = gn_dataset(validationData,labels,self.TOKENIZER)
        testDataset = gn_dataset(testData,labels,self.TOKENIZER)
        checkpoints = {}

        # TRAIN FUNCTION
        runName = experiment + str(len(checkpoints))
        print('checkpointDir: ', checkpointDir)

        args = TrainingArguments(
                                output_dir=checkpointDir,
                                save_strategy='no',
                                do_train=True, # DEFAULT: False
                                do_eval=True, # DEFAULT: False
                                do_predict=True, # DEFAULT: False
#                                 save_steps = 1200, # DEFAULT: 500
                                eval_steps = 400, #200, # DEFAULT: 500
                                evaluation_strategy = 'steps', # DEFAULT: "no"
                                logging_first_step = True, # DEFAULT: False
                                dataloader_num_workers = 12, # 6, DEFAULT: 0
                                learning_rate = 2e-5,
                                num_train_epochs = 3,
                                per_device_train_batch_size = 32,
                                per_device_eval_batch_size = 32, #16,
                                weight_decay = 0.05 ,
                                warmup_steps = 0, # DEFAULT: 0
                                logging_dir=checkpointDir+"/logs",
                                metric_for_best_model = 'f1_binary', # DEFAULT: None
                                greater_is_better = True, # DEFAULT: None
                                
        )
        model = self.model_init(dropout=0.3)

        trainer = Trainer(
                    model,
                    args = args,
                    train_dataset = trainDataset,#train
                    tokenizer = self.TOKENIZER,
                    eval_dataset = validationDataset,#dev
                    compute_metrics = self.computeMetrics,
                )

        trainer.train()

        cp = checkpointDir + '/CheckPointModel_'+ runName

        checkpoints[cp] = trainer.evaluate(testDataset) # on test set
        print('on test set:', checkpoints[cp]) 
        test_metrics.append(checkpoints[cp])
#         trainer.predict(testDataset)

#         trainer.save_model(cp)
        
    def predict(self,text):
        dataset = gn_dataset(None,None,self.TOKENIZER)
        batchEncoding = dataset.processText(text).to(self.device)

        self.MODEL.eval()
        out = self.MODEL(batchEncoding.input_ids,attention_mask = batchEncoding.attention_mask,token_type_ids = batchEncoding.token_type_ids,return_dict = True)

        return out.logits.argmax().item()

In [13]:
def train(name,root,nLabels,labels,experiment,trainDataset,validationDataset,testDataset): 
    print('name experiment: ', name, experiment)
    model = Model(name,nLabels,labels)
    print('model model: ', model)
    analysis = model.train_loop(root+"SavedModel/"+experiment,root+name.split('/')[-1]+"TrainingCheckpoints",trainDataset,validationDataset,testDataset,labels)
    return True

In [14]:
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()

launch = {
'distilbert': 'distilbert-base-uncased',
'deberta': 'microsoft/deberta-base',
'electra': 'google/electra-base-discriminator',
}
 
experiment = list(launch.keys())[2]+"_clickbait"
model = launch['electra']

train(model,root,nLabels,labels,experiment,trainDataset,validationDataset,testDataset) 
print('done')

name experiment:  google/electra-base-discriminator electra_clickbait
device:  cuda
configconfig ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

** Model:  <bound method Model.loadModel of <__main__.Model object at 0x7f533f0804a8>>
model model:  <__main__.Model object at 0x7f533f0804a8>
checkpointDir:  ./electra-base-discriminatorTrainingCheckpoints
configconfig ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.3,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

Step,Training Loss,Validation Loss,Loss,Accuracy,F1,Balanced Accuracy,F1 Macro,Mean Squared Error,F1 Binary,Confusion Matrix,Precision,Recall,Runtime,Samples Per Second,Steps Per Second
400,0.6861,7.514677,0.579633,0.782432,0.856299,0.686188,0.7043,0.217568,0.552301,[[ 330 432]  [ 103 1594]],0.786772,0.939305,2.0103,1223.227,38.304
800,0.412,7.570859,0.610022,0.780805,0.854599,0.687178,0.704756,0.219195,0.554913,[[ 336 426]  [ 113 1584]],0.78806,0.933412,2.0348,1208.497,37.842
1200,0.3401,7.444445,0.616129,0.784465,0.85722,0.690553,0.708875,0.215535,0.560531,[[ 338 424]  [ 106 1591]],0.789578,0.937537,2.0111,1222.731,38.288
1600,0.3268,7.289932,0.571203,0.788939,0.858622,0.703194,0.721211,0.211061,0.583801,[[ 364 398]  [ 121 1576]],0.798379,0.928698,1.9966,1231.594,38.566


***** Running Evaluation *****
  Num examples = 2459
  Batch size = 32
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using 

on test set: {'eval_loss': 0.3154461979866028, 'eval_accuracy': 0.8665893882712472, 'eval_f1': 0.9128099173553719, 'eval_balanced_accuracy': 0.8117711273385666, 'eval_f1_macro': 0.814444877942095, 'eval_mean_squared_error': 0.13341061172875282, 'eval_f1_binary': 0.7160798385288181, 'eval_confusion_matrix': '[[ 3193  1322]\n [ 1210 13254]]', 'eval_precision': 0.9093029637760702, 'eval_recall': 0.9163440265486725, 'eval_log_loss': 4.607894983914673, 'eval_runtime': 11.4341, 'eval_samples_per_second': 1659.856, 'eval_steps_per_second': 51.95, 'epoch': 3.0}
done


In [15]:
print(all_computeMetrics)

[{'accuracy': 0.7824318828792192, 'f1': 0.8562986838571045, 'balanced_accuracy': 0.6861877607078726, 'f1_macro': 0.704299969543615, 'mean_squared_error': 0.2175681171207808, 'f1_binary': 0.5523012552301255, 'confusion_matrix': '[[ 330  432]\n [ 103 1594]]', 'precision': 0.786771964461994, 'recall': 0.939304655274013, 'log_loss': 7.514677022109419}, {'accuracy': 0.7808052053680358, 'f1': 0.8545994065281899, 'balanced_accuracy': 0.6871783926243162, 'f1_macro': 0.7047563506629388, 'mean_squared_error': 0.2191947946319642, 'f1_binary': 0.5549132947976878, 'confusion_matrix': '[[ 336  426]\n [ 113 1584]]', 'precision': 0.7880597014925373, 'recall': 0.9334119033588686, 'log_loss': 7.570858521904077}, {'accuracy': 0.7844652297681984, 'f1': 0.8572198275862069, 'balanced_accuracy': 0.690553191752622, 'f1_macro': 0.708875253759936, 'mean_squared_error': 0.21553477023180154, 'f1_binary': 0.5605306799336651, 'confusion_matrix': '[[ 338  424]\n [ 106 1591]]', 'precision': 0.7895781637717122, 'recal

In [16]:
print(test_metrics)

[{'eval_loss': 0.3154461979866028, 'eval_accuracy': 0.8665893882712472, 'eval_f1': 0.9128099173553719, 'eval_balanced_accuracy': 0.8117711273385666, 'eval_f1_macro': 0.814444877942095, 'eval_mean_squared_error': 0.13341061172875282, 'eval_f1_binary': 0.7160798385288181, 'eval_confusion_matrix': '[[ 3193  1322]\n [ 1210 13254]]', 'eval_precision': 0.9093029637760702, 'eval_recall': 0.9163440265486725, 'eval_log_loss': 4.607894983914673, 'eval_runtime': 11.4341, 'eval_samples_per_second': 1659.856, 'eval_steps_per_second': 51.95, 'epoch': 3.0}]
