# Dependencies

In [None]:
!pip install transformers[torch]
!pip install datasets
!pip install evaluate

Collecting transformers[torch]
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m101.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.2 MB/s

# Datasets

In [None]:
import torch
from datasets import load_dataset, concatenate_datasets

# Three public clickbait corpora from the Hugging Face Hub.
cb_dataset1 = load_dataset("marksverdhei/clickbait_title_classification")
cb_dataset2 = load_dataset("SotirisLegkas/clickbait")
cb_dataset3 = load_dataset("evoreign/clickbait_headline")

# The rest of the notebook reads the columns "text" and "label", so the
# corpora that use other column names are renamed to that schema.
cb_dataset1["train"] = cb_dataset1["train"].rename_columns(
    {"title": "text", "clickbait": "label"}
)
cb_dataset3["train"] = cb_dataset3["train"].rename_columns(
    {"headline": "text", "clickbait": "label"}
)

# Pool every available split into one dataset; it is re-split further down.
all_splits = [
    cb_dataset1['train'],
    cb_dataset2['train'],
    cb_dataset2['test'],
    cb_dataset2['validation'],
    cb_dataset3['train'],
]
dataset = concatenate_datasets(all_splits)

In [None]:
from datasets import DatasetDict, Dataset
# Split proportions; validation receives whatever train and test leave over.
train_ratio = 0.75
test_ratio = 0.15
validation_ratio = 1 - train_ratio - test_ratio
split_seed = 42  # fixed so the split is reproducible across runs

# The pooled dataset is ordered source by source (it was built with
# concatenate_datasets), so slicing it directly would give each split a
# different mix of source corpora. Shuffle first, then cut contiguous ranges.
shuffled_dataset = dataset.shuffle(seed=split_seed)

train_size = int(train_ratio * len(shuffled_dataset))
test_size = int(test_ratio * len(shuffled_dataset))

# select() keeps the rows as Dataset objects (schema preserved) instead of
# materialising Python dicts of lists and rebuilding them with from_dict.
train_dataset = shuffled_dataset.select(range(train_size))
test_dataset = shuffled_dataset.select(range(train_size, train_size + test_size))
validation_dataset = shuffled_dataset.select(
    range(train_size + test_size, len(shuffled_dataset))
)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset,
})

# Tokenization

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding, BertForSequenceClassification

# Single source of truth for the pretrained checkpoint, so the model and the
# tokenizer can never drift apart.
checkpoint = "bert-base-cased"
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "text" field of a batch, truncating to 128 tokens.

    Padding is intentionally not applied here; the DataCollatorWithPadding
    created below pads each batch when it is collated.
    """
    return tokenizer(example["text"], truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Keep `dataset` pointing at the tokenized splits, as the original cell did.
dataset = tokenized_datasets

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluation runs once per epoch. The former `eval_steps=500` was dropped
# because it only applies to step-based evaluation and had no effect here.
# NOTE(review): in the recorded run the training loss stays around 0.69 for
# all six epochs, which suggests the model is not learning — revisit the
# learning rate and the label columns before trusting this configuration.
training_args = TrainingArguments(
    output_dir="./checkpoints",
    evaluation_strategy="epoch",
    save_steps=20000,       # checkpoint every 20k optimizer steps
    save_total_limit=5,     # keep at most five checkpoints on disk
    num_train_epochs=6,
    learning_rate=5e-5,
)

In [None]:
# Wire the model, hyperparameters and tokenized splits together. The
# validation split is used for the per-epoch evaluation; the test split is
# held back for the final evaluation further down.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6976,0.586369
2,0.6878,0.505155
3,0.6875,0.659749
4,0.6844,0.48656
5,0.6865,0.60855
6,0.6822,0.548295


TrainOutput(global_step=66798, training_loss=0.6898534161383715, metrics={'train_runtime': 5622.1055, 'train_samples_per_second': 95.051, 'train_steps_per_second': 11.881, 'total_flos': 6676981585201440.0, 'train_loss': 0.6898534161383715, 'epoch': 6.0})

# Save the model

In [None]:
import os

import torch

model_path = 'model/clickbert.pth'
model_path_bin = 'model/clickbert.bin'
tokenizer_path = 'model/tokenizer'

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The same weights are written under two file names (.pth and .bin).
torch.save(model.state_dict(), model_path)
torch.save(model.state_dict(), model_path_bin)
tokenizer.save_pretrained(tokenizer_path)

# config.json is what BertConfig.from_pretrained(tokenizer_path) reads later,
# so it has to contain the *model* configuration. Previously a copy of
# tokenizer_config.json was placed there, which does not describe the model.
model.config.save_pretrained(tokenizer_path)

NameError: ignored

# Convert model

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from collections import OrderedDict
import torch

# Raw state dict produced by the "Save the model" cell.
pytorch_model_path = 'model/clickbert.pth'

# Map every tensor to the CPU so the conversion does not need a GPU.
cpu_device = torch.device('cpu')
model_state_dict = torch.load(pytorch_model_path, map_location=cpu_device)

# Build an untrained classifier with the bert-base-cased architecture; the
# weights are filled in from the state dict afterwards.
config = BertConfig.from_pretrained('bert-base-cased')
transformers_model = BertForSequenceClassification(config)

def remove_prefix(state_dict, prefix):
    """Return a copy of ``state_dict`` with ``prefix`` stripped from its keys.

    Only a leading occurrence of ``prefix`` is removed, and only once per key;
    keys that do not start with it are copied unchanged. The result is a new
    OrderedDict in the same iteration order; the input is not modified.
    """
    return OrderedDict(
        (name[len(prefix):] if name.startswith(prefix) else name, tensor)
        for name, tensor in state_dict.items()
    )

# The checkpoint was saved from a BertForSequenceClassification, so its keys
# ('bert.…', 'classifier.…') already match this model one-to-one. Stripping
# the 'bert.' prefix made every encoder key mismatch, and strict=False hid
# that: only the classifier head was loaded and the encoder stayed randomly
# initialised. Load the unmodified state dict strictly so any mismatch raises.
transformers_model.load_state_dict(model_state_dict)

transformers_model_path = 'model/tokenizer'

# Writes the weights and config.json next to the tokenizer files.
transformers_model.save_pretrained(transformers_model_path)



import torch

original_model_path = 'model/clickbert.pth'
bin_export_path = 'model/clickbert.bin'

# Re-read the saved weights with all tensors mapped to the CPU and write them
# back out under the .bin name.
model_state_dict = torch.load(original_model_path, map_location=torch.device('cpu'))
torch.save(model_state_dict, bin_export_path)

(…)bert-base-cased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Try the model

## Load the model

In [None]:
from transformers import BertModel, BertTokenizer, pipeline, BertConfig, BertForSequenceClassification
import os
from collections import OrderedDict

model_directory = 'model/'

# The model configuration is read from the folder that also holds the tokenizer.
config = BertConfig.from_pretrained(model_directory + "tokenizer")

# Architecture only; the weights are loaded from the state dict below.
model = BertForSequenceClassification(config)

# Join the directory and file name as separate components (the old call passed
# one pre-concatenated string, yielding 'model//clickbert.bin'), and map the
# tensors to the CPU so loading also works on a machine without a GPU.
model_state_dict = torch.load(
    os.path.join(model_directory, 'clickbert.bin'),
    map_location=torch.device('cpu'),
)

def remove_prefix(state_dict, prefix):
    """Return a copy of ``state_dict`` with ``prefix`` stripped from its keys.

    Only a leading occurrence of ``prefix`` is removed, and only once per key;
    keys that do not start with it are copied unchanged. The result is a new
    OrderedDict in the same iteration order; the input is not modified.
    """
    return OrderedDict(
        (name[len(prefix):] if name.startswith(prefix) else name, tensor)
        for name, tensor in state_dict.items()
    )

# The saved keys ('bert.…', 'classifier.…') already match
# BertForSequenceClassification. Stripping 'bert.' and loading with
# strict=False silently skipped every encoder weight, leaving the encoder
# randomly initialised. Load the state dict as saved, strictly, so that any
# key mismatch raises instead of being ignored.
model.load_state_dict(model_state_dict)

# A model built from its constructor starts in training mode; switch to eval
# mode so dropout is disabled during inference.
model.eval()

tokenizer_path = model_directory + "tokenizer"
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

## Try the text classification

In [None]:
# Inference pipeline around the model and tokenizer reloaded from disk.
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Note: `trainer` still holds the model object it was constructed with, so
# this evaluates that in-memory model on the test split — not the reloaded
# `model` used by `classifier`.
result = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

print(result)

In [None]:
# One sensational headline and one neutral headline as a quick sanity check.
input_text1 = "Shocking New Discovery! Scientists Uncover a Miracle Cure for Aging You Won't Believe!"
input_text2 = "Recent Advancements in Gerontology"

for headline in (input_text1, input_text2):
    print(classifier(headline))

[{'label': 'LABEL_1', 'score': 0.5785963535308838}]
[{'label': 'LABEL_1', 'score': 0.5441251993179321}]
