# Fine-Tuning BERT

In this notebook we'll fine-tune the BERT model for crypto news sentiment analysis.

In [None]:
# Install the libraries imported by the cells below into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run picks up whatever is
# latest — consider pinning them for reproducibility.
%pip install transformers datasets evaluate

Log in to the Hugging Face Hub so the model can be saved there later:

In [None]:
from huggingface_hub import notebook_login 

# Authenticate this notebook session with the Hub; required because the
# training arguments below set push_to_hub=True.
notebook_login()

Load our crypto headlines dataset:

In [None]:
from datasets import load_dataset, Value, Features, ClassLabel

# The position of each name in this list is its integer class id.
class_names = ['negative', 'neutral', 'positive']

# Explicit schema: free text plus a ClassLabel built from class_names.
features = Features({
    'text': Value('string'),
    'label': ClassLabel(names=class_names),
})

# One CSV file per split.
data_files = {
    'train': 'data/crypto-news-train.csv',
    'validation': 'data/crypto-news-val.csv',
    'test': 'data/crypto-news-test.csv',
}


def lowercase_text(example):
    """Return the example's 'text' field converted to lower case."""
    return {'text': example['text'].lower()}


dataset = load_dataset('csv', data_files=data_files, features=features)
dataset = dataset.map(lowercase_text)

Build the tokenizer and tokenize the dataset:

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pretrained checkpoint shared by the tokenizer here and the model loaded
# in the training cell.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    """Tokenize the 'text' entries of a batch with truncation enabled.

    Args:
        batch: Mapping with a 'text' key holding the inputs to encode.

    Returns:
        The encoding returned by the notebook-level ``tokenizer``.
    """
    return tokenizer(batch['text'], truncation=True)

# Tokenize every split in batches, then expose the targets under the
# 'labels' column name.
tokenized_dataset = (
    dataset
    .map(tokenize, batched=True)
    .rename_column('label', 'labels')
)

# Collator that pads the tokenized examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Build and train the model:

In [None]:
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
)
import evaluate
import numpy as np

# Class-id <-> class-name mappings stored in the model config.
# Ids are integers (not strings): predictions are looked up by the integer
# argmax index, and the config convention is int ids on both sides.
# The order must match the `class_names` list used for the dataset's
# ClassLabel feature (negative=0, neutral=1, positive=2).
id2label = {
    0: 'negative',
    1: 'neutral',
    2: 'positive',
}
# Derived from id2label so the two mappings can never disagree.
label2id = {name: idx for idx, name in id2label.items()}

# Output directory for checkpoints; presumably also the Hub repository name
# because push_to_hub is enabled — confirm against the Hub settings.
repo_name = 'bert-finetuned-cryptos'

# Pretrained encoder with a three-way classification head and readable
# label names attached to its config.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and save a checkpoint at the end of every epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=repo_name,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    push_to_hub=True,
)

# Load the metric once up front instead of on every evaluation pass.
accuracy_metric = evaluate.load('accuracy')


def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair, as passed to the Trainer's
            ``compute_metrics`` callback.

    Returns:
        The dict produced by the accuracy metric's ``compute`` call.
    """
    logits, labels = eval_preds
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Wire the model, the train/validation splits, and the metric callback
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; evaluation and checkpointing follow the per-epoch
# strategies set in training_args.
trainer.train()

In [None]:
# Report metrics for the fine-tuned model; with no arguments this uses the
# eval_dataset given to the Trainer (the validation split).
trainer.evaluate()

Final push to the hub:

In [None]:
# Upload the final model state to the Hub (requires the earlier login).
trainer.push_to_hub()

Try the model:

In [None]:
from transformers import pipeline

# Name the task explicitly rather than relying on it being inferred from the
# checkpoint's Hub metadata.
classifier = pipeline('text-classification', model='flowfree/bert-finetuned-cryptos')

In [11]:
# Smoke-test the published model on a sample headline.
# NOTE(review): the recorded output below shows 'LABEL_1' rather than a class
# name, which suggests the published checkpoint's config lacks the id2label
# mapping — confirm after re-training and re-pushing.
classifier('Second Ethereum Testnet Successfully Simulates Shanghai Hard Fork')

[{'label': 'LABEL_1', 'score': 0.8704282641410828}]