# **Fine-Tuning a Custom Transformer**

# 0. Installing Libraries and Importing Dependencies


In [1]:
%%capture

!pip install --upgrade datasets sentence_transformers huggingface_hub

In [25]:
# Authenticate against the Hugging Face Hub from the shell; the CLI prompts
# interactively for an access token (the typed token is not echoed).
# NOTE(review): no cell visible in this notebook pushes to the Hub or loads a
# gated repository -- confirm whether this login step is actually required.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [18]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, models
import torch
import torch.nn as nn
from datasets import load_dataset, Dataset
from sentence_transformers.losses import CosineSimilarityLoss, CoSENTLoss
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)

In [19]:
class Transformer(nn.Module):
    """Custom first module of a ``SentenceTransformer`` pipeline.

    Wraps a Hugging Face tokenizer/encoder pair (loaded with ``AutoTokenizer``
    and ``AutoModel``) and exposes ``tokenize``, ``forward`` and
    ``get_word_embedding_dimension``.
    """

    def __init__(self, model_name):
        """Load the tokenizer and the encoder identified by ``model_name``."""
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, inputs, task_type=None):
        """Run the encoder and attach its token embeddings to the feature dict.

        Args:
            inputs: Mapping of feature name to value, e.g. the output of
                ``tokenize``. It is updated in place.
            task_type: Accepted for interface compatibility; not used.

        Returns:
            The same ``inputs`` mapping, with ``'token_embeddings'`` set to the
            encoder's ``last_hidden_state``.
        """
        # Only forward the keys the tokenizer declares as encoder inputs. The
        # surrounding pipeline may add bookkeeping entries to the feature dict
        # (e.g. `prompt_length` when encoding with a prompt); passing those to
        # the encoder via ** would raise a TypeError. They stay in `inputs`
        # untouched so later modules can still read them.
        allowed = getattr(self.tokenizer, "model_input_names", None)
        if allowed is None:
            # Tokenizer gives no hint: keep the previous pass-everything behaviour.
            model_inputs = dict(inputs)
        else:
            model_inputs = {key: value for key, value in inputs.items() if key in allowed}

        inputs['token_embeddings'] = self.model(**model_inputs).last_hidden_state
        return inputs

    def tokenize(self, sentence):
        """Tokenize ``sentence`` into padded, truncated PyTorch tensors."""
        return self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    def get_word_embedding_dimension(self) -> int:
        """Return the encoder's hidden size, as reported by its config."""
        return self.model.config.hidden_size

In [28]:
# Assemble the embedding pipeline: custom encoder -> mean pooling -> Normalize module.
transformer = Transformer('bert-base-uncased')

# The pooling layer is sized from the encoder's hidden dimension.
embedding_dim = transformer.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode="mean")

normalize = models.Normalize()

# Modules run in list order.
model = SentenceTransformer(
    modules=[
        transformer,
        pooling,
        normalize,
    ]
)

In [33]:
# Load the English STS-B training split, expose the label under the name
# "score", and divide every score by 5.
train = (
    load_dataset("stsb_multi_mt", name="en", split="train")
    .rename_column("similarity_score", "score")
    .map(lambda example: {"score": example["score"] / 5})
)

In [34]:
# Objective: cosine-similarity loss bound to the model being fine-tuned.
loss = CosineSimilarityLoss(model=model)

# Hyper-parameters are gathered in one mapping and expanded into the
# training-arguments object.
training_options = {
    "output_dir": "models/finetuned",
    "report_to": "none",
    "learning_rate": 1e-4,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 16,
    "logging_steps": 100,
}
args = SentenceTransformerTrainingArguments(**training_options)

# Wire the model, data, objective and hyper-parameters together.
trainer = SentenceTransformerTrainer(
    args=args,
    model=model,
    train_dataset=train,
    loss=loss,
)

# Start fine-tuning.
trainer.train()

KeyboardInterrupt: 

In [5]:
# Sanity check: score one query against a few hand-written candidate sentences.
query = ['I am in love']
docs = ['I love walking', 'I love in am', 'I like spaghetti']

# Embed both sides with the model (query first, then the candidates).
query_emb, docs_emb = model.encode(query), model.encode(docs)

# Row 0 of the similarity matrix belongs to the single query.
similarities = model.similarity(query_emb, docs_emb)
for doc, sim in zip(docs, similarities[0]):
    print(f'{query} -> {doc}: {sim}')

['I am in love'] -> I love walking: 0.7280453443527222
['I am in love'] -> I love in am: 0.7389846444129944
['I am in love'] -> I like spaghetti: 0.6425665020942688
