<a href="https://colab.research.google.com/github/gupta24789/sentence-transformers/blob/main/02_train_sentence_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Train Sentence Transformers Models

In [1]:
!pip install -q datasets
!pip install -q sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, InputExample, losses

## Define Model

In [3]:
# Train model from scratch
## Step 1: use an existing language model
word_embedding_model = models.Transformer('distilroberta-base', max_seq_length=512, do_lower_case=True)
## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
## Join steps 1 and 2 using the modules argument
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Load Dataset

In [4]:
dataset_id = "embedding-data/QQP_triplets"
dataset = load_dataset(dataset_id)

Downloading readme:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['set'],
        num_rows: 101762
    })
})

In [6]:
dataset['train']['set'][0]

{'query': 'Why in India do we not have one on one political debate as in USA?',
 'pos': ['Why cant we have a public debate between politicians in India like the one in US?'],
 'neg': ['Can people on Quora stop India Pakistan debate? We are sick and tired seeing this everyday in bulk?',
  'Why do politicians, instead of having a decent debate on issues going in and around the world, end up fighting always?',
  'Can educated politicians make a difference in India?',
  'What are some unusual aspects about politics and government in India?',
  'What is debate?',
  'Why does civic public communication and discourse seem so hollow in modern India?',
  'What is a Parliamentary debate?',
  "Why do we always have two candidates at the U.S. presidential debate. yet the ballot has about 7 candidates? Isn't that a misrepresentation of democracy?",
  'Why is civic public communication and discourse so hollow in modern India?',
  "Aren't the Presidential debates teaching our whole country terrible c

In [7]:
## Convert the examples into InputExamples
train_examples = []
n_examples = 10000     ## considering 10000 samples only
train_data = dataset['train']['set']

## Here from one record, we are considering : anchor, first pos, and first neg
for i in range(n_examples):
  example = train_data[i]
  train_examples.append(InputExample(texts=[example['query'], example['pos'][0], example['neg'][0]]))

In [8]:
## DataLoaders
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

In [14]:
## loss function
train_loss = losses.TripletLoss(model=model, distance_metric= losses.TripletDistanceMetric.COSINE)

## Train Model

In [16]:
num_epochs = 5
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data##

## training
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          # evaluator = None,
          # evaluation_steps = None,
          checkpoint_path = "st_checkpoints",
          checkpoint_save_total_limit  = 3,
          save_best_model = True,
          output_path = "st_training"
          )

## save model to disk
model.save("models")

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/625 [00:00<?, ?it/s]

In [17]:
!ls st_checkpoints/

2500  3000  3125


In [18]:
!ls st_training

1_Pooling			   merges.txt	      sentence_bert_config.json  vocab.json
config.json			   model.safetensors  special_tokens_map.json
config_sentence_transformers.json  modules.json       tokenizer_config.json
eval				   README.md	      tokenizer.json


In [21]:
## Load the model
model = SentenceTransformer("models")

In [22]:
dataset['train']['set'][0]

{'query': 'Why in India do we not have one on one political debate as in USA?',
 'pos': ['Why cant we have a public debate between politicians in India like the one in US?'],
 'neg': ['Can people on Quora stop India Pakistan debate? We are sick and tired seeing this everyday in bulk?',
  'Why do politicians, instead of having a decent debate on issues going in and around the world, end up fighting always?',
  'Can educated politicians make a difference in India?',
  'What are some unusual aspects about politics and government in India?',
  'What is debate?',
  'Why does civic public communication and discourse seem so hollow in modern India?',
  'What is a Parliamentary debate?',
  "Why do we always have two candidates at the U.S. presidential debate. yet the ballot has about 7 candidates? Isn't that a misrepresentation of democracy?",
  'Why is civic public communication and discourse so hollow in modern India?',
  "Aren't the Presidential debates teaching our whole country terrible c

In [29]:
## Evaluate Sentence
query = ['Why in India do we not have one on one political debate as in USA?']

pos = [
    'Why cant we have a public debate between politicians in India like the one in US?'
]

neg = [
    'Can people on Quora stop India Pakistan debate? We are sick and tired seeing this everyday in bulk?'
]

In [30]:
query_emb = model.encode(query)
pos_emb = model.encode(pos)
neg_emb = model.encode(neg)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
cosine_similarity(query_emb, pos_emb)

array([[0.8406319]], dtype=float32)

In [33]:
cosine_similarity(query_emb, neg_emb)

array([[0.44891113]], dtype=float32)