In [1]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq sentence-transformers --progress-bar off

In [2]:
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [4]:
model= SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# The maximum number of tokens the model can take
model.max_seq_length

384

`max_seq_length`: The maximum length of the input sequence. Sequences longer than this will be truncated, and shorter ones will be padded.

`do_lower_case`: If set to True, the input text will be converted to lowercase before processing.

In [5]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [6]:
corpus=[
    "The advancements in artificial intelligence are reshaping the future of technology.",
    "Climate change is a pressing issue that affects ecosystems worldwide.",
    "Machine learning algorithms can analyze large datasets to uncover hidden patterns.",
    "The impact of global warming is evident in rising sea levels and extreme weather events.",
    "Artificial intelligence has the potential to revolutionize healthcare by improving diagnostics.",
    "Renewable energy sources, such as solar and wind, are essential for sustainable development.",
    "Deep learning models have achieved remarkable success in image and speech recognition.",
    "The transition to electric vehicles is crucial for reducing carbon emissions and combating climate change."
]

In [7]:
embeddings= model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
embeddings.shape,embeddings[0].shape

(torch.Size([8, 768]), torch.Size([768]))

In [15]:
import numpy as np

sim= np.zeros((len(corpus), len(corpus)))

for i in range(len(corpus)):
  for j in range(len(corpus)):
    sim[i][j]= util.cos_sim(embeddings[i],embeddings[j])

In [16]:
sim

array([[1.        , 0.16297255, 0.45295593, 0.17532232, 0.55196786,
        0.1293166 , 0.43910599, 0.2264314 ],
       [0.16297255, 1.        , 0.10984908, 0.70988548, 0.05482854,
        0.3543101 , 0.04193312, 0.3509146 ],
       [0.45295593, 0.10984908, 1.00000012, 0.1758628 , 0.47038227,
        0.10410829, 0.35010752, 0.05787906],
       [0.17532232, 0.70988548, 0.1758628 , 1.        , 0.11514729,
        0.34582919, 0.0770978 , 0.3505438 ],
       [0.55196786, 0.05482854, 0.47038227, 0.11514729, 1.00000012,
        0.0205896 , 0.35690048, 0.06958929],
       [0.1293166 , 0.3543101 , 0.10410829, 0.34582919, 0.0205896 ,
        1.        , 0.04950111, 0.3946324 ],
       [0.43910599, 0.04193312, 0.35010752, 0.0770978 , 0.35690048,
        0.04950111, 1.00000024, 0.07097   ],
       [0.2264314 , 0.3509146 , 0.05787906, 0.3505438 , 0.06958929,
        0.3946324 , 0.07097   , 1.00000012]])

In [17]:
result= util.semantic_search(embeddings, embeddings)

In [18]:
result

[[{'corpus_id': 0, 'score': 0.9999999403953552},
  {'corpus_id': 4, 'score': 0.5519678592681885},
  {'corpus_id': 2, 'score': 0.452955961227417},
  {'corpus_id': 6, 'score': 0.4391060471534729},
  {'corpus_id': 7, 'score': 0.22643138468265533},
  {'corpus_id': 3, 'score': 0.17532232403755188},
  {'corpus_id': 1, 'score': 0.1629725694656372},
  {'corpus_id': 5, 'score': 0.12931658327579498}],
 [{'corpus_id': 1, 'score': 1.0},
  {'corpus_id': 3, 'score': 0.7098854780197144},
  {'corpus_id': 5, 'score': 0.35431015491485596},
  {'corpus_id': 7, 'score': 0.3509146571159363},
  {'corpus_id': 0, 'score': 0.1629725694656372},
  {'corpus_id': 2, 'score': 0.10984910279512405},
  {'corpus_id': 4, 'score': 0.05482853204011917},
  {'corpus_id': 6, 'score': 0.04193313047289848}],
 [{'corpus_id': 2, 'score': 1.0000001192092896},
  {'corpus_id': 4, 'score': 0.4703821837902069},
  {'corpus_id': 0, 'score': 0.452955961227417},
  {'corpus_id': 6, 'score': 0.35010749101638794},
  {'corpus_id': 3, 'score':

## Train our own model

In [1]:
from sentence_transformers import InputExample, losses, evaluation, SentenceTransformer, util, SentencesDataset
from torch.utils.data import DataLoader

  from tqdm.autonotebook import tqdm, trange


In [5]:
dataset= SentencesDataset([
    InputExample(
        texts=[
            "The advancements in artificial intelligence are reshaping the future of technology.",
            "Climate change is a pressing issue that affects ecosystems worldwide."
            ], label=0.9
        )
    ],model
)

In [6]:
dataloader= DataLoader(dataset, shuffle=True)
loss= losses.CosineSimilarityLoss(model=model)

In [7]:
save_path= "trained_model"

In [33]:
# !pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hDown

In [8]:
model.fit(train_objectives=[(dataloader, loss)], epochs=10, output_path=save_path)

Step,Training Loss


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [9]:
trained_model= SentenceTransformer(save_path)

In [10]:
embed1= trained_model.encode("The advancements in artificial intelligence are reshaping the future of technology.")
embed2= trained_model.encode("Climate change is a pressing issue that affects ecosystems worldwide.")

In [11]:
util.cos_sim(embed1,embed2)

tensor([[0.1642]])