Ref: https://realpython.com/chromadb-vector-database/

In [1]:
import spacy

In [2]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
nlp = spacy.load("en_core_web_md")

#### Check embeddings for simple words

In [4]:
# Check a simple embedding
dog_embedding = nlp.vocab["dog"].vector
print (type(dog_embedding), dog_embedding.shape)

<class 'numpy.ndarray'> (300,)


In [5]:
print (dog_embedding[0:10])

[  1.233     4.2963   -7.9738  -10.121     1.8207    1.4098   -4.518
  -5.2261   -0.29157   0.95234]


### Function to compute cosine similarity

In [6]:
import numpy as np

def compute_cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
    """Compute the cosine similarity between two vectors"""

    return (u @ v) / (np.linalg.norm(u) * np.linalg.norm(v))


In [7]:
dog_e = nlp.vocab["dog"].vector
cat_e = nlp.vocab["cat"].vector
truck_e = nlp.vocab["truck"].vector
airplane_e = nlp.vocab["airplane"].vector

In [8]:
print ('dog and cat similarity', compute_cosine_similarity(dog_e, cat_e))

dog and cat similarity 0.8220817


In [9]:
print ('dog and truck similarity', compute_cosine_similarity(dog_e, truck_e))

dog and truck similarity 0.25462714


In [10]:
print ('dog and truck similarity', compute_cosine_similarity(cat_e, airplane_e))

dog and truck similarity 0.15464622


In [11]:
print ('truck and airplane similarity', compute_cosine_similarity(truck_e, airplane_e))

truck and airplane similarity 0.5391841


In [12]:
kitten_e = nlp.vocab["kitten"].vector
print ('cat and kitten similarity', compute_cosine_similarity(cat_e, kitten_e))

cat and kitten similarity 1.0000001


### Similarities between sentences

In [13]:
!python -m pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [14]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [15]:
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
text = ["The canine barked loudly.",
        "The dog made a noisy bark.",
        "He ate a lot of pizza.",
        "He devoured a large quantity of pizza pie.",
        "The class was boring",
        "It was an intersting class"]

In [17]:
text_embeddings = model.encode(text)

In [18]:
for i in range(len(text)):
  for j in range(i+1, len(text)):
    print (text[i], text[j], compute_cosine_similarity(text_embeddings[i], text_embeddings[j]))

The canine barked loudly. The dog made a noisy bark. 0.7768617
The canine barked loudly. He ate a lot of pizza. 0.09128275
The canine barked loudly. He devoured a large quantity of pizza pie. 0.11757181
The canine barked loudly. The class was boring 0.07011989
The canine barked loudly. It was an intersting class 0.025663558
The dog made a noisy bark. He ate a lot of pizza. 0.14035322
The dog made a noisy bark. He devoured a large quantity of pizza pie. 0.13570152
The dog made a noisy bark. The class was boring 0.20429964
The dog made a noisy bark. It was an intersting class 0.13854001
He ate a lot of pizza. He devoured a large quantity of pizza pie. 0.78713405
He ate a lot of pizza. The class was boring 0.19097075
He ate a lot of pizza. It was an intersting class 0.16154131
He devoured a large quantity of pizza pie. The class was boring 0.07667441
He devoured a large quantity of pizza pie. It was an intersting class 0.07849484
The class was boring It was an intersting class 0.6986763


In [19]:
for i in range(len(text)):
  for j in range(i+1, len(text)):
    cos_sim = compute_cosine_similarity(text_embeddings[i], text_embeddings[j])
    if (cos_sim > 0.5):
      print (text[i], text[j], cos_sim)

The canine barked loudly. The dog made a noisy bark. 0.7768617
He ate a lot of pizza. He devoured a large quantity of pizza pie. 0.78713405
The class was boring It was an intersting class 0.6986763
