<a href="https://colab.research.google.com/github/codeREXus/langchain-learnings/blob/main/langchain4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers==4.1.0 | tail -n 1

Successfully installed sentence-transformers-4.1.0


In [2]:
import math
import numpy as np
import scipy
import torch
from sentence_transformers import SentenceTransformer

In [3]:
documents = [
    'Bugs introduced by the intern had to be squashed by the lead developer.',
    'Bugs found by the quality assurance engineer were difficult to debug.',
    'Bugs are common throughout the warm summer months, according to the entomologist.',
    'Bugs, in particular spiders, are extensively studied by arachnologists.'
]

In [None]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [5]:
embed = model.encode(documents)

In [6]:
embed.shape

(4, 384)

#Eucledian distance


In [7]:
def eucledian_distance(x,y):
  sqsum=sum((x-y)**2 for x,y in zip(x,y))
  return math.sqrt(sqsum)

In [8]:
eucledian_distance(embed[0],embed[3])

7.155784092025114

In [9]:
l2_dist_manual = np.zeros([4,4])
for i in range(embed.shape[0]):
    for j in range(embed.shape[0]):
         if j > i:
            l2_dist_manual[i,j] = eucledian_distance(embed[i], embed[j])
         elif i > j:
            l2_dist_manual[i,j] = l2_dist_manual[j,i]

l2_dist_manual

array([[0.        , 5.96178921, 7.33939755, 7.15578409],
       [5.96178921, 0.        , 7.7686165 , 7.39359022],
       [7.33939755, 7.7686165 , 0.        , 5.91992832],
       [7.15578409, 7.39359022, 5.91992832, 0.        ]])

In [10]:
l2_dist_scipy = scipy.spatial.distance.cdist(embed, embed, 'euclidean')
l2_dist_scipy

array([[0.        , 5.9617894 , 7.33940012, 7.15578259],
       [5.9617894 , 0.        , 7.768616  , 7.39359112],
       [7.33940012, 7.768616  , 0.        , 5.919928  ],
       [7.15578259, 7.39359112, 5.919928  , 0.        ]])

In [11]:
np.allclose(l2_dist_manual, l2_dist_scipy)

True

#dot proouct

In [12]:
def dot_product(x,y):
  return sum(x*y for x,y in zip(x,y))

In [27]:
mat=np.dot(embed[0],embed[1])
dot_dist=-mat
print(dot_dist)

-18.535406


In [30]:
mat=np.matmul(embed[1],embed[0])
dot_dist=-mat
print(dot_dist)

-18.535406


In [20]:
print(dot_product(embed[0],embed[3]))

7.8309326


In [24]:
dot_product_manual=np.zeros([4,4])
for i in range(embed.shape[0]):
  for j in range(embed.shape[0]):
    dot_product_manual[i,j]=dot_product(embed[i],embed[j])
dot_product_manual

array([[33.74442291, 18.5354023 ,  8.56981659,  7.83093262],
       [18.5354023 , 38.86933136,  7.88997459,  8.66340351],
       [ 8.56981659,  7.88997459, 37.26202011, 17.66957092],
       [ 7.83093262,  8.66340351, 17.66957092, 33.12267685]])

#Cosine Similarity

In [31]:
l2_norms = np.sqrt(np.sum(embed**2, axis=1))
l2_norms

array([5.808995 , 6.234527 , 6.1042614, 5.7552304], dtype=float32)

In [32]:
l2_norms_reshaped = l2_norms.reshape(-1,1)
l2_norms_reshaped

array([[5.808995 ],
       [6.234527 ],
       [6.1042614],
       [5.7552304]], dtype=float32)

In [35]:
normalized_embeddings_manual = embed/l2_norms_reshaped
normalized_embeddings_manual

array([[-0.03925702, -0.04243019, -0.00054966, ...,  0.07837524,
         0.10917504,  0.0925298 ],
       [-0.05740863, -0.0514618 ,  0.02560462, ..., -0.01130905,
         0.14876868,  0.0551402 ],
       [ 0.03326025, -0.04406527,  0.02667829, ..., -0.03219229,
        -0.00553689,  0.09757369],
       [-0.00740943, -0.07944357, -0.01655276, ..., -0.10083128,
         0.02996996,  0.01586011]], dtype=float32)

In [38]:
normalized_embeddings_torch = torch.nn.functional.normalize(
    torch.from_numpy(embed)
).numpy()
normalized_embeddings_torch

array([[-0.03925702, -0.04243019, -0.00054966, ...,  0.07837524,
         0.10917504,  0.0925298 ],
       [-0.05740864, -0.0514618 ,  0.02560462, ..., -0.01130905,
         0.1487687 ,  0.05514021],
       [ 0.03326024, -0.04406527,  0.02667829, ..., -0.03219229,
        -0.00553689,  0.09757368],
       [-0.00740943, -0.07944357, -0.01655276, ..., -0.10083128,
         0.02996996,  0.01586011]], dtype=float32)

In [39]:
np.allclose(normalized_embeddings_manual, normalized_embeddings_torch)

True

In [42]:
dot_product(normalized_embeddings_manual[0], normalized_embeddings_manual[3])

np.float32(0.23423398)

In [51]:
cosine_similarity_manual = np.empty([4,4])
for i in range(normalized_embeddings_manual.shape[0]):
    for j in range(normalized_embeddings_manual.shape[0]):
        cosine_similarity_manual[i,j] = dot_product(
            normalized_embeddings_manual[i],
            normalized_embeddings_manual[j]
        )

cosine_similarity_manual

array([[0.99999994, 0.51179701, 0.24167825, 0.23423398],
       [0.51179701, 1.        , 0.20731877, 0.2414474 ],
       [0.24167825, 0.20731877, 1.00000072, 0.50295597],
       [0.23423398, 0.2414474 , 0.50295597, 0.99999994]])