# Study notes: understanding vector embeddings through text embeddings

In [1]:
# prepare packages
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, nltk
Successfully installed joblib-1.4.2 nltk-3.9.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import collections
import os

import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Directory that receives the NLTK resources downloaded by this notebook.
NLTK_DATA_DIR = "./"

# word_tokenize needs the "punkt_tab" resource; without it NLTK raises
# "Resource punkt_tab not found. Try using the NLTK Downloader".
# Fetch it once into NLTK_DATA_DIR and skip the download on later runs.
if not os.path.exists(os.path.join(NLTK_DATA_DIR, "tokenizers", "punkt_tab.zip")):
    nltk.download("punkt_tab", download_dir=NLTK_DATA_DIR)

# A custom download_dir is only searched when it is listed in nltk.data.path,
# so register it explicitly (once, so re-running the cell stays idempotent).
nltk_data_dir_abs = os.path.abspath(NLTK_DATA_DIR)
if nltk_data_dir_abs not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir_abs)

text = "We are lucky to live in an age in which we are still making discoveries"

# Tokenization: split the text into words.
words = word_tokenize(text)
print(words)
# ['We', 'are', 'lucky', 'to', 'live', 'in', 'an', 'age', 'in', 'which',
#  'we', 'are', 'still', 'making', 'discoveries']

# Stemming: reduce every token to its stem (this also lowercases, 'We' -> 'we').
stemmer = SnowballStemmer(language="english")
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)
# ['we', 'are', 'lucki', 'to', 'live', 'in', 'an', 'age', 'in', 'which',
#  'we', 'are', 'still', 'make', 'discoveri']

# Bag of words: count how often each stem occurs.
bag_of_words = collections.Counter(stemmed_words)
print(bag_of_words)
# Counter({'we': 2, 'are': 2, 'in': 2, 'lucki': 1, 'to': 1, 'live': 1,
#          'an': 1, 'age': 1, 'which': 1, 'still': 1, 'make': 1, 'discoveri': 1})

['We', 'are', 'lucky', 'to', 'live', 'in', 'an', 'age', 'in', 'which', 'we', 'are', 'still', 'making', 'discoveries']
['we', 'are', 'lucki', 'to', 'live', 'in', 'an', 'age', 'in', 'which', 'we', 'are', 'still', 'make', 'discoveri']
Counter({'we': 2, 'are': 2, 'in': 2, 'lucki': 1, 'to': 1, 'live': 1, 'an': 1, 'age': 1, 'which': 1, 'still': 1, 'make': 1, 'discoveri': 1})


In [14]:
from dotenv import find_dotenv, load_dotenv

# Load the variables from the project's .env file into the process environment.
# Kept ahead of the dashscope import -- presumably so the API key is already in
# the environment when the SDK looks it up; confirm before reordering.
_ = load_dotenv(find_dotenv("./env/.env"))

from http import HTTPStatus
from pprint import pprint

import dashscope

# Request a 1536-dimensional embedding for one sentence.
resp = dashscope.TextEmbedding.call(
    input="We are lucky to live in an age in which we are still making discoveries.",
    model=dashscope.TextEmbedding.Models.text_embedding_v2,
    dimension=1536,
)

# Pretty-print the payload on success; otherwise show the whole response so the
# failure can be inspected.
if resp.status_code == HTTPStatus.OK:
    pprint(resp['output'])
else:
    print(resp)

{'embeddings': [{'embedding': [0.022378576171554372,
                               -0.027432455162420308,
                               -0.00355793080956962,
                               -0.030121118785560987,
                               -0.014848296475164124,
                               -0.0009172790368421677,
                               0.012796421604872554,
                               0.011088210505959867,
                               0.02603758456094131,
                               -0.03703482524506559,
                               -0.0411587905016122,
                               0.04172482494858918,
                               -0.01638467568838737,
                               0.02468314499138924,
                               0.05433930690979056,
                               0.022338145139627445,
                               -0.06375973734876467,
                               0.004930058955589722,
                               -0.004576287426

In [18]:
# Euclidean distance (L2): straight-line distance between two vectors.
import numpy as np

vector2 = [1, 4]
vector1 = [2, 2]

# By hand: square root of the summed squared component differences.
l2_manual = sum((x - y) ** 2 for x, y in zip(vector1, vector2)) ** 0.5
# 2.2361

# Same quantity via NumPy: the 2-norm of the difference vector.
l2_numpy = np.linalg.norm(np.array(vector1) - np.array(vector2), ord=2)
# 2.2361

# Only the last expression of a cell is displayed, so verify the hand-rolled
# value against NumPy explicitly rather than relying on it being shown.
assert np.isclose(l2_manual, l2_numpy)

l2_numpy

2.23606797749979

In [19]:
# Manhattan distance (L1): sum of the absolute component differences.
# Uses vector1 / vector2 defined in the Euclidean-distance cell.
import numpy as np

# By hand.
l1_manual = sum(abs(x - y) for x, y in zip(vector1, vector2))
# 3

# Same quantity via NumPy: the 1-norm of the difference vector.
l1_numpy = np.linalg.norm(np.array(vector1) - np.array(vector2), ord=1)
# 3.0

# Only the last expression of a cell is displayed, so confirm explicitly that
# the hand-rolled value matches NumPy's.
assert np.isclose(l1_manual, l1_numpy)

l1_numpy

3.0

In [None]:
# Dot product: sum of the element-wise products of the two vectors.
# Uses vector1 / vector2 defined in the Euclidean-distance cell.
import numpy as np  # imported here so the cell does not depend on an earlier cell's import

# By hand.
dot_manual = sum(x * y for x, y in zip(vector1, vector2))
# 10

# Same quantity via NumPy.
dot_numpy = np.dot(vector1, vector2)
# 10

# Only the last expression of a cell is displayed, so check that both
# approaches agree instead of relying on the first one being shown.
assert dot_manual == dot_numpy

dot_numpy

10

In [22]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp310-cp310-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.14.1-cp310-cp310-macosx_14_0_arm64.whl (23.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.5.2 scipy-1.14.1 threadpoolctl-3.5.0

[1m[[0m[34;49mnotice

In [25]:
import math

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity by hand: dot product divided by the product of the L2 norms.
# vector1 / vector2 come from the Euclidean-distance cell.
dot_product = sum(x * y for x, y in zip(vector1, vector2))
norm_vector1 = sum(x**2 for x in vector1) ** 0.5
norm_vector2 = sum(x**2 for x in vector2) ** 0.5
cos_sim_manual = dot_product / norm_vector1 / norm_vector2
# 0.8575

# cosine_similarity expects 2D arrays, hence the reshape of each vector to a
# single-row matrix; [0][0] extracts the one resulting similarity value.
cos_sim = cosine_similarity(
    np.array(vector1).reshape(1, -1), np.array(vector2).reshape(1, -1)
)[0][0]
print(cos_sim)
# 0.8575

# Only the last expression of a cell is displayed, so verify the hand-rolled
# value against scikit-learn explicitly.
assert np.isclose(cos_sim_manual, cos_sim)

# Angle between the vectors in degrees. Floating-point rounding can push a
# cosine a hair outside [-1, 1] (e.g. for parallel vectors), which would make
# math.acos raise ValueError, so clamp before taking the arc cosine.
math.degrees(math.acos(max(-1.0, min(1.0, cos_sim))))

0.8574929257125441


30.963756532073536

In [29]:
# PCA: project the embedding vectors onto their principal components.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Number of principal components to keep when enough samples are available.
TARGET_COMPONENTS = 2

# One row per embedding returned by the DashScope call in the earlier cell
# (`resp`); the "embedding" column of each row holds the vector.
df = pd.DataFrame(resp["output"]["embeddings"])

embeddings_array = np.array(df["embedding"].tolist())
print(embeddings_array.shape)
# (n_samples, 1536) -- (1, 1536) for the single sentence embedded above

# PCA cannot keep more components than min(n_samples, n_features), so cap the
# target; with a single embedding this falls back to one component.
n_components = min(TARGET_COMPONENTS, *embeddings_array.shape)
pca_model = PCA(n_components=n_components)
# NOTE: with a single sample scikit-learn emits a RuntimeWarning, because the
# explained variance is divided by (n_samples - 1) == 0. Embed several
# sentences to get a meaningful projection.
pca_model.fit(embeddings_array)

pca_embeddings_values = pca_model.transform(embeddings_array)
print(pca_embeddings_values.shape)
# (n_samples, n_components) -- (1, 1) for a single embedding

(1, 1536)
(1, 1)


  explained_variance_ = (S**2) / (n_samples - 1)


![alt text](image.png)
