In [1]:
%pip install openai tiktoken pandas scikit-learn matplotlib --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m787.8/787.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.1 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.1 which is incompatible.
cudf-cu12 25.6.0 require

# Introduction to Vector Embeddings

In this notebook, we’ll cover the theory behind text embeddings, show you how to generate and use them with the OpenAI API, and then give you a hands-on exercise to try embedding search yourself.

## Theory: What Are Embeddings?
- An embedding is a high‑dimensional vector that captures the semantic meaning of text.
- Similar pieces of text have vectors that are close together in vector space (cosine similarity, dot product).
- Common use cases: search, clustering, recommendation, anomaly detection, classification.

## Getting Started: Install & Import
Make sure you have the OpenAI and tiktoken packages installed.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from openai import OpenAI
from sklearn.decomposition import PCA
import tiktoken

## 1. Generate an Embedding
Use `text-embedding-3-small` to turn text into a 1536‑dimensional vector.

In [8]:
# Initialize the client
# Note: In a real application, you would use an environment variable or secure method
# to store your API key. This is just for demonstration.
client = OpenAI(
    # Replace with your actual API key or use: api_key=os.environ.get("OPENAI_API_KEY")
    api_key="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"
)


In [9]:
# Hàm chuyển text thành vecto, sử dụng model='text-embedding-3-small'
def get_embedding(text, model='text-embedding-3-small'):
    text_clean = text.replace('\n', ' ') # Loại bỏ các ký tự k cần thiết: xuống dòng, khoảng trắng
    resp = client.embeddings.create(input=[text_clean], model=model) # tạo vecto
    return np.array(resp.data[0].embedding) # Lưu thành 1 mảng

sample = 'The quick brown fox jumps over the lazy dog.'
emb = get_embedding(sample)
print(f'Vector length: {len(emb)}')
print('First 5 dims:', emb[:5])

Vector length: 1536
First 5 dims: [-0.0183968  -0.0072255   0.00362544 -0.05420079 -0.02269785]


## 2. Count Tokens Before Embedding
Use `tiktoken` to estimate input size and control cost.

In [10]:
# Hàm đếm số token có trong câu
def num_tokens_from_string(s: str, encoding_name: str = 'cl100k_base') -> int:
    enc = tiktoken.get_encoding(encoding_name) # encoding thành các tokens
    return len(enc.encode(s))

print('Sample token count:', num_tokens_from_string(sample))

Sample token count: 10


## 3. Dimensionality Reduction & Visualization
Project a few sentence embeddings down to 2D with PCA.

# Ý tưởng

- Bạn có nhiều câu (sentences).

- Mỗi câu được chuyển thành một vector embedding (bằng hàm get_embedding(s)).

- Các vector này thường có số chiều lớn (ví dụ 768, 1024...).

- Dùng PCA để giảm toàn bộ xuống 2 chiều (2D).

- Trực quan hóa các câu trên mặt phẳng 2D — giúp “nhìn thấy” sự gần nhau/khác biệt giữa ý nghĩa các câu.

In [12]:
sentences = [
    'I love machine learning',
    'OpenAI creates powerful AI models',
    'The sky is clear today',
    'I enjoy hiking in the mountains',
    'This restaurant has great food'
]
vectors = np.vstack([get_embedding(s) for s in sentences]) # Chuyển các câu trên thành vecto Embedding

pca = PCA(n_components=2) # Giảm số chiều của vecto = 2
points = pca.fit_transform(vectors)

'''
plt.figure(figsize=(8,6))
plt.scatter(points[:,0], points[:,1])
for i, txt in enumerate(sentences):
    plt.annotate(txt, (points[i,0], points[i,1]))
plt.title('2D PCA of Sentence Embeddings')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
'''

"\nplt.figure(figsize=(8,6))\nplt.scatter(points[:,0], points[:,1])\nfor i, txt in enumerate(sentences):\n    plt.annotate(txt, (points[i,0], points[i,1]))\nplt.title('2D PCA of Sentence Embeddings')\nplt.xlabel('PC1')\nplt.ylabel('PC2')\nplt.show()\n"

---

## Exercise for You
1. Pick 5 of your own short sentences.
2. Embed them using the `get_embedding` function.
3. Compute pairwise cosine similarities and identify the two most similar sentences.
4. (Bonus) Visualize them in 2D with PCA as above.

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
sentence = ['Tôi thích đọc sách', 'Tôi thích nghe nhạc', 'Anh ấy đang chạy bộ', 'Mẹ tôi làm nghề giáo viên']
vectors = np.vstack([get_embedding(s) for s in sentence])
print(vectors)

similarity = cosine_similarity(vectors)
print(similarity)

import numpy as np

# Đặt giá trị trên đường chéo thành -1 để loại bỏ so sánh chính nó
np.fill_diagonal(similarity, -1)
# Tìm vị trí của giá trị lớn nhất
idx = np.unravel_index(np.argmax(similarity), similarity.shape) #argmax lấy giá trị max; unravel viết lại index, ví dụ index = 13 biến đổi về tọa độ gồm 2 biến
print(f"\nHai câu giống nhau nhất là:\n- \"{sentence[idx[0]]}\"\n- \"{sentence[idx[1]]}\"")
print(f"Cosine similarity: {similarity[idx]:.4f}")


[[ 0.01620053  0.0004295  -0.08704468 ...  0.00370571  0.00337373
  -0.00143892]
 [ 0.01901255 -0.02254858 -0.09381574 ...  0.00332463  0.00497948
  -0.01252983]
 [ 0.02310837  0.0010586  -0.03352727 ... -0.00077936 -0.0163608
  -0.01691928]
 [ 0.00689444 -0.00827906 -0.05735124 ... -0.01935195 -0.01881121
  -0.05830163]]
[[1.         0.60576539 0.37345729 0.36652316]
 [0.60576539 1.         0.34801537 0.32130442]
 [0.37345729 0.34801537 1.         0.3559856 ]
 [0.36652316 0.32130442 0.3559856  1.        ]]

Hai câu giống nhau nhất là:
- "Tôi thích đọc sách"
- "Tôi thích nghe nhạc"
Cosine similarity: 0.6058
