# Text Embedding with VertexAI

In [6]:
import vertexai_start

In [7]:
from vertexai.language_models import TextEmbeddingModel

In [8]:
en_model = TextEmbeddingModel.from_pretrained("gemini-embedding-001") # text-multilingual-embedding-002	for multilingual in/ouputs 
embed_1 = en_model.get_embeddings(["What is life?"])
embed_2 = en_model.get_embeddings(["What is the purpose of life?"])

In [9]:
embed_1[0].values
embed_2[0].values

[-0.024127071723341942,
 -0.0036823658738285303,
 0.019122881814837456,
 -0.045203179121017456,
 -0.0066692098043859005,
 -0.00686145294457674,
 0.006011000834405422,
 0.032163891941308975,
 0.03766962140798569,
 -0.011483423411846161,
 0.010055312886834145,
 -0.0032033564057201147,
 -0.02461911551654339,
 0.05870945751667023,
 0.10891919583082199,
 0.016245005652308464,
 -0.004418294411152601,
 0.0016109909629449248,
 0.0017815462779253721,
 0.00945871789008379,
 0.004649253096431494,
 -0.018503699451684952,
 0.003331894287839532,
 0.003537842771038413,
 0.0004763447504956275,
 -0.003904040204361081,
 -0.0021302548702806234,
 0.005551616661250591,
 0.030776960775256157,
 0.01019053440541029,
 0.00938574131578207,
 -0.010028674267232418,
 -0.004991940688341856,
 0.043587666004896164,
 -0.010862119495868683,
 -0.013059147633612156,
 0.005009833723306656,
 -0.010333787649869919,
 -0.018585698679089546,
 -0.01799461990594864,
 -0.034618884325027466,
 0.0020425959955900908,
 0.006860306486

In [10]:
import numpy as np

In [11]:
np.dot(embed_1[0].values,embed_2[0].values) # Cosine similarity

np.float64(0.6921740630891202)

In [12]:
embed_combined = en_model.get_embeddings(["banana","apple"])
print(len(embed_combined))
np.dot(embed_combined[0].values,embed_combined[1].values) # Cosine similarity

2


np.float64(0.7256425928902285)

# Task type

# Practical Example

use embedding to match questions with answers in a knowledge base.

This approach is the foundation for building more complex question-answering systems using embeddings like ChatGPT.

In [22]:
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
en_model = TextEmbeddingModel.from_pretrained("gemini-embedding-001") # text-multilingual-embedding-002	for multilingual in/ouputs 
question = TextEmbeddingInput("Who is Peter parker?",task_type="QUESTION_ANSWERING")
question_output = en_model.get_embeddings([question])



In [23]:
# Create a knowledge base (FAQ)
knowledge_base = [
    "Peter Parker is Spider-Man, a superhero from Marvel Comics",
    "Spider-Man has spider powers like wall crawling and web shooting",
    "Peter Parker works as a photographer for the Daily Bugle",
    "Spider-Man's real identity is Peter Benjamin Parker",
    "Peter Parker was bitten by a radioactive spider"
]

# Get embeddings for each answer in the knowledge base
kb_inputs = [TextEmbeddingInput(text, task_type="RETRIEVAL_DOCUMENT") for text in knowledge_base]
kb_embeddings = en_model.get_embeddings(kb_inputs)

In [24]:
# Compare question with knowledge base
question_embedding = question_output[0].values  # "Who is Peter Parker?" embedding

similarities = []
for i, kb_embed in enumerate(kb_embeddings):
    similarity = np.dot(question_embedding, kb_embed.values)
    similarities.append((similarity, knowledge_base[i]))

# Sort by similarity (highest first)
similarities.sort(reverse=True)

print("Question: Who is Peter Parker?")
print(f"Best answer: {similarities[0][1]} (similarity: {similarities[0][0]:.4f})")
print("\nAll matches:")
for sim, answer in similarities:
    print(f"  {sim:.4f}: {answer}")

Question: Who is Peter Parker?
Best answer: Peter Parker is Spider-Man, a superhero from Marvel Comics (similarity: 0.7443)

All matches:
  0.7443: Peter Parker is Spider-Man, a superhero from Marvel Comics
  0.7183: Spider-Man's real identity is Peter Benjamin Parker
  0.7141: Peter Parker was bitten by a radioactive spider
  0.6961: Peter Parker works as a photographer for the Daily Bugle
  0.6779: Spider-Man has spider powers like wall crawling and web shooting


## Key Insight: Foundation of Modern AI

**This is the foundation for Chatbots (LLMs)**, which compares a **knowledge base** with a question to see which response is most accurate based on their **similarity**. 

**Why we convert text to vectors (arrays):**
- Computers can't directly compare "meanings" of words
- But they CAN compute mathematical similarity between numbers
- Vectors capture semantic meaning in mathematical space
- Similar meanings → similar vectors → high similarity scores

**The Process:**
1. **Question** → Vector (embedding)
2. **Knowledge Base** → Multiple vectors (one per fact/answer)
3. **Compare** → Calculate similarity scores
4. **Return** → Most similar answer

This same principle powers ChatGPT, Claude, and all modern AI assistants!