# Gemini Embedding

In [None]:
import os
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from dotenv import load_dotenv
load_dotenv()

genai.configure(api_key=os.getenv("GEMINI_API"))

result = genai.embed_content(
    model="text-embedding-004",
    content="سلام امروز چقدر راه رفتی",
    task_type="retrieval_document",  # retrieval_query or classification or...
    title="Embedding example"
)

embedding_values = result['embedding']
embedding_length = len(embedding_values)

print(f"length of embedding: {embedding_length}")
print(f"first 5: {embedding_values[:5]}")

length of embedding: 768
first 5: [-0.008003132, 0.03931392, -0.04013841, 0.009164641, 0.0716528]


# hugging face

In [10]:
from sentence_transformers import SentenceTransformer
sentences = ["اسم کتاب چی هست", "جمله ی دوم همینه"]

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
embeddings = model.encode(sentences)
print(embeddings)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[ 4.67720255e-03  2.98004746e-01 -3.63382339e-01  3.48445326e-01
  -1.07778840e-01  5.20727895e-02  1.45372659e-01  9.31090787e-02
   1.12428762e-01  8.02300945e-02  1.10947050e-01 -1.14201613e-01
  -6.33173138e-02  2.08913460e-02 -4.66325670e-01  1.11772060e-01
   7.98962042e-02  2.09235564e-01  2.91619182e-01  1.18358493e-01
   5.87779224e-01  4.55964178e-01  1.49813175e-01 -2.40006551e-01
   3.40767384e-01 -3.18013757e-01  6.90682158e-02 -2.65730053e-01
   6.75988570e-02 -2.06098393e-01 -7.28421882e-02  1.52260184e-01
   6.24511689e-02  2.48062447e-01  1.22039944e-01  3.65762740e-01
   5.66347949e-02 -1.95140895e-02  1.37698367e-01 -1.01545848e-01
  -1.07091390e-01  1.74193621e-01 -1.68828592e-01  9.45731401e-02
   4.60740812e-02  1.42683119e-01  5.00584953e-02 -2.17981279e-01
  -4.57825279e-03  5.53493649e-02  2.47613490e-02  8.03303197e-02
  -3.71452779e-01 -1.42236650e-01  2.47394219e-01  9.30766463e-02
   1.77098706e-01 -4.48250724e-03  6.04440086e-02 -1.39558852e-01
   3.02516

similarity search

In [None]:
import numpy as np
def vectors_similarity (A,B):
    """
    Cosine similarity: dot(A,B) / (||A|| * ||B||)
    check how close is A to B
    """
    
    dot_product = np.dot(A,B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    return dot_product/norm_a * norm_b

In [94]:
mydocument = ["خوبم تو چطوری",
              "لپتاپ منم ارتقا داده شده",
              "کامپیوترم را تعویض کردم",
              "این سیبه خیلی خوشمزه بود",
              "خیلی استرس دارم از دیروز",
              "باید چیزای خراب رو عوض کرد"]
myquery = "یک روز پیش حالت چطور بود"

In [None]:
mydocument = [
    "Artificial intelligence is transforming the tech industry",
    "Machine learning algorithms can predict stock market trends",
    "Deep learning requires large amounts of training data",
    "Natural language processing helps computers understand human language",
    "Computer vision enables machines to see and interpret images"
]

myquery = "What is computer vision?"

In [None]:
def find_similarities (query,docs,models,top=1):
    """chose between HF and GM"""
    # hugging face
    if models == "HF":
        query_embeddings = model.encode(query)
        docs_embeddings = model.encode(docs)
    # gemini
    elif models == "GM":
        
        result = genai.embed_content(
        model="text-embedding-004",
        content=query,
        task_type="retrieval_document",
        title="Embedding example"
        )
        query_embeddings = result['embedding']
    # 1 by 1 docs embedding
        docs_embeddings = []
        for doc in docs:
            doc_result = genai.embed_content(
                model="models/text-embedding-004",
                content=doc,
                task_type="retrieval_document"
            )
            docs_embeddings.append(doc_result['embedding'])
        
    else :
        print("only HF or GM are allowed")
        

    similarities = []
    for i, doc_emb in enumerate(docs_embeddings):
        # Cosine similarity: dot(A,B) / (||A|| * ||B||)
        cosine_sim = vectors_similarity(query_embeddings, doc_emb)
        similarities.append((docs[i],cosine_sim))
    
    similarities.sort(key=lambda x: x[1],reverse=True)
    top_results = similarities[:top]
    # results
    print(f"\nTop {top} most similar documents using {models}:\nquery : {myquery}\ndocs : {mydocument}")
    print("-" * 50)
    
    return top_results
            

In [99]:
final_resault = find_similarities(myquery,mydocument,"HF",2)
print(final_resault)


Top 2 most similar documents using HF:
query : یک روز پیش حالت چطور بود
docs : ['خوبم تو چطوری', 'لپتاپ منم ارتقا داده شده', 'کامپیوترم را تعویض کردم', 'این سیبه خیلی خوشمزه بود', 'خیلی استرس دارم از دیروز', 'باید چیزای خراب رو عوض کرد']
--------------------------------------------------
[('خوبم تو چطوری', np.float32(8.604617)), ('خیلی استرس دارم از دیروز', np.float32(8.354697))]
