In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [2]:
### OpenAI Embeddings
from langchain_openai import OpenAIEmbeddings

apikey = os.environ.get("OPENAI_API_KEY")

text = "Hello, i'm learning Embeddings with different providers"

embeddings = OpenAIEmbeddings(
    api_key=apikey,
    model="text-embedding-ada-002"
)

embeddings = embeddings.embed_query(text)

print(f"Text: {text}")
print(f"Embedding length: {len(embeddings)}")
print(embeddings)

  from .autonotebook import tqdm as notebook_tqdm


Text: Hello, i'm learning Embeddings with different providers
Embedding length: 1536
[-0.03330886363983154, 0.000804183422587812, 0.0079774996265769, -0.02858438901603222, -0.005258947145193815, 0.03177803009748459, 0.002377083757892251, -0.007495814003050327, -0.0174330472946167, -0.0328865647315979, 0.0040415371768176556, 0.013289233669638634, -0.00478386040776968, -0.00244141835719347, 0.004500127863138914, 0.008426192216575146, 0.008927673101425171, 0.019016670063138008, 0.00987124815583229, -0.020059220492839813, -0.017710180953145027, -0.007165892515331507, -0.0035862454678863287, -0.013427800498902798, 0.003069918602705002, -0.006245411932468414, 0.015427124686539173, -0.018884699791669846, -0.012141107581555843, -0.01871314086019993, 0.021695630624890327, 0.0044044507667422295, 0.004836647771298885, -0.022500639781355858, 0.0026707137003540993, -0.016020983457565308, 0.010227562859654427, -0.021642843261361122, 0.02779257856309414, 0.004001946654170752, 0.010682854801416397, 0.

In [3]:
sentences = [
    "The sun sets in the west.",
    "Birds chirp in the morning.",
    "The moon rises in the east.",
    "Flowers bloom in spring.",
    "Autumn leaves change color.",
    "Winter snow blankets the ground."
]

embeddings = OpenAIEmbeddings(
    api_key=apikey,
    model="text-embedding-ada-002"
)

embed = embeddings.embed_documents(sentences)
print(f"Text: {sentences}")
print(f"Embedding length: {len(embed)}")
print(embed)

Text: ['The sun sets in the west.', 'Birds chirp in the morning.', 'The moon rises in the east.', 'Flowers bloom in spring.', 'Autumn leaves change color.', 'Winter snow blankets the ground.']
Embedding length: 6
[[0.030930763110518456, -0.001680323970504105, 0.00403277762234211, -0.014825258404016495, -0.011515820398926735, 0.039610836654901505, -0.006500453222543001, -0.006260407157242298, 0.006353225093334913, -0.019344529137015343, 0.006385230924934149, 0.0257713682949543, 0.012501610442996025, -0.0021652174182236195, -0.00020373928418848664, -0.012219956144690514, 0.02469596080482006, -0.035949330776929855, 0.037024740129709244, -0.016335949301719666, -0.010581240057945251, 0.011784671805799007, 0.003269430249929428, 0.015375764109194279, -0.0064140367321670055, -0.007406227756291628, 0.0032838331535458565, -0.02182820811867714, 0.03303036838769913, 0.0027893378864973783, -0.00011962306598434225, -0.01193190086632967, -0.02031751722097397, -0.014991690404713154, -0.018986061215400

### Cosine similarityity with OpenAI Embeddings"""

In [4]:
# Finding similary 
sentences = [
    "The sun sets in the west.",
    "Birds chirp in the morning.",
    "The moon rises in the east.",
    "Flowers bloom in spring.",
    "Autumn leaves change color.",
    "Winter snow blankets the ground."
]

In [9]:
import numpy as np

def cosine_similarity(vec1, vec2):
    
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    return dot_product/(norm_vec1 * norm_vec2)

In [None]:
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

sentence_embeddings = embeddings.embed_documents(sentences)
sentence_embeddings

In [None]:
# Calculate the similarity between all pairs of sentences

for i in range(len(sentences)):
    for j in range(i+1, len(sentences)):
        similarity=cosine_similarity(sentence_embeddings[i], sentence_embeddings[j])
        
        print(f"'{sentences[i]}' vs '{sentences[j]}'")
        print(f"Similarity: {similarity:.3f}\n")

In [12]:
### Example - Semantic Search - Retriev the similar sentence
# Test semantic search
documents = [
    "LangChain is a powerful tool for building language models.",
    "Python is widely used in the development of LangChain applications.",
    "LangChain provides a simple and intuitive API for working with language models.",
    "Python's rich ecosystem of libraries makes it an ideal choice for developing complex LangChain applications.",
    "LangChain can be easily integrated into existing Python projects to enhance their natural language processing capabilities.",
    "Python's strong community support ensures that there are plenty of resources available for learning and using LangChain."
]

query = "What is Langchain"

In [17]:
def semantic_search(query, documents, embedding_model, top_k=3):
    """Simple semantic search implementation"""
    
    ## Embed query and document
    query_embedding = embedding_model.embed_query(query) # Frage (Query)
    doc_embedding = embedding_model.embed_documents(documents) # Liste mit möglichen Antworten
    
    ## Calculate the similarity score
    similarities = []
    
    for i, doc_emb in enumerate(doc_embedding):
        similarity = cosine_similarity(query_embedding, doc_emb)
        similarities.append((similarity, documents[i]))

    ## Sort by similarity
    similarities.sort(reverse=True)
    
    return similarities[:top_k]    

In [18]:
result = semantic_search(query, documents, embeddings)
result

[(np.float64(0.6180332388593057),
  'LangChain is a powerful tool for building language models.'),
 (np.float64(0.6043054295255502),
  'LangChain provides a simple and intuitive API for working with language models.'),
 (np.float64(0.5771165382287559),
  'Python is widely used in the development of LangChain applications.')]