In [6]:
import os
from getpass import getpass

# Never store a real API key in the notebook source: reuse the key already
# exported in the environment and only prompt (without echoing the input)
# when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

## What are Embeddings?

Embeddings are numerical representations of data, like words or sentences, mapped to a multi-dimensional space.

*Mapped to a multi-dimensional space* means:

Each item (e.g., word, sentence, image) is represented as a vector with multiple numerical values (or "dimensions").

These vectors act like coordinates in a virtual space where similar items are positioned closer together and dissimilar items are farther apart.

They capture relationships between data points by placing similar items closer together in this space.

## Why Do We Need Embeddings?

**Semantic Understanding**

Traditional data representations can’t inherently capture “meaning” (e.g., that “happy” and “joyful” are similar), but embeddings make this possible by placing similar concepts close together.

**Dimensionality Reduction**

Embeddings condense large, complex data (like sentences or paragraphs) into smaller, dense vectors, which makes them easier and faster to process.

**Improving Model Performance**

In tasks like recommendations or text classification, embeddings enhance models by providing a rich, contextualized understanding of data, improving accuracy.

## When Can We Use Embeddings?

**Search and Information Retrieval**

For search engines, embeddings help find not just exact matches but conceptually similar results, e.g., retrieving articles on “sustainability” when searching for “eco-friendly practices.”

**Recommendations**

In recommendation systems (e.g., Netflix, Spotify), embeddings allow suggesting similar movies or songs by understanding content relationships.

**Text Clustering and Classification**

Group similar documents together or classify emails into categories (spam vs. non-spam) by capturing semantic nuances in the text.

**Question Answering and Chatbots**

Embeddings enable understanding user intent by mapping questions and answers into the same vector space, helping chatbots retrieve relevant responses.


In [7]:
from langchain_openai import OpenAIEmbeddings

# Create the OpenAI embeddings model with no explicit arguments; the cells
# below call embed_documents / embed_query on this object.
# NOTE(review): presumably authenticates through the OPENAI_API_KEY
# environment variable set in the first cell — confirm.
embeddings_model = OpenAIEmbeddings()

In [8]:
# Embed three short greeting sentences with a single embed_documents call.
# The result is bound to `embeddings` and inspected in the following cells.
embeddings = embeddings_model.embed_documents([
    "Hello, My Name is Praveen Reddy C",
    "Hi, This is Chinnareddy",
    "Whatsup, This is Reddygaru",
])

In [9]:
# Number of embedding vectors returned; the recorded output (3) matches the
# three input sentences.
len(embeddings)

3

In [13]:
# Display the embedding vector produced for the first sentence.
# NOTE(review): this echoes the entire vector; slicing it (for example
# embeddings[0][:5]) would keep the cell output short.
embeddings[0]

[0.0008355244644917548,
 -0.00843298714607954,
 -0.005677720997482538,
 0.0029291422106325626,
 -0.014926588162779808,
 0.01142231747508049,
 -0.01544821634888649,
 -0.010218560695648193,
 -0.009904245845973492,
 -0.010700062848627567,
 0.024061767384409904,
 0.00828586146235466,
 0.01603671908378601,
 -0.015153964050114155,
 -0.0004027152608614415,
 -0.0030244397930800915,
 0.008533299900591373,
 -0.0031749093905091286,
 0.022470133379101753,
 -0.003925585653632879,
 -0.023219136521220207,
 -0.007476669270545244,
 0.006667476613074541,
 -0.02042374573647976,
 0.007536856923252344,
 -0.019086237996816635,
 -0.009155241772532463,
 -0.012231509201228619,
 0.029344923794269562,
 -0.018029605969786644,
 -0.007048666477203369,
 -0.009850746020674706,
 0.002872298238798976,
 -0.024703770875930786,
 -0.00014911123435012996,
 -0.02267075888812542,
 -0.0006018785643391311,
 -0.001869167317636311,
 0.021440250799059868,
 0.0033671760465949774,
 0.028863420709967613,
 0.0001939386420417577,
 5.74

In [15]:
# Embed a single query string with embed_query (embed_documents, used earlier,
# takes a list of texts instead).
embedded_query = embeddings_model.embed_query("Hello, You are from which village ?")
# Bare last expression so the notebook displays the resulting vector.
embedded_query

[0.00769110256806016,
 0.0047962479293346405,
 -0.014145312830805779,
 -0.022408807650208473,
 -0.0377383790910244,
 0.0496072843670845,
 -0.01619802787899971,
 -0.004730455577373505,
 -0.023250946775078773,
 -0.026672137901186943,
 0.0012278460199013352,
 0.010835967026650906,
 -0.01163205225020647,
 -0.021803518757224083,
 0.011243878863751888,
 -0.01167810708284378,
 0.022921985015273094,
 -0.02855379320681095,
 0.01590854302048683,
 -0.0052600824274122715,
 -0.008645088411867619,
 0.014776917174458504,
 0.03065914288163185,
 0.0011275130091235042,
 0.008572717197239399,
 0.0052896891720592976,
 0.014947976917028427,
 -0.018224425613880157,
 0.015974335372447968,
 -0.01998765580356121,
 -0.007388458587229252,
 0.002125086495652795,
 -0.014263738878071308,
 -0.015408521518111229,
 -0.019329734146595,
 -0.02069821022450924,
 0.004720586817711592,
 -0.01605328544974327,
 0.028395893052220345,
 0.005710758734494448,
 0.03650148585438728,
 -0.008816148154437542,
 -0.003569224150851369,
 

In [16]:
# Install/upgrade the packages used by the caching and FAISS cells below.
# NOTE(review): versions are unpinned, and langchain_openai is already
# imported by an earlier cell — consider moving this install to the top of
# the notebook and pinning versions.
%pip install --upgrade --quiet  langchain-openai faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [19]:
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings

In [20]:

# Base model that the cache wrapper delegates to.
underlying_embeddings = OpenAIEmbeddings()

# Byte store rooted at the relative directory ./cache/.
store = LocalFileStore("./cache/")

# Wrap the base model with the byte store. The namespace is set to the
# underlying model's name (presumably so vectors cached for different models
# do not collide — confirm against the CacheBackedEmbeddings docs).
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings,
    store,
    namespace=underlying_embeddings.model,
)

In [21]:
# Load the text file; the path is relative, so state_of_the_union.txt is
# looked up from the notebook's working directory.
raw_documents = TextLoader("state_of_the_union.txt").load()
# Splitter configured with chunk_size=1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# Split the loaded documents into the chunks that get embedded next.
documents = text_splitter.split_documents(raw_documents)

In [22]:
# Build a FAISS vector store from the chunks, embedding them through
# cached_embedder (presumably so repeated runs can reuse cached vectors —
# confirm).
db = FAISS.from_documents(documents, cached_embedder)

In [23]:
from langchain.storage import InMemoryByteStore

# Same cache wrapper as the file-backed variant, but using an in-memory byte
# store. NOTE: this rebinds both `store` and `cached_embedder`, replacing the
# objects created by the earlier LocalFileStore cell.
store = InMemoryByteStore()
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings,
    store,
    namespace=underlying_embeddings.model,
)

In [24]:
pip install langchain-huggingface

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.0-py3-none-any.whl (20 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.0
Note: you may need to restart the kernel to use updated packages.


### Hugging Face

Hugging Face is a popular platform and company that provides tools and resources for natural language processing (NLP) and machine learning (ML).

In [25]:
from langchain_huggingface import HuggingFaceEmbeddings

# Rebinds embeddings_model (previously the OpenAI model) to a Hugging Face
# embeddings model selected by its model name.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
from langchain_ollama import OllamaEmbeddings

# Embeddings model backed by Ollama, configured with the "llama3" model.
# NOTE(review): `embeddings` held the list of OpenAI vectors earlier in the
# notebook; this assignment rebinds the name to a model object instead.
embeddings = OllamaEmbeddings(model="llama3")

In [2]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Two example sentences to encode.
sentences = ["This is an example sentence", "Each sentence is converted"]

# Load the sentence-transformers model by name and encode both sentences.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

# Show only the shape of the result instead of printing every value
# (encode returns a NumPy array by default).
embeddings.shape