# Embedding Techniques

Converting text into vectors

In [7]:
import os
from dotenv import load_dotenv

load_dotenv() # Load all environment variables

True

In [10]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [11]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x10f243c40>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x10fb2de10>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [12]:
text = "Un elefante se columpiaba sobre la tela de una araña"
query_result = embeddings.embed_query(text)
query_result

[-0.0066023035906255245,
 -0.02004498802125454,
 0.001275312970392406,
 -0.014005941338837147,
 0.03688463568687439,
 0.020126283168792725,
 -0.028104791417717934,
 0.00673585943877697,
 -0.01516729686409235,
 0.018837178125977516,
 -0.010870283469557762,
 0.020219191908836365,
 -0.019557218998670578,
 0.01697901077568531,
 -0.052586156874895096,
 0.029242919757962227,
 -0.010754147544503212,
 0.011781946755945683,
 0.03477096930146217,
 -0.04055451974272728,
 0.012472952716052532,
 -0.011323211714625359,
 -0.0002965084568131715,
 0.0006899174768477678,
 -0.015376340597867966,
 -0.012809745967388153,
 0.01521375123411417,
 0.0265253484249115,
 -0.00862886756658554,
 0.024527817964553833,
 -0.05077444389462471,
 -0.013901419937610626,
 0.03751176595687866,
 0.014807276427745819,
 -0.06582560390233994,
 -0.01887201890349388,
 -0.0152834327891469,
 -0.017153214663267136,
 -0.01981271617114544,
 -0.004436376038938761,
 -0.01632865145802498,
 0.017339030280709267,
 0.01548086293041706,
 0.0

In [14]:
len(query_result)

3072

In [15]:
embedding_1024 = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1024)
embedding_1024

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x10fb67be0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1181a62c0>, model='text-embedding-3-large', dimensions=1024, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [16]:
text = "Un elefante se columpiaba sobre la tela de una araña"
query_result = embedding_1024.embed_query(text)
len(query_result)

1024

In [17]:
## Text Loader
from langchain_community.document_loaders import TextLoader

loader = TextLoader('../resources/speech.txt')
text_documents=loader.load()
text_documents

[Document(metadata={'source': '../resources/speech.txt'}, page_content='In deep learning, transformer is a neural network architecture based on the multi-head attention mechanism, in which text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table.[1] At each layer, each token is then contextualized within the scope of the context window with other (unmasked) tokens via a parallel multi-head attention mechanism, allowing the signal for key tokens to be amplified and less important tokens to be diminished.\n\nTransformers have the advantage of having no recurrent units, therefore requiring less training time than earlier recurrent neural architectures (RNNs) such as long short-term memory (LSTM).[2] Later variations have been widely adopted for training large language models (LLMs) on large (language) datasets.[3]\n\nThe modern version of the transformer was proposed in the 2017 paper "Attention Is All Y

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
final_documents = text_splitter.split_documents(text_documents)
final_documents

[Document(metadata={'source': '../resources/speech.txt'}, page_content='In deep learning, transformer is a neural network architecture based on the multi-head attention mechanism, in which text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table.[1] At each layer, each token is then contextualized within the scope of the context window with other (unmasked) tokens via a parallel multi-head attention mechanism, allowing the signal for key tokens to be amplified and less important'),
 Document(metadata={'source': '../resources/speech.txt'}, page_content='for key tokens to be amplified and less important tokens to be diminished.'),
 Document(metadata={'source': '../resources/speech.txt'}, page_content='Transformers have the advantage of having no recurrent units, therefore requiring less training time than earlier recurrent neural architectures (RNNs) such as long short-term memory (LSTM).[2] Later varia

In [21]:
## Vector Embedding and Vector Store DB
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(final_documents, embedding=embedding_1024)
db

<langchain_community.vectorstores.chroma.Chroma at 0x11a092cb0>

In [23]:
## Retrieve the results from Query Vector Store DB
query = "Que es un Transformer?"
retrieved_results = db.similarity_search(query)
retrieved_results

[Document(metadata={'source': '../resources/speech.txt'}, page_content='In deep learning, transformer is a neural network architecture based on the multi-head attention mechanism, in which text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table.[1] At each layer, each token is then contextualized within the scope of the context window with other (unmasked) tokens via a parallel multi-head attention mechanism, allowing the signal for key tokens to be amplified and less important'),
 Document(metadata={'source': '../resources/speech.txt'}, page_content='In deep learning, transformer is a neural network architecture based on the multi-head attention mechanism, in which text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table.[1] At each layer, each token is then contextualized within the scope of the context window wit