# Load Dependencies

In [None]:
pip install openai langchain langchain-community pinecone-client python-dotenv langchain-pinecone tiktoken protoc_gen_openapiv2 sentence-transformers

Collecting openai
  Downloading openai-1.37.0-py3-none-any.whl.metadata (22 kB)
Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.0-py3-none-any.whl.metadata (19 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.1.3-py3-none-any.whl.metadata (1.7 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting protoc_gen_openapiv2
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.met

In [None]:
import openai
import langchain
import os
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import logging

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

logging.basicConfig(level=logging.DEBUG,
                    format='[%(levelname)s] - %(message)s ',
                    handlers=[
                        logging.FileHandler('/content/langchaindemo.log', mode='w'),
                        logging.StreamHandler(),
                    ],
                    force=True)
logger = logging.getLogger(__name__)
logger.info("Langchain Demo Initialized")

[INFO] - Langchain Demo Initialized 


# Document Loader

Load data from a source as Document's. A Document is a piece of text and associated metadata.

https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html

In [None]:
def get_docs():
    """
    Loads each file into one document (knowledge base)
    :return: docs
    """

    loader = DirectoryLoader(  # Reads custom data from local files
        path="docs",
        glob="*.txt",
        loader_cls=TextLoader  # Loader class to use for loading files
    )

    docs = loader.load()
    return docs

In [None]:
docs = get_docs()
docs[0]

[DEBUG] - Processing file: docs/alumnos.txt 
[DEBUG] - Processing file: docs/materia.txt 
[DEBUG] - Processing file: docs/temario.txt 


Document(metadata={'source': 'docs/alumnos.txt'}, page_content='Las profesoras son la Ing. Eugenia Piñeiro y la Ing. Marina Fuster\nLos alumnos son estudiantes de la carrera ingeniería informática del ITBA')

## Metadata

It can often be useful to tag ingested documents with structured metadata, such as the title, tone, or length of a document, to allow for more targeted similarity search later. (organizing, filtering, additional info)

In [None]:
for i, doc in enumerate(docs):
  doc.metadata.update({"id": i})

docs

[Document(metadata={'source': 'docs/alumnos.txt', 'id': 0}, page_content='Las profesoras son la Ing. Eugenia Piñeiro y la Ing. Marina Fuster\nLos alumnos son estudiantes de la carrera ingeniería informática del ITBA'),
 Document(metadata={'source': 'docs/materia.txt', 'id': 1}, page_content='Denominación de la materia: Temas Avanzados de Deep Learning\n\n\n\nDocente responsable de la materia:  Eugenia Piñeiro\n\nEquipo docente: Marina Fuster y Eugenia Piñeiro\n\nCarga horaria total: 24 horas\n\nPresentación de la materia:\nEsta materia se enfoca en dos aspectos cruciales de la temática modelos de lenguaje. Por un lado, busca profundizar en el aspecto técnico de los mismos, construyendo sobre los pasos anteriores que llevan hasta su reciente desarrollo. \nPor otro lado, se tiene como objetivo entender el estado del arte y los desafíos teórico-prácticos que se encuentran abiertos en la actualidad. \n\nObjetivos de aprendizaje:\nExponer a los alumnos a las generalidades del funcionamiento

# Text Splitters

Split a long document into smaller chunks that can fit into your model's context window

https://js.langchain.com/v0.1/docs/modules/data_connection/document_transformers/

Los separadores pueden ser pasados como argumento y se puede usar la funcion 'create_documents' si no utilizamos un DocumentLoader

In [None]:
text_splitter_test = RecursiveCharacterTextSplitter(
        separators = ["\n\n", "\n", ".", " "],
        chunk_size = 10,
        chunk_overlap= 0
    )
documents = ["Actividades prácticas previstas:La materia involucra el desarrollo de un Transformer en Python permitiendo el uso de librerías y utilizando técnicas como RAG y finetuning.\n\nLos alumnos propondrán la aplicación, fomentando así la creatividad. Asimismo, se proporcionarán papers relacionados con los conceptos de la materia, los cuales los alumnos deberán debatir y exponer"]
chunks_test = text_splitter_test.create_documents(documents)
chunks_test

[Document(page_content='Actividades'),
 Document(page_content=' prácticas'),
 Document(page_content=' previstas:La'),
 Document(page_content='materia'),
 Document(page_content=' involucra'),
 Document(page_content='el'),
 Document(page_content=' desarrollo'),
 Document(page_content='de un'),
 Document(page_content=' Transformer'),
 Document(page_content='en Python'),
 Document(page_content=' permitiendo'),
 Document(page_content='el uso de'),
 Document(page_content=' librerías'),
 Document(page_content='y'),
 Document(page_content=' utilizando'),
 Document(page_content='técnicas'),
 Document(page_content='como RAG'),
 Document(page_content='y'),
 Document(page_content=' finetuning'),
 Document(page_content='.'),
 Document(page_content='Los'),
 Document(page_content='alumnos'),
 Document(page_content=' propondrán'),
 Document(page_content='la'),
 Document(page_content=' aplicación,'),
 Document(page_content=' fomentando'),
 Document(page_content='así la'),
 Document(page_content=' creat

In [None]:
def get_chunks(docs, chunk_size=1000, chunk_overlap=200):
    """
    Get chunks from docs. Our loaded doc may be too long for most models, and even if it fits it can struggle to find relevant context. So we generate chunks
    :param docs: docs to be split
    :return: chunks
    """

    text_splitter = RecursiveCharacterTextSplitter( # recommended splitter for generic text. split documents recursively by different characters - starting with "\n\n", then "\n", then " "
        chunk_size=chunk_size,        # max size (in terms of number of characters) of the final documents
        chunk_overlap=chunk_overlap,  # how much overlap there should be between chunks
        add_start_index=True
    )
    chunks = text_splitter.split_documents(docs)
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks


In [None]:
chunks = get_chunks(docs)
chunks

[INFO] - Split 3 documents into 6 chunks. 


[Document(metadata={'source': 'docs/alumnos.txt', 'id': 0, 'start_index': 0}, page_content='Las profesoras son la Ing. Eugenia Piñeiro y la Ing. Marina Fuster\nLos alumnos son estudiantes de la carrera ingeniería informática del ITBA'),
 Document(metadata={'source': 'docs/materia.txt', 'id': 1, 'start_index': 0}, page_content='Denominación de la materia: Temas Avanzados de Deep Learning\n\n\n\nDocente responsable de la materia:  Eugenia Piñeiro\n\nEquipo docente: Marina Fuster y Eugenia Piñeiro\n\nCarga horaria total: 24 horas\n\nPresentación de la materia:\nEsta materia se enfoca en dos aspectos cruciales de la temática modelos de lenguaje. Por un lado, busca profundizar en el aspecto técnico de los mismos, construyendo sobre los pasos anteriores que llevan hasta su reciente desarrollo. \nPor otro lado, se tiene como objetivo entender el estado del arte y los desafíos teórico-prácticos que se encuentran abiertos en la actualidad.'),
 Document(metadata={'source': 'docs/materia.txt', 'i

 ### Note: Start_index metadata
 When you need to reassemble the chunks into the original document format, start_index helps in placing each chunk at the correct position.

 Determine where in the original document each chunk belongs.

# Embeddings

In [None]:
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings( #  embedding=OpenAIEmbeddings() rate limit
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'} #TODO CHANGE
)


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
[DEBUG] - Creating converter from 7 to 5 
[DEBUG] - Creating converter from 5 to 7 
[DEBUG] - Creating converter from 7 to 5 
[DEBUG] - Creating converter from 5 to 7 
[DEBUG] - Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client. 
[DEBUG] - etils.epath found. Using etils.epath for file I/O. 
[INFO] - NumExpr defaulting to 2 threads. 
[INFO] - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
[DEBUG] - Starting new HTTP

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730314967024 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock 
[DEBUG] - Lock 135730314967024 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730314967264 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - Lock 135730314967264 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HT

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730314967264 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - Lock 135730314967264 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730314964576 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - Lock 135730314964576 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 10659 


README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730314964576 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - Lock 135730314964576 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730315413296 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - Lock 135730315413296 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - htt

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730315413296 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - Lock 135730315413296 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730314968320 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - Lock 135730314968320 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json HTTP/1.1" 200 612 


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730314968320 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - Lock 135730314968320 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors HTTP/1.1" 302 0 
[DEBUG] - Attempting to acquire lock 135730303334912 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - Lock 135730303334912 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - Starting new HTTPS connection (1): cdn-lfs.huggingface.co:443 
[DEBUG] - https://cdn-lfs.

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730303334912 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - Lock 135730303334912 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730303327568 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - Lock 135730303327568 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730303327568 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - Lock 135730303327568 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/vocab.txt HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730303595664 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - Lock 135730303595664 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/vocab.txt HTTP/1.1" 200 231508 


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730303595664 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - Lock 135730303595664 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730303598496 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - Lock 135730303598496 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer.json HTTP/1.1" 200 466247 


tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730303598496 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - Lock 135730303598496 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/added_tokens.json HTTP/1.1" 404 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/special_tokens_map.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730303598496 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - Lock 135730303598496 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - 

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730303598496 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - Lock 135730303598496 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - https://huggingface.co:443 "GET /api/models/sentence-transformers/all-MiniLM-L6-v2/revision/main HTTP/1.1" 200 6155 
[DEBUG] - Starting new HTTPS connection (1): huggingface.co:443 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/8b3219a92973c328a8e22fadcfa821b5dc75636a/1_Pooling/config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 135730255231344 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock 
[DEBUG] - Lock 135730255231344 acquired on /root/.cache/huggingface/hub/.locks/models--se

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 135730255231344 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock 
[DEBUG] - Lock 135730255231344 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock 
[DEBUG] - https://huggingface.co:443 "GET /api/models/sentence-transformers/all-MiniLM-L6-v2 HTTP/1.1" 200 6155 


In [None]:
vector = embeddings.embed_query("Hola como estas?")
embedding_size = len(vector)  # HF 384 ; OPENAI 1536

Por que es mejor/peor la longitud del embedding vector

# Vector Store

In [None]:
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

def get_vector_store(index_name, embeddings, embedding_size=384):

  pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

  if INDEX_NAME not in pc.list_indexes().names():
      pc.create_index(
          name=INDEX_NAME,
          dimension=embedding_size,
          metric="cosine",
          spec=ServerlessSpec(
              cloud='aws',
              region='us-east-1'
          )
      )

  vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)

  return vectorstore

In [None]:
INDEX_NAME = "langchain-demo-index"
vectorstore = get_vector_store(INDEX_NAME, embeddings, embedding_size)

[INFO] - Discovering subpackages in _NamespacePath(['/usr/local/lib/python3.10/dist-packages/pinecone_plugins']) 
[INFO] - Looking for plugins in pinecone_plugins.inference 
[INFO] - Installing plugin inference into PineconeGRPC 
[DEBUG] - response body: b'{"indexes":[]}' 
[DEBUG] - response body: b'{"name":"langchain-demo-index","metric":"cosine","dimension":384,"status":{"ready":false,"state":"Initializing"},"host":"langchain-demo-index-htqoby9.svc.aped-4627-b74a.pinecone.io","spec":{"serverless":{"region":"us-east-1","cloud":"aws"}},"deletion_protection":"disabled"}' 
[DEBUG] - response body: b'{"name":"langchain-demo-index","metric":"cosine","dimension":384,"status":{"ready":false,"state":"Initializing"},"host":"langchain-demo-index-htqoby9.svc.aped-4627-b74a.pinecone.io","spec":{"serverless":{"region":"us-east-1","cloud":"aws"}},"deletion_protection":"disabled"}' 
[DEBUG] - response body: b'{"name":"langchain-demo-index","metric":"cosine","dimension":384,"status":{"ready":true,"st

In [None]:
vectorstore.add_documents(chunks)

[DEBUG] - response body: b'{"upsertedCount":6}' 


['bde4d7bc-1156-462b-97b4-0e49e8cf8041',
 'c88e72f5-1e92-41b3-b2e3-940b3c0baecb',
 'e34d9142-65a4-438d-bbbd-c48dac30a87b',
 '200e6cf8-b9f5-4303-9778-96d342e3e6ab',
 'f276e34b-ef0d-48c8-b012-f07c3340ca75',
 '79406386-0bac-4d1f-8f87-f8622bc310f6']

### Similarity Search
- similarity: It retrieves the documents that are most similar to the query based on their embeddings, cos similarity

- MMR: Maximal Marginal Relevance balances the relevance of documents with the diversity of the results. It ensures that the returned documents are not only similar to the query but also diverse from each other

- Similarity Score Threshold: only those with a score above the threshold are included in the results.



In [None]:
query = "Quiero comer queso"
vectorstore.search(
    query=query,              # Return docs most similar to query using specified search type.
    search_type="similarity_score_threshold", # can be “similarity”, “mmr”, or “similarity_score_threshold”.
    k=2                       # return top k,
)

[DEBUG] - response body: b'{"results":[],"matches":[{"id":"bde4d7bc-1156-462b-97b4-0e49e8cf8041","score":0.387574375,"values":[],"metadata":{"id":0,"source":"docs/alumnos.txt","start_index":0,"text":"Las profesoras son la Ing. Eugenia Pi\xc3\xb1eiro y la Ing. Marina Fuster\\nLos alumnos son estudiantes de la carrera ingenier\xc3\xada inform\xc3\xa1tica del ITBA"}},{"id":"c88e72f5-1e92-41b3-b2e3-940b3c0baecb","score":0.322425842,"values":[],"metadata":{"id":1,"source":"docs/materia.txt","start_index":0,"text":"Denominaci\xc3\xb3n de la materia: Temas Avanzados de Deep Learning\\n\\n\\n\\nDocente responsable de la materia:  Eugenia Pi\xc3\xb1eiro\\n\\nEquipo docente: Marina Fuster y Eugenia Pi\xc3\xb1eiro\\n\\nCarga horaria total: 24 horas\\n\\nPresentaci\xc3\xb3n de la materia:\\nEsta materia se enfoca en dos aspectos cruciales de la tem\xc3\xa1tica modelos de lenguaje. Por un lado, busca profundizar en el aspecto t\xc3\xa9cnico de los mismos, construyendo sobre los pasos anteriores que

[Document(metadata={'id': 0.0, 'source': 'docs/alumnos.txt', 'start_index': 0.0}, page_content='Las profesoras son la Ing. Eugenia Piñeiro y la Ing. Marina Fuster\nLos alumnos son estudiantes de la carrera ingeniería informática del ITBA'),
 Document(metadata={'id': 1.0, 'source': 'docs/materia.txt', 'start_index': 0.0}, page_content='Denominación de la materia: Temas Avanzados de Deep Learning\n\n\n\nDocente responsable de la materia:  Eugenia Piñeiro\n\nEquipo docente: Marina Fuster y Eugenia Piñeiro\n\nCarga horaria total: 24 horas\n\nPresentación de la materia:\nEsta materia se enfoca en dos aspectos cruciales de la temática modelos de lenguaje. Por un lado, busca profundizar en el aspecto técnico de los mismos, construyendo sobre los pasos anteriores que llevan hasta su reciente desarrollo. \nPor otro lado, se tiene como objetivo entender el estado del arte y los desafíos teórico-prácticos que se encuentran abiertos en la actualidad.')]