# Load Dependencies

In [57]:
pip install openai langchain langchain-community pinecone-client python-dotenv langchain-pinecone tiktoken protoc_gen_openapiv2 sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [63]:
import openai
import langchain
import os
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import logging

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

logging.basicConfig(level=logging.DEBUG,
                    format='[%(levelname)s] - %(message)s ',
                    handlers=[
                        logging.FileHandler('/content/langchaindemo.log', mode='w'),
                        logging.StreamHandler(),
                    ],
                    force=True)
logger = logging.getLogger(__name__)
logger.info("Langchain Demo Initialized")

[INFO] - Langchain Demo Initialized 


# Document Loader

Load data from a source as Document's. A Document is a piece of text and associated metadata.

https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html

In [3]:
def get_docs():
    """
    Loads each file into one document (knowledge base)
    :return: docs
    """

    loader = DirectoryLoader(  # Reads custom data from local files
        path="docs",
        glob="*.txt",
        loader_cls=TextLoader  # Loader class to use for loading files
    )

    docs = loader.load()
    return docs

In [5]:
docs = get_docs()
docs[0]

Document(metadata={'source': 'docs/temario.txt'}, page_content='Contenidos:\nUnidad 1: Introducción a Transformers\nIntroducción al concepto de Gen AI, LLMs y Transformers. Historia. Arquitectura. Mecanismo de Atención. Embeddings y Positional Encoding. Aplicaciones en la industria.\n\nUnidad 2: Algoritmos de Embedding y Positional Encoding\nAlgoritmos de Embedding y Positional Encoding. Transformer basando en N-grama\n\nUnidad 3: Fine Tuning\nReinforcement Learning. RLHF y sus security issues. Fine tuning. Pipeline productivo. \n\nUnidad 4: Responsible AI\nConsideraciones éticas en AI: biases en training data, fairness, impacto social, detección de contenido generado de forma artificial. Narrow AI vs. AGI. AGI como agente. Foundation models. Emergent capabilities. Security vulnerabilities. Interpretability. Alignment.\n\nUnidad 5: Retrieval Augmented Generation (RAG)\nIntroducción Retrieval Augmented Generation. Bases de datos vectoriales: Chroma DB y Pinecone, uso de embeddings y efi

## Metadata

It can often be useful to tag ingested documents with structured metadata, such as the title, tone, or length of a document, to allow for more targeted similarity search later. (organizing, filtering, additional info)

In [46]:
for i, doc in enumerate(docs):
  doc.metadata.update({"id": i})

docs

[Document(metadata={'source': 'docs/temario.txt', 'id': 0}, page_content='Contenidos:\nUnidad 1: Introducción a Transformers\nIntroducción al concepto de Gen AI, LLMs y Transformers. Historia. Arquitectura. Mecanismo de Atención. Embeddings y Positional Encoding. Aplicaciones en la industria.\n\nUnidad 2: Algoritmos de Embedding y Positional Encoding\nAlgoritmos de Embedding y Positional Encoding. Transformer basando en N-grama\n\nUnidad 3: Fine Tuning\nReinforcement Learning. RLHF y sus security issues. Fine tuning. Pipeline productivo. \n\nUnidad 4: Responsible AI\nConsideraciones éticas en AI: biases en training data, fairness, impacto social, detección de contenido generado de forma artificial. Narrow AI vs. AGI. AGI como agente. Foundation models. Emergent capabilities. Security vulnerabilities. Interpretability. Alignment.\n\nUnidad 5: Retrieval Augmented Generation (RAG)\nIntroducción Retrieval Augmented Generation. Bases de datos vectoriales: Chroma DB y Pinecone, uso de embedd

# Text Splitters

Split a long document into smaller chunks that can fit into your model's context window

https://js.langchain.com/v0.1/docs/modules/data_connection/document_transformers/

Los separadores pueden ser pasados como argumento y se puede usar la funcion 'create_documents' si no utilizamos un DocumentLoader

In [47]:
text_splitter_test = RecursiveCharacterTextSplitter(
        separators = ["\n\n", "\n", ".", " "],
        chunk_size = 50,
        chunk_overlap= 2
    )
documents = ["Actividades prácticas previstas:La materia involucra el desarrollo de un Transformer en Python permitiendo el uso de librerías y utilizando técnicas como RAG y finetuning.\n\nLos alumnos propondrán la aplicación, fomentando así la creatividad. Asimismo, se proporcionarán papers relacionados con los conceptos de la materia, los cuales los alumnos deberán debatir y exponer"]
chunks_test = text_splitter_test.create_documents(documents)
chunks_test

[Document(page_content='Actividades prácticas previstas:La materia'),
 Document(page_content='involucra el desarrollo de un Transformer en'),
 Document(page_content='Python permitiendo el uso de librerías y'),
 Document(page_content='y utilizando técnicas como RAG y finetuning'),
 Document(page_content='.'),
 Document(page_content='Los alumnos propondrán la aplicación, fomentando'),
 Document(page_content='así la creatividad'),
 Document(page_content='. Asimismo, se proporcionarán papers relacionados'),
 Document(page_content='con los conceptos de la materia, los cuales los'),
 Document(page_content='alumnos deberán debatir y exponer')]

In [48]:
def get_chunks(docs, chunk_size=1000, chunk_overlap=200):
    """
    Get chunks from docs. Our loaded doc may be too long for most models, and even if it fits it can struggle to find relevant context. So we generate chunks
    :param docs: docs to be split
    :return: chunks
    """

    text_splitter = RecursiveCharacterTextSplitter( # recommended splitter for generic text. split documents recursively by different characters - starting with "\n\n", then "\n", then " "
        chunk_size=chunk_size,        # max size (in terms of number of characters) of the final documents
        chunk_overlap=chunk_overlap,  # how much overlap there should be between chunks
        add_start_index=True
    )
    chunks = text_splitter.split_documents(docs)
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks


In [49]:
chunks = get_chunks(docs)
chunks

[INFO] - Split 3 documents into 6 chunks. 


[Document(metadata={'source': 'docs/temario.txt', 'id': 0, 'start_index': 0}, page_content='Contenidos:\nUnidad 1: Introducción a Transformers\nIntroducción al concepto de Gen AI, LLMs y Transformers. Historia. Arquitectura. Mecanismo de Atención. Embeddings y Positional Encoding. Aplicaciones en la industria.\n\nUnidad 2: Algoritmos de Embedding y Positional Encoding\nAlgoritmos de Embedding y Positional Encoding. Transformer basando en N-grama\n\nUnidad 3: Fine Tuning\nReinforcement Learning. RLHF y sus security issues. Fine tuning. Pipeline productivo. \n\nUnidad 4: Responsible AI\nConsideraciones éticas en AI: biases en training data, fairness, impacto social, detección de contenido generado de forma artificial. Narrow AI vs. AGI. AGI como agente. Foundation models. Emergent capabilities. Security vulnerabilities. Interpretability. Alignment.\n\nUnidad 5: Retrieval Augmented Generation (RAG)\nIntroducción Retrieval Augmented Generation. Bases de datos vectoriales: Chroma DB y Pinec

 ### Note: Start_index metadata
 When you need to reassemble the chunks into the original document format, start_index helps in placing each chunk at the correct position.

 Determine where in the original document each chunk belongs.

# Embeddings

In [58]:
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings( #  embedding=OpenAIEmbeddings() rate limit
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'}
)


[DEBUG] - close.started 
[DEBUG] - close.complete 
  from tqdm.autonotebook import tqdm, trange
[DEBUG] - Creating converter from 7 to 5 
[DEBUG] - Creating converter from 5 to 7 
[DEBUG] - Creating converter from 7 to 5 
[DEBUG] - Creating converter from 5 to 7 
[DEBUG] - Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client. 
[DEBUG] - etils.epath found. Using etils.epath for file I/O. 
[INFO] - NumExpr defaulting to 2 threads. 
[INFO] - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datas

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271201562192 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock 
[DEBUG] - Lock 137271201562192 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271201561424 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - Lock 137271201561424 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HT

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271201561424 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - Lock 137271201561424 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271201561088 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - Lock 137271201561088 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 10659 


README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271201561088 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - Lock 137271201561088 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271201561232 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - Lock 137271201561232 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - htt

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271201561232 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - Lock 137271201561232 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271201561520 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - Lock 137271201561520 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json HTTP/1.1" 200 612 


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271201561520 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - Lock 137271201561520 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors HTTP/1.1" 302 0 
[DEBUG] - Attempting to acquire lock 137271189863728 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - Lock 137271189863728 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - Starting new HTTPS connection (1): cdn-lfs.huggingface.co:443 
[DEBUG] - https://cdn-lfs.

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271189863728 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - Lock 137271189863728 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271190107808 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - Lock 137271190107808 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271190107808 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - Lock 137271190107808 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/vocab.txt HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271190607456 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - Lock 137271190607456 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/vocab.txt HTTP/1.1" 200 231508 


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271190607456 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - Lock 137271190607456 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271190604768 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - Lock 137271190604768 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - https://huggingface.co:443 "GET /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer.json HTTP/1.1" 200 466247 


tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271190604768 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - Lock 137271190604768 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/added_tokens.json HTTP/1.1" 404 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/special_tokens_map.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271190106464 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - Lock 137271190106464 acquired on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - 

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271190106464 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - Lock 137271190106464 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock 
[DEBUG] - https://huggingface.co:443 "GET /api/models/sentence-transformers/all-MiniLM-L6-v2/revision/main HTTP/1.1" 200 6159 
[DEBUG] - Starting new HTTPS connection (1): huggingface.co:443 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/8b3219a92973c328a8e22fadcfa821b5dc75636a/1_Pooling/config.json HTTP/1.1" 200 0 
[DEBUG] - Attempting to acquire lock 137271190604336 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock 
[DEBUG] - Lock 137271190604336 acquired on /root/.cache/huggingface/hub/.locks/models--se

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[DEBUG] - Attempting to release lock 137271190604336 on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock 
[DEBUG] - Lock 137271190604336 released on /root/.cache/huggingface/hub/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock 
[DEBUG] - https://huggingface.co:443 "GET /api/models/sentence-transformers/all-MiniLM-L6-v2 HTTP/1.1" 200 6159 


384

In [73]:
vector = embeddings.embed_query("Hola como estas?")
embedding_size = len(vector)  # HF 384 ; OPENAI 1536

# Vector Store

In [71]:
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

def get_vector_store(index_name, embeddings, embedding_size=384):

  pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

  if INDEX_NAME not in pc.list_indexes().names():
      pc.create_index(
          name=INDEX_NAME,
          dimension=embedding_size,
          metric="cosine",
          spec=ServerlessSpec(
              cloud='aws',
              region='us-east-1'
          )
      )

  vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)

  return vectorstore

In [74]:
INDEX_NAME = "langchain-demo-index"
vectorstore = get_vector_store(INDEX_NAME, embeddings, embedding_size)

In [75]:
vectorstore.add_documents(chunks)

['9ed21924-179f-4014-94bc-f9605f06f91a',
 '2cd2609a-c5f9-4e64-85b1-49c43ee019a4',
 'b8963250-c36e-428a-ae89-6d3beb65bc96',
 'b7f4ba26-0cce-43af-97b7-6b121090289f',
 'e144a94e-6f47-4a4c-be60-3314ab92d044',
 'bf0fdd08-49c1-4d2f-958c-aa578044e962']

### Similarity Search
- similarity: It retrieves the documents that are most similar to the query based on their embeddings, cos similarity

- MMR: balances the relevance of documents with the diversity of the results. It ensures that the returned documents are not only similar to the query but also diverse from each other

- Similarity Score Threshold: only those with a score above the threshold are included in the results.



In [80]:
query = "Quien es Eugenia Piñeiro?"
vectorstore.search(
    query=query,              # Return docs most similar to query using specified search type.
    search_type="similarity", # can be “similarity”, “mmr”, or “similarity_score_threshold”.
    k=2                       # return top k
)

[Document(metadata={'id': 1.0, 'source': 'docs/alumnos.txt', 'start_index': 0.0}, page_content='Las profesoras son la Ing. Eugenia Piñeiro y la Ing. Marina Fuster\nLos alumnos son estudiantes de la carrera ingeniería informática del ITBA: \tABANCENS, ALBERTO; BALIARDA, GONZALO NAHUEL; BIRSA, NICOLAS EZEQUIEL; BRAVE, JERONIMO; CASTAGNINO, SALVADOR'),
 Document(metadata={'id': 2.0, 'source': 'docs/materia.txt', 'start_index': 0.0}, page_content='Denominación de la materia: Temas Avanzados de Deep Learning\n\n\n\nDocente responsable de la materia:  Eugenia Piñeiro\n\nEquipo docente: Marina Fuster y Eugenia Piñeiro\n\nCarga horaria total: 24 horas\n\nPresentación de la materia:\nEsta materia se enfoca en dos aspectos cruciales de la temática modelos de lenguaje. Por un lado, busca profundizar en el aspecto técnico de los mismos, construyendo sobre los pasos anteriores que llevan hasta su reciente desarrollo. \nPor otro lado, se tiene como objetivo entender el estado del arte y los desafíos