In [1]:
import os
import time

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


#### Cargo el PDF y lo separo en chunks

In [5]:
# Splitter configured with chunk_size=512 and chunk_overlap=64.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
loader = PyPDFLoader("./docs/fabian-cv.pdf")

# Load first, then split explicitly: `load_and_split` is flagged as deprecated
# in LangChain's BaseLoader, and this two-step form is what it does internally.
pages = loader.load()
chunks = splitter.split_documents(pages)

# Quick sanity check: how many chunks were produced, and what the first one looks like.
print(f"Number of chunks: {len(chunks)}")
print(chunks[0])

Number of chunks: 6
page_content='FABIAN MASSOTTO
SOFTWARE ENGINEER
ExxonMobil
Software Engineer
Jul 2020 - Present
Specialized in design and development of cloud-based web
applications, my daily responsibilities involve every aspect of
the web development lifecycle, including frontend and backend
coding, CI/CD pipelines, infrastructure as code, and testing.
Over the past years, I have worked with various cross-
functional teams, providing guidance on web development
best practices, diagnosing application performance issues, and' metadata={'source': './docs/fabian-cv.pdf', 'page': 0}


#### Embeddings

In [3]:
# Hugging Face model identifier used to build the embedding function.
embedding_model_id = "sentence-transformers/all-mpnet-base-v2"
embed_model = HuggingFaceEmbeddings(model_name=embedding_model_id)

#### Base de datos vectorial

In [6]:
# The API key is read from the environment rather than being written in the notebook.
PINECONE_API_KEY = os.getenv("API_KEY_PINECONE")

# Connect to Pinecone; cloud and region fall back to 'aws' / 'us-east-1' when unset.
pc = Pinecone(api_key=PINECONE_API_KEY)
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name = 'ceia'

# Start from a clean slate: drop the index if a previous run left it behind.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("index {} borrado".format(index_name))

# Re-check after the delete (it shouldn't exist at this point) and create it.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=768,  # dimensionality of sentence-transformers/all-mpnet-base-v2
        metric='cosine',
        spec=spec
    )
    # Wait until the index reports ready so the inserts that follow do not
    # race against provisioning.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
    # Announce creation only after create_index has actually succeeded.
    print("index creado con el nombre: {}".format(index_name))
else:
    print("el index con el nombre {} ya estaba creado".format(index_name))

index ceia borrado
index creado con el nombre: ceia


In [7]:
# Vector store bound to the index prepared above. Reusing `index_name` keeps
# this cell in sync with the index that was just (re)created instead of
# repeating the literal name.
pinecone = PineconeVectorStore(
    embedding=embed_model,
    index_name=index_name,
    pinecone_api_key=PINECONE_API_KEY,
)

In [8]:
# Report how many chunks are about to be stored.
n_chunks = len(chunks)
print(f"Documentos a insertar: {n_chunks}.")

# Add the chunks to the vector store; the call returns the ids of the inserted documents.
inserted_ids = pinecone.add_documents(chunks)
n_inserted = len(inserted_ids)
print(f"Insertados {n_inserted} documentos con éxito.")

Documentos a insertar: 6.
Insertados 6 documentos con éxito.


#### Búsqueda

In [9]:
def search_vstore(query, vstore, k=3):
    """Run a similarity search and print each hit with its metadata.

    Args:
        query: Text to search for.
        vstore: Object exposing ``similarity_search(query, k=...)`` whose
            results have ``page_content`` and ``metadata`` attributes.
        k: Number of results to request from the store. Defaults to 3,
            matching the previously hard-coded value.

    Returns:
        None. Results are written to standard output only.
    """
    results = vstore.similarity_search(query, k=k)
    for res in results:
        print(f"* {res.page_content} \n[{res.metadata}] \n")

In [10]:
# Query the vector store and print the closest chunks.
search_vstore(query="who is Fabian?", vstore=pinecone)

* key to success. A problem solver at heart, I approach
challenging scenarios with a creative mindset and enjoy
finding solutions.
PROFILE
Country of origin: Argentina
Email: massotto.fabian@gmail.com
Phone: +54 9 11 3146 2971
LinkedIn: linkedin.com/in/massottofabian
GitHub: github.com/fabimass
Portfolio: fabianmassotto.vercel.app
CONTACT INFO
Python
Django
Azure
Terraform
GitHub
Docker
Backstage
Storybook 
[{'page': 0.0, 'source': './docs/fabian-cv.pdf'}] 

* FABIAN MASSOTTO
SOFTWARE ENGINEER
ExxonMobil
Software Engineer
Jul 2020 - Present
Specialized in design and development of cloud-based web
applications, my daily responsibilities involve every aspect of
the web development lifecycle, including frontend and backend
coding, CI/CD pipelines, infrastructure as code, and testing.
Over the past years, I have worked with various cross-
functional teams, providing guidance on web development
best practices, diagnosing application performance issues, and 
[{'page': 0.0, 'source': './docs/