In [1]:
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


#### Cargo el pdf y lo separo en chunks

In [9]:
# Split the CV into overlapping chunks: 512-character windows with a
# 64-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)

loader = PyPDFLoader("./docs/mock-cv.pdf")

# Load the pages first and split them explicitly; this is what the
# `load_and_split` convenience wrapper does internally.
pages = loader.load()
chunks = splitter.split_documents(pages)

print(f"Number of chunks: {len(chunks)}")

# Fail with an actionable message instead of an IndexError on `chunks[0]`
# when the PDF produced no text at all.
if not chunks:
    raise ValueError(
        "No chunks were produced from './docs/mock-cv.pdf'; "
        "check that the PDF exists and contains extractable text."
    )

print(chunks[0])

Number of chunks: 5
page_content='EDUCATION
RICHARD SANCHEZ
MARKETING MANAGER 
CONTACT
+123-456-7890
hello@reallygreatsite.com
123 Anywhere St., Any City
www.reallygreatsite.com
SKILLS
Project Management
Public Relations
Teamwork
Time Management
Leadership
Effective Communication
Critical Thinking
WARDIERE UNIVERSITY
Master of Business
Management
2029 - 2030
2025 - 2029 
WARDIERE UNIVERSITY
Bachelor of Business
GPA: 3.8 / 4.0
English (Fluent)
French (Fluent)
German (Basics)
Spanish (Intermediate)
LANGUAGES
WORK EXPERIENCE
REFERENCE
PROFILE' metadata={'source': './docs/mock-cv.pdf', 'page': 0}


#### Embeddings

In [3]:
# Embedding model used to vectorise every chunk before it is stored.
# NOTE(review): the index cell below assumes this model emits 768-dimensional
# vectors; keep both in sync if the model name changes.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

#### Base de datos vectorial

In [10]:
# Read the Pinecone API key from the environment. API_KEY_PINECONE is the
# variable this notebook uses; PINECONE_API_KEY is accepted as a fallback.
PINECONE_API_KEY = os.getenv("API_KEY_PINECONE") or os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail early with a clear message instead of an opaque client error later.
    raise RuntimeError(
        "Falta la API key de Pinecone: define la variable de entorno API_KEY_PINECONE."
    )

# Connect to Pinecone and describe the serverless deployment target.
# Cloud and region can be overridden through the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name = 'mock-cv'

# Start from a clean slate: drop any index left over from a previous run.
# WARNING: this deletes all vectors stored under `index_name`.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("index {} borrado".format(index_name))

# Create the index unless it is (still) listed by the service.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=768,  # dimensionality of sentence-transformers/all-mpnet-base-v2
        metric='cosine',
        spec=spec,
    )
    # Report success only after create_index returned without raising.
    print("index creado con el nombre: {}".format(index_name))
else:
    print("el index con el nombre {} ya estaba creado".format(index_name))

index creado con el nombre: mock-cv


In [12]:
# Vector store bound to the Pinecone index configured in the setup cell.
# Reuse `index_name` rather than repeating the literal so that the store and
# the created index cannot drift apart if the name is changed.
pinecone = PineconeVectorStore(
    embedding=embed_model,
    index_name=index_name,
    pinecone_api_key=PINECONE_API_KEY,
)

In [13]:
# Report how many chunks are about to be embedded and stored.
total_chunks = len(chunks)
print(f"Documentos a insertar: {total_chunks}.")

# Embed the chunks and add them to the vector store; the call returns the
# ids assigned to the stored documents.
inserted_ids = pinecone.add_documents(chunks)

total_inserted = len(inserted_ids)
print(f"Insertados {total_inserted} documentos con éxito.")

Documentos a insertar: 5.
Insertados 5 documentos con éxito.
