# Extracción de Información

In [5]:
# Path to the corpus CSV file, relative to the notebook's working directory.
folder_path = "../CorpusRI.csv"

In [6]:
# %pip install pinecone
# %pip install Datasets
# %pip install langchain_text_splitters
# %pip install langchain_community

In [7]:
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import numpy as np
from pinecone import Pinecone
import os
from dotenv import load_dotenv

# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [8]:
# Populate the process environment from a local .env file, then read the
# Pinecone key from it (None when the variable is not set).
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [9]:
# Load the corpus CSV into a DataFrame.
df = pd.read_csv("../CorpusRI.csv")  # Relative path, one level above the working directory; adjust it if the file lives elsewhere


In [10]:
import re
def clean_text(text):
    """Lowercase ``text`` and delete square-bracketed annotations.

    Each shortest ``[...]`` span that does not cross a line break is removed
    (``.`` does not match newlines in this pattern), and the result is
    stripped of leading and trailing whitespace. Whitespace left around a
    removed span is not collapsed.
    """
    return re.sub(r"\[.*?\]", "", text.lower()).strip()


In [11]:
# Keep only the columns used downstream and drop rows with a missing value
# in any of them.
df = df.loc[:, ['ID', 'Candidato', 'Entrevista']].dropna()
# Cleaned interview texts, one string per remaining row.
documentos = [clean_text(entrevista) for entrevista in df['Entrevista']]

In [12]:
# Chunking configuration: separators are tried in the order given (line
# break, then period); consecutive chunks share up to 100 units of overlap
# and each chunk records its start offset in the source text.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n", "."],
    chunk_size=1000,
    chunk_overlap=100,
    strip_whitespace=True,
    add_start_index=True,
)

In [13]:
# Split every cleaned interview into chunk documents.
docs_processed = text_splitter.create_documents(texts=documentos)


In [14]:
# Load the sentence-embedding model.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# Use the GPU when one is visible and fall back to the CPU otherwise, instead
# of failing on a hard-coded "cuda" device.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    # Multi-process encoding spins up a worker pool for each embed call; it is
    # only worthwhile when several GPUs can share the work.
    multi_process=torch.cuda.device_count() > 1,
    # The device passed here is where the underlying model is placed, so no
    # separate `.to(...)` call on the client is needed afterwards.
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True},
)

In [15]:

# Embed every chunk with one batched call. embed_query() processes a single
# text per call, so looping over it gives up batching and pays the per-call
# setup cost once per chunk.
emb = embedding_model.embed_documents([doc.page_content for doc in docs_processed])


KeyboardInterrupt: 

In [None]:
import numpy as np
# Sanity check: (number of chunks, embedding dimension).
np.shape(emb)

In [None]:
from pinecone import Pinecone
# Create the Pinecone client and get a handle on the target index.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(name="test-index-1")


In [None]:
# Build the upsert payload for a small sample of chunks.
UPSERT_SAMPLE_SIZE = 10

sample_texts = [doc.page_content for doc in docs_processed[:UPSERT_SAMPLE_SIZE]]

# One batched embedding call for the whole sample rather than one
# embed_query() call per chunk.
sample_vectors = embedding_model.embed_documents(sample_texts)

# Each record carries a positional id, the embedding, and the chunk text as
# metadata so the text can be recovered from a query match.
upsert_data = [
    {
        "id": f"vec{i}",
        "values": vector,
        "metadata": {"text": text},
    }
    for i, (text, vector) in enumerate(zip(sample_texts, sample_vectors))
]

In [None]:
# Write the prepared vectors into the "ns1" namespace of the index.
index.upsert(vectors=upsert_data, namespace="ns1")