# Crear bases de vectores de Chroma: 1, 2, 3, 4, 5 y 6
## Cada vez que se vaya a crear una nueva base de vectores, es mejor reiniciar el kernel

In [1]:
!pip show langchain-community
!pip show chromadb
!pip show gpt4all
!pip show langchain-text-splitters

Name: langchain-community
Version: 0.2.19
Summary: Community contributed LangChain integrations.
Home-page: https://github.com/langchain-ai/langchain
Author: None
Author-email: None
License: MIT
Location: c:\users\emolt\anaconda3\lib\site-packages
Requires: SQLAlchemy, requests, tenacity, langsmith, langchain-core, numpy, PyYAML, aiohttp, langchain, dataclasses-json
Required-by: 
Name: chromadb
Version: 0.5.23
Summary: Chroma.
Home-page: None
Author: None
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: None
Location: c:\users\emolt\anaconda3\lib\site-packages
Requires: uvicorn, tokenizers, build, tqdm, opentelemetry-exporter-otlp-proto-grpc, pydantic, mmh3, chroma-hnswlib, kubernetes, onnxruntime, typer, orjson, typing-extensions, fastapi, posthog, opentelemetry-instrumentation-fastapi, opentelemetry-api, rich, bcrypt, numpy, overrides, grpcio, importlib-resources, opentelemetry-sdk, PyYAML, httpx, pypika, tenacity, graphlib-backport
Requir

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document

### Se cargan los datos

In [3]:
# PARA CREAR METADATOS
import csv
from typing import Dict, List, Optional
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document

class CSVLoader(BaseLoader):
    """Load a CSV file into one ``Document`` per data row.

    Each row becomes a document whose ``page_content`` is a newline-joined list
    of ``"column: value"`` lines. Columns listed in ``metadata_columns`` are
    stored in the document metadata instead of the content, and
    ``source_column`` is left out of the content.

    Args:
        file_path: Path of the CSV file to read.
        source_column: Column to leave out of ``page_content``. This loader does
            not copy it anywhere else.
        metadata_columns: Columns stored in ``metadata`` rather than in the content.
        csv_args: Extra keyword arguments forwarded to ``csv.DictReader``.
        encoding: Encoding passed to ``open``.
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        # Fall back to fresh containers so instances never share a mutable default.
        self.csv_args = csv_args or {}
        self.metadata_columns = metadata_columns or []

    @staticmethod
    def _cell_text(value) -> str:
        """Normalise one ``DictReader`` cell to a stripped string.

        ``csv.DictReader`` yields ``None`` for cells missing from a short row and
        a list of strings for the surplus cells of a long row, so a cell cannot
        be ``.strip()``-ed unconditionally.
        """
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item).strip() for item in value)
        return str(value).strip()

    def load(self) -> List[Document]:
        """Read the whole CSV file and return one ``Document`` per row.

        Returns:
            The documents in file order. Each document's metadata holds ``"row"``
            (the zero-based row index) plus every requested metadata column found
            in the row; a metadata column itself named ``"row"`` replaces the index.
        """
        docs = []
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)
            for i, row in enumerate(csv_reader):
                metadata = {"row": i}
                for col in self.metadata_columns:
                    if col in row:
                        metadata[col] = self._cell_text(row[col])
                # Surplus cells of an over-long row arrive under the key None.
                # They were already skipped whenever source_column was None;
                # skip them always so a configured source_column cannot crash.
                content = [
                    f"{str(k).strip()}: {self._cell_text(v)}"
                    for k, v in row.items()
                    if k is not None
                    and k != self.source_column
                    and k not in self.metadata_columns
                ]
                docs.append(
                    Document(page_content="\n".join(content), metadata=metadata)
                )

        return docs

In [4]:
# Columns that go into each document's metadata instead of its text.
metadata_columns = ["row"]

# Point the custom CSVLoader at the processed CSV file (read as latin-1).
loader = CSVLoader(
    encoding="latin1",
    metadata_columns=metadata_columns,
    source_column=None,  # optional; None means no source column is set
    file_path=r"C:\Users\emolt\OneDrive - UMH\MASTER\TFM\BASE\cordis_data_processed.csv",
)

# One Document per CSV row.
raw_documents = loader.load()

## Chroma_db_1, Chroma_db_2, Chroma_db_3: mismos chunks (1000), distintos embeddings

In [5]:
!pip show langchain-text-splitters

Name: langchain-text-splitters
Version: 0.2.4
Summary: LangChain text splitting utilities
Home-page: https://github.com/langchain-ai/langchain
Author: None
Author-email: None
License: MIT
Location: c:\users\emolt\anaconda3\lib\site-packages
Requires: langchain-core
Required-by: langchain


In [6]:
import pkgutil

# Print every importable top-level module whose name contains "langchain".
print(
    [
        module_info.name
        for module_info in pkgutil.iter_modules()
        if "langchain" in module_info.name
    ]
)

['langchain', 'langchain_community', 'langchain_core', 'langchain_ollama', 'langchain_text_splitters']


In [7]:
from langchain.text_splitter import CharacterTextSplitter

# Split on single spaces into chunks of up to 1000 characters (measured with
# len), configured with a chunk_overlap of 800.
splitter_options = dict(
    chunk_size=1000,
    chunk_overlap=800,
    separator=" ",
    is_separator_regex=False,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_options)

# Only the first 100 loaded documents are split in this cell.
all_splits = text_splitter.split_documents(raw_documents[:100])
len(all_splits)

960

### Chroma_db_1: GPT4AllEmbeddings (22,7M parámetros)

In [12]:
!pip show gpt4all

Name: gpt4all
Version: 2.8.2
Summary: Python bindings for GPT4All
Home-page: https://gpt4all.io/
Author: Nomic and the Open Source Community
Author-email: support@nomic.ai
License: UNKNOWN
Location: c:\users\emolt\anaconda3\lib\site-packages
Requires: importlib-resources, requests, tqdm
Required-by: 


In [9]:
import sys

# Show which interpreter backs this kernel and where it searches for packages.
# Expected: the py310real environment and its site-packages directory.
print(sys.executable, sys.path, sep="\n")

C:\Users\emolt\anaconda3\envs\py310real\python.exe
['C:\\Users\\emolt\\anaconda3\\envs\\py310real\\python310.zip', 'C:\\Users\\emolt\\anaconda3\\envs\\py310real\\DLLs', 'C:\\Users\\emolt\\anaconda3\\envs\\py310real\\lib', 'C:\\Users\\emolt\\anaconda3\\envs\\py310real', '', 'C:\\Users\\emolt\\anaconda3\\envs\\py310real\\lib\\site-packages', 'C:\\Users\\emolt\\anaconda3\\envs\\py310real\\lib\\site-packages\\win32', 'C:\\Users\\emolt\\anaconda3\\envs\\py310real\\lib\\site-packages\\win32\\lib', 'C:\\Users\\emolt\\anaconda3\\envs\\py310real\\lib\\site-packages\\Pythonwin']


In [11]:
# Import from langchain_community directly instead of going through the
# `langchain.embeddings` compatibility re-export.
from langchain_community.embeddings import GPT4AllEmbeddings

# Instantiate with no arguments, i.e. whatever defaults the class defines.
emb = GPT4AllEmbeddings()

In [13]:
# Para crear la base de vectores
# Embed the chunks with `emb` and store the collection under ./chroma_db_1.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db_1",
    embedding=emb,
    documents=all_splits,
)

### Chroma_db_2: bge-large-en (335M parámetros)

In [14]:
# SE USA UNA DE LAS GPU
# Runs on the CPU because this machine has no NVIDIA GPU. On a CUDA machine,
# use model_kwargs = {"device": "cuda:0"} instead.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma

model_name = "BAAI/bge-large-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

  emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Embed the chunks with `emb` and store the collection under ./chroma_db_2.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db_2",
    embedding=emb,
    documents=all_splits,
)

### Chroma_db_3: all-mpnet-base-v2 (109M parámetros)

In [16]:
# SE USA UNA DE LAS GPU
# Runs on the CPU; set "device" to a CUDA device string to use a GPU instead.
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

model_name = "all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}

emb = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

  emb = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)


In [17]:
# Embed the chunks with `emb` and store the collection under ./chroma_db_3.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db_3",
    embedding=emb,
    documents=all_splits,
)

## Chroma_db_4, Chroma_db_5, Chroma_db_6: mismos chunks (500), distintos embeddings

In [18]:
from langchain_text_splitters import CharacterTextSplitter

# Split on single spaces into chunks of up to 500 characters (measured with
# len), configured with a chunk_overlap of 400.
splitter_options = dict(
    chunk_size=500,
    chunk_overlap=400,
    separator=" ",
    is_separator_regex=False,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_options)

# Every loaded document is split in this cell.
all_splits = text_splitter.split_documents(raw_documents)
len(all_splits)

4113930

### Chroma_db_4: GPT4AllEmbeddings (22,7M parámetros)

In [19]:
# GPT4AllEmbeddings is imported from the langchain_community package.
from langchain_community.embeddings import GPT4AllEmbeddings

# Instantiate with no arguments, i.e. whatever defaults the class defines.
emb = GPT4AllEmbeddings()

In [None]:
# Embed the chunks with `emb` and store the collection under ./chroma_db_4.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db_4",
    embedding=emb,
    documents=all_splits,
)

### Chroma_db_5: bge-large-en (335M parámetros)

In [None]:
# SE USA UNA DE LAS GPU
# Runs on the CPU. On a machine with CUDA GPUs, use
# model_kwargs = {"device": "cuda:1"} instead.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma

model_name = "BAAI/bge-large-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [None]:
# Embed the chunks with `emb` and store the collection under ./chroma_db_5.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db_5",
    embedding=emb,
    documents=all_splits,
)

### Chroma_db_6: all-mpnet-base-v2 (109M parámetros)

In [None]:
# SE USA UNA DE LAS GPU
# Runs on the CPU. On a machine with CUDA GPUs, use
# model_kwargs = {"device": "cuda:1"} instead.
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

model_name = "all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}
emb = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Embed the chunks with `emb` and store the collection under ./chroma_db_6.
vectorstore = Chroma.from_documents(documents=all_splits, embedding=emb, persist_directory="./chroma_db_6")