# Crear bases de vectores de Chroma: 1, 2, 3, 4, 5 y 6
## Cada vez que se vaya a crear una nueva base de vectores, es mejor reiniciar el kernel

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document

### Se cargan los datos

In [2]:
# PARA CREAR METADATOS
import csv
from typing import Dict, List, Optional
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document

class CSVLoader(BaseLoader):
    """Load a CSV file into one ``Document`` per row.

    Every row yielded by ``csv.DictReader`` becomes a document whose text is
    a newline-joined list of ``"column: value"`` pairs. Columns listed in
    ``metadata_columns`` are stored in the document metadata instead of the
    text, and ``source_column`` is left out of the text.

    Args:
        file_path: Path of the CSV file to read.
        source_column: Optional column name to exclude from the page content.
        metadata_columns: Column names stored as metadata rather than text.
        csv_args: Extra keyword arguments forwarded to ``csv.DictReader``.
        encoding: Encoding passed to ``open``.
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        # Fall back to fresh containers so instances never share state.
        self.csv_args = csv_args or {}
        self.metadata_columns = metadata_columns or []

    @staticmethod
    def _cell_text(value) -> str:
        """Return a CSV key or cell as stripped text.

        ``csv.DictReader`` fills missing cells of a short row with ``restval``
        (``None`` by default) and collects overflow cells of a long row in a
        list, so a plain ``.strip()`` would raise on ragged rows.
        """
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ",".join(str(item).strip() for item in value)
        return str(value).strip()

    def load(self) -> List[Document]:
        """Read the CSV file and return one ``Document`` per row.

        Returns:
            The documents in file order. Each carries a ``"row"`` metadata
            entry with the zero-based index of the row as yielded by the
            reader, plus one entry per configured metadata column that is
            present in the row. A metadata column that is itself named
            ``"row"`` replaces the index.
        """
        docs = []
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)
            for i, row in enumerate(csv_reader):
                metadata = {"row": i}
                for col in self.metadata_columns:
                    if col in row:
                        metadata[col] = self._cell_text(row[col])
                # Same column filter as before; only the formatting is made
                # tolerant of None/list values and of a None overflow key.
                content = [
                    f"{self._cell_text(k)}: {self._cell_text(v)}"
                    for k, v in row.items()
                    if k != self.source_column and k not in self.metadata_columns
                ]
                docs.append(
                    Document(page_content="\n".join(content), metadata=metadata)
                )

        return docs

In [3]:
# Columns to move from the document text into the document metadata.
# NOTE: "row" is also the key CSVLoader.load() uses for the row index, so a
# CSV column with that name would replace the index in the metadata.
metadata_columns = ["row"]

# Instantiate the CSVLoader with the CSV file and the metadata columns.
loader = CSVLoader(
    "cordis_data_processed_29052024.csv",
    encoding="latin1",
    metadata_columns=metadata_columns,
    source_column=None,  # Optional: column to set as the source
)

# Load the documents from the CSV.
raw_documents = loader.load()

## Chroma_db_1, Chroma_db_2, Chroma_db_3: mismos chunks (1000), distintos embeddings

In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Split on single spaces into chunks of up to 1000 characters (measured
# with len), with an 800-character overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=800,
    separator=" ",
    is_separator_regex=False,
    length_function=len,
)
all_splits = text_splitter.split_documents(raw_documents)

# Last expression of the cell: displays how many chunks were produced.
len(all_splits)

### Chroma_db_1: GPT4ALLEmbeddings (22,7M parámetros)

In [None]:
from langchain_community.embeddings import GPT4AllEmbeddings

# Embedding model used for chroma_db_1, created with its default arguments.
emb = GPT4AllEmbeddings()

In [None]:
# Build the vector store from the chunks, using ./chroma_db_1 as its
# persist directory.
vectorstore = Chroma.from_documents(
    all_splits,
    embedding=emb,
    persist_directory="./chroma_db_1",
)

### Chroma_db_2: bge-large-en (335M parámetros)

In [None]:
# One of the GPUs is used.
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cuda:1")  # run the model on CUDA device 1
encode_kwargs = dict(normalize_embeddings=True)

# Embedding model used for chroma_db_2.
emb = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [None]:
# Build the vector store from the chunks, using ./chroma_db_2 as its
# persist directory.
vectorstore = Chroma.from_documents(
    all_splits,
    embedding=emb,
    persist_directory="./chroma_db_2",
)

### Chroma_db_3: all-mpnet-base-v2 (109M parámetros)

In [None]:
# One of the GPUs is used.
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

model_name = "all-mpnet-base-v2"
model_kwargs = dict(device="cuda:1")  # specify GPU device

# Embedding model used for chroma_db_3.
emb = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

In [None]:
# Build the vector store from the chunks, using ./chroma_db_3 as its
# persist directory.
vectorstore = Chroma.from_documents(
    all_splits,
    embedding=emb,
    persist_directory="./chroma_db_3",
)

## Chroma_db_4, Chroma_db_5, Chroma_db_6: mismos chunks (500), distintos embeddings

In [4]:
from langchain_text_splitters import CharacterTextSplitter

# Split on single spaces into chunks of up to 500 characters (measured
# with len), with a 400-character overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=400,
    separator=" ",
    is_separator_regex=False,
    length_function=len,
)
all_splits = text_splitter.split_documents(raw_documents)

# Last expression of the cell: displays how many chunks were produced.
len(all_splits)

4114629

### Chroma_db_4: GPT4ALLEmbeddings (22,7M parámetros)

In [None]:
from langchain_community.embeddings import GPT4AllEmbeddings

# Embedding model used for chroma_db_4, created with its default arguments.
emb = GPT4AllEmbeddings()

In [None]:
# Build the vector store from the chunks, using ./chroma_db_4 as its
# persist directory.
vectorstore = Chroma.from_documents(
    all_splits,
    embedding=emb,
    persist_directory="./chroma_db_4",
)

### Chroma_db_5: bge-large-en (335M parámetros)

In [None]:
# One of the GPUs is used.
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cuda:1")  # run the model on CUDA device 1
encode_kwargs = dict(normalize_embeddings=True)

# Embedding model used for chroma_db_5.
emb = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [None]:
# Build the vector store from the chunks, using ./chroma_db_5 as its
# persist directory.
vectorstore = Chroma.from_documents(
    all_splits,
    embedding=emb,
    persist_directory="./chroma_db_5",
)

### Chroma_db_6: all-mpnet-base-v2 (109M parámetros)

In [5]:
# One of the GPUs is used.
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

model_name = "all-mpnet-base-v2"
model_kwargs = dict(device="cuda:1")  # specify GPU device

# Embedding model used for chroma_db_6.
emb = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

# Build the vector store from the chunks, using ./chroma_db_6 as its
# persist directory.
vectorstore = Chroma.from_documents(
    all_splits,
    embedding=emb,
    persist_directory="./chroma_db_6",
)

  from tqdm.autonotebook import tqdm, trange
