In [1]:
import os
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
#import dotenv
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

## To run Hugging Face OpenSource models
# Needs to manually install Visual C++ Tools from: https://visualstudio.microsoft.com/visual-cpp-build-tools/
from InstructorEmbedding import INSTRUCTOR
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import trange


In [2]:
import torch
import torch.nn as nn

# Check if CUDA is available
print(f"CUDA Available: {torch.cuda.is_available()}")

# Print CUDA device name
if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3050


In [3]:
# Ensure GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### 2.- Chunking
Is the process of breaking down a large input text into smaller pieces.
This ensures that the text fits the input size of the embedding model and improves retrieval efficiency.

In [11]:
# Define directories
# Define the persistent directory containing the VectorDB
script_dir =  os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir,'..' ,'index', 'WebScrap_CSVs'))

# Importante Leer columna de codigo como string, sino se eliminan los zeros a la izquierda.
df = pd.read_csv('hts_codes_WebScrapped.csv', encoding='utf-8', dtype={'HTS code': str})

# Split data and metadata
texts = df['Description'].tolist()  # This is the text data that will be embedded
metadata = df['HTS code'].tolist()  # This is the metadata that will be stored alongside the embeddings


# WARNING! :Only runs with this version
###### !pip install sentence-transformers==2.2.2  ######
#Define the sentence-transformer model:

#For English
#sentence-transformers/LaBSE
#embed_model = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = "sentence-transformers/all-mpnet-base-v2"

#For Spanish 
#projecte-aina/aguila-7b
#embed_model = "hiiamsid/sentence_similarity_spanish_es"

#Other sentence-transformer settings
model_kwargs = {'device': 'cuda:0'}  # specify GPU device
encode_kwargs = {'normalize_embeddings': True}

### 3.- Embedding

A technique for representing text data as numerical vectors, which can be input into machine learning models. The embedding model is responsible for converting text into these vectors.

In [12]:
#Only runs with this version
#!pip install sentence-transformers==2.2.2

hf_embed_model = HuggingFaceInstructEmbeddings(
    model_name=embed_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


load INSTRUCTOR_Transformer
max_seq_length  512


### 4.- Vector DB

A collection of pre-computed vector representations of text data for fast retrieval and similarity search, with capabilities like CRUD operations, metadata filtering, and horizontal scaling.

In [14]:
chroma_db = Chroma(
    collection_name="csv_collection",  # Name for the Chroma collection
    embedding_function=hf_embed_model.embed_query,  # Function for query embeddings
    persist_directory=persistent_dir
)

documents = []
for i, text in enumerate(texts):
    document = Document(page_content=text, metadata={"source": metadata[i]})
    documents.append(document)

# Debug to check metadata + text
print(documents[0])
print(documents[1])    
print(documents[2])   

page_content='Pure-bred breeding horses' metadata={'source': '0101.21'}
page_content='Live horses (excluding pure-bred for breeding)' metadata={'source': '0101.29'}
page_content='Live asses' metadata={'source': '0101.30'}


In [15]:
vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            #When no GPU is available
            #vector_db = Chroma.from_documents([d],embed_model, persist_directory=persistent_dir )
            
            #To enable embeddings running on GPU: embedding and ingesting at the same time
            vector_db = Chroma.from_documents([d],hf_embed_model, persist_directory=persistent_dir)
        pbar.update(1)    

Creando embeddings...: 100%|██████████| 5541/5541 [02:16<00:00, 40.64it/s]
