In [1]:
import os
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
#from dotenv import load_dotenv
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
#from langchain_openai  import OpenAIEmbeddings

## To run Hugging Face OpenSource models
# Needs to manually install Visual C++ Tools from: https://visualstudio.microsoft.com/visual-cpp-build-tools/
#from InstructorEmbedding import INSTRUCTOR
#from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import warnings, re
#from nltk.tokenize import word_tokenize
#from nltk.stem import WordNetLemmatizer

# Suppress all warnings
warnings.filterwarnings("ignore")

### Set up embedding model to use with GPU

In [2]:
import torch
import torch.nn as nn

# Check if CUDA is available
print(f"CUDA Available: {torch.cuda.is_available()}")

# Print CUDA device name
if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA Available: True
Device Name: NVIDIA GeForce RTX 3050


### Embedding model definition
A technique for representing text data as numerical vectors, which can be input into machine learning models. The embedding model is responsible for converting text into these vectors.

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModel

#Check HuggingFace leaderboard: https://huggingface.co/spaces/mteb/leaderboard
#dunzhang/stella_en_1.5B_v5
#Alibaba-NLP/gte-large-en-v1.5
#jinaai/jina-embeddings-v2-base-en
model = AutoModel.from_pretrained('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True) 

model_name = "Alibaba-NLP/gte-large-en-v1.5"
model_kwargs = {'device': 'cuda:0', 'trust_remote_code':True}
encode_kwargs = {'normalize_embeddings': True}
embedding_model_via_Transformers_class = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

### NO CHUNK Strategy

In [4]:
# Define the persistent directory containing the VectorDB
script_dir =  os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir,'..' ,'index', 'Alibaba-extended'))

# Importante Leer columna de codigo como string, sino se eliminan los zeros a la izquierda.
df = pd.read_csv('..\data\WebScrap_CSVs\hts_codes_WebScrapped.csv', encoding='utf-8', dtype={'HTS code': str})

# Split data and metadata
texts = df['Description'].tolist()  # This is the text data that will be embedded
metadata = df['HTS code'].tolist()  # This is the metadata that will be stored alongside the embeddings

documents = []
for i, text in enumerate(texts):
    document = Document(page_content=text, metadata={"source": metadata[i]})
    documents.append(document)

# Debug to check metadata + text
print(documents[0])
print(documents[1])    
print("Amount of documents is: " , len(documents))   

vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            #When no GPU is available
            #vector_db = Chroma.from_documents([d],embed_model, persist_directory=persistent_dir )
            
            #To enable embeddings running on GPU: embedding and ingesting at the same time
            vector_db = Chroma.from_documents([d],embedding_model_via_Transformers_class, persist_directory=persistent_dir)
        pbar.update(1)    

page_content='Chapter 10   Cereals' metadata={'source': '10'}
page_content='Wheat and meslin.' metadata={'source': '10.01'}
Amount of documents is:  23514


Creando embeddings...: 100%|██████████| 23514/23514 [14:31<00:00, 26.98it/s]


### SHORT SIZE chunk strategy

In [5]:
# Define the persistent directory containing the VectorDB
script_dir = os.getcwd()
persistent_dir = os.path.abspath(os.path.join(script_dir, '..', 'index', 'Alibaba-chunk50-overlap0'))

# Step 1: Read the .txt file
file_path = '..\data\hs_code_dictionary.txt'  # Path to your .txt file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Function to extract metadata (HS code) and content
def extract_metadata_and_content(line):
    match = re.match(r"(\d{4}\.\d{2})\s*(.*)", line)
    if match:
        metadata = match.group(1)  # The number part as metadata
        content = match.group(2)   # The rest of the line as content
        return metadata, content
    return None, line  # In case no match, return the line as-is

# Step 2: Split the text into lines first
lines = text.split("\n")

# Initialize the RecursiveCharacterTextSplitter
chunk_size = 50  # Define your chunk size
chunk_overlap = 0  # Set to 0 if you don't want overlapping chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Step 3: Process each line, extract metadata, split content, and create Document objects
documents = []
for line in lines:
    metadata, content = extract_metadata_and_content(line)
    
    if metadata:  # If we found valid metadata
        # Split the content into chunks
        chunks = text_splitter.split_text(content)
        
        # Create a Document object for each chunk, with metadata containing the HS code
        for chunk in chunks:
            document = Document(page_content=chunk, metadata={"hs_code": metadata, "source": file_path})
            documents.append(document)

# Step 4: Embed the documents into the vector database
vector_db = None
with tqdm(total=len(documents), desc="Creando embeddings...") as pbar:
    for d in documents:
        if vector_db:
            vector_db.add_documents([d])
        else:
            # When no GPU is available, initialize vector_db
            #vector_db = Chroma.from_documents([d], embed_model, persist_directory=persistent_dir)
            
            # To enable embeddings running on GPU, ingest documents and create embeddings
            vector_db = Chroma.from_documents([d], embedding_model_via_Transformers_class, persist_directory=persistent_dir)
        pbar.update(1)

# The 'documents' list now contains Document objects with metadata and chunks, indexed into the vector database


Creando embeddings...: 100%|██████████| 10401/10401 [06:08<00:00, 28.22it/s]
