# Dataset Stuff

### Load Dataset

In [2]:
# Import from langchain_community: the `langchain.document_loaders` path is a
# deprecated re-export, and this notebook already depends on langchain_community.
from langchain_community.document_loaders import HuggingFaceDatasetLoader

# Hugging Face Hub dataset to load, and the column whose text becomes each
# Document's page_content. The preview further down shows the 'question' and
# 'answer' columns ending up in each Document's metadata.
dataset_name = "neural-bridge/rag-dataset-12000"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data as a list of Document objects.
# NOTE(review): the preview output below shows page_content wrapped in double
# quotes with escaped "\\n" / "\\u2019" sequences, which suggests the loader
# JSON-encodes the column text. If so, the splitter will not see real newlines;
# consider decoding (e.g. json.loads) before splitting -- TODO confirm.
data = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import numpy as np

# Record which NumPy release this kernel is running against.
numpy_version = np.__version__
print(numpy_version)

1.26.4


### Preview the loaded documents

In [3]:
# Display the first 2 entries (each is a Document with page_content and metadata)
data[:2]

[Document(metadata={'question': 'What is the Berry Export Summary 2028 and what is its purpose?', 'answer': 'The Berry Export Summary 2028 is a dedicated export plan for the Australian strawberry, raspberry, and blackberry industries. It maps the sectors’ current position, where they want to be, high-opportunity markets, and next steps. The purpose of this plan is to grow their global presence over the next 10 years.'}, page_content='"Caption: Tasmanian berry grower Nic Hansen showing Macau chef Antimo Merone around his property as part of export engagement activities.\\nTHE RISE and rise of the Australian strawberry, raspberry and blackberry industries has seen the sectors redouble their international trade focus, with the release of a dedicated export plan to grow their global presence over the next 10 years.\\nDriven by significant grower input, the Berry Export Summary 2028 maps the sectors\\u2019 current position, where they want to be, high-opportunity markets and next steps.\\nH

### Split text with RecursiveCharacterTextSplitter

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.utils import filter_complex_metadata

# FIXME: confirm whether this metadata filtering step is actually needed.
# Run the loaded documents through filter_complex_metadata before chunking.
filtered_data = filter_complex_metadata(data)

# FIXME: chunk size and overlap are untuned hyperparameters.
# Chunks hold up to 1000 characters, and consecutive chunks overlap by
# 150 characters.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the filtered documents into chunks; `docs` feeds the later cells.
docs = text_splitter.split_documents(documents=filtered_data)

### Embeddings

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face identifier of the pre-trained sentence-embedding model
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Encoding options: leave the embedding vectors un-normalized
encode_kwargs = dict(normalize_embeddings=False)

# Model options: run the computations on the CPU
model_kwargs = dict(device="cpu")

# Build the embeddings wrapper from the settings defined above
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=modelPath,
)




In [8]:
# Sanity check: report any chunk that is missing its text or its metadata.
for doc in docs:
    has_content = doc.page_content is not None
    has_metadata = doc.metadata is not None
    if has_content and has_metadata:
        continue  # nothing to report for this chunk
    print("Invalid document found:", doc)

### Vector Stores

##### Note: this cell embeds every chunk on the CPU, so it takes a very long time to run.

In [9]:
from langchain_chroma import Chroma
from langchain.globals import set_debug

# Switch on LangChain's global debug mode before building the store.
set_debug(True)

# Build the Chroma vector store from the chunked documents, using the
# embeddings object configured earlier to embed them.
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
)

: 