## This example walks through

1. Importing a text document
2. Chunking it
3. Embedding it using OpenAI embeddings (set your API_KEY below to follow along)
4. Loading it into a Chroma collection
5. Exporting it to a HuggingFace Dataset
6. Uploading it to HuggingFace
7. Save it to disk
8. Loading it from disk

See `README.md` to see the rest of the instructions for opening a PR to add a dataset.

In [1]:
!pip install datasets --quiet
!pip install huggingface_hub --quiet
!pip install chromadb --quiet

import chromadb
from chromadb.utils import embedding_functions

import datasets
from chroma_datasets.utils import export_collection_to_hf_dataset, export_collection_to_hf_dataset_to_disk, import_chroma_exported_hf_dataset_from_disk

In [2]:
## First import the dataset
with open('./data/paul_graham_essay.txt', "r") as file:
            content = file.readlines()

# merge into one string
content = " ".join(content)

# split into chunks
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator="\n")
content = text_splitter.split_text(content)
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key="API_KEY",
                model_name="text-embedding-ada-002"
            )

client = chromadb.Client()
collection = client.create_collection("paul_graham_essays", embedding_function=openai_ef)

collection.add(
    ids=[str(i) for i in range(len(content))],
    documents=content,
    metadatas=[{"author": "Paul Graham"} for i in range(len(content))]
)

Created a chunk of size 1005, which is longer than the specified 1000
Created a chunk of size 1204, which is longer than the specified 1000
Created a chunk of size 1026, which is longer than the specified 1000


In [3]:
# create a HuggingFace Collection and upload it to the Hub
dataset = export_collection_to_hf_dataset(client, "paul_graham_essays", "MIT")
dataset.push_to_hub("chromadb/paul_graham_essay", split="data")
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm
                                                                                            

Saved dataset to ./data/paul_graham_essay2




In [4]:
# Export the collection to disk
export_collection_to_hf_dataset_to_disk(client, "paul_graham_essays", "./data/paul_graham_essay2", "MIT")

In [5]:
# Import the collection from disk
collection = import_chroma_exported_hf_dataset_from_disk(client, "./data/paul_graham_essay2", "paul_graham_essay_test")
collection.peek()

Caution: No embedding function provided. Using the default embedding function.
Loaded 104 documents into the collection named: paul_graham_essay_test


{'ids': ['0', '103', '2', '3', '4', '5', '6', '7', '8', '9'],
 'embeddings': [[-3.437051418586634e-05,
   -0.01011231355369091,
   -0.008068406023085117,
   -0.03267570585012436,
   -0.008148822002112865,
   0.028574489057064056,
   -0.034230414777994156,
   -0.009261243976652622,
   -0.007002894300967455,
   -0.01901838555932045,
   0.018696721643209457,
   0.042057573795318604,
   -0.009938078932464123,
   -0.010400470346212387,
   0.005518548656255007,
   0.0288157369941473,
   0.01986275427043438,
   -0.006369618233293295,
   0.005605665966868401,
   -0.004714388400316238,
   -0.01460890844464302,
   -0.00622218893840909,
   0.010239638388156891,
   -0.008008094504475594,
   -0.014689324423670769,
   -0.020613303408026695,
   0.014528492465615273,
   -0.02463410422205925,
   0.020841149613261223,
   -0.010433977469801903,
   0.018039992079138756,
   -0.0204122643917799,
   -0.000993472756817937,
   -0.01255830004811287,
   -0.02496917173266411,
   -0.00787406787276268,
   -0.022355