## This example walks through

1. Importing a text document
2. Chunking it
3. Embedding it using OpenAI embeddings (replace the `API_KEY` placeholder below with your OpenAI API key to follow along)
4. Loading it into a Chroma collection
5. Exporting it to a HuggingFace Dataset
6. Uploading it to HuggingFace

See `README.md` for the rest of the instructions on opening a PR to add a dataset.

In [None]:
!pip install datasets --quiet
!pip install huggingface_hub --quiet
!pip install chromadb --quiet

import chromadb
from chromadb.utils import embedding_functions

In [None]:
## First import the dataset
from langchain.text_splitter import CharacterTextSplitter

# Read the essay as one string. Reading the file whole (instead of joining
# readlines() with " ") avoids inserting a stray space at the start of every
# line, since each line already ends with its own newline.
with open('./examples/data/paul_graham_essay.txt', "r", encoding="utf-8") as file:
    content = file.read()

# split into chunks, breaking on newlines, with no overlap between chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator="\n")
content = text_splitter.split_text(content)

# Embedding function backed by OpenAI.
# Replace the "API_KEY" placeholder with your own OpenAI API key before running.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="API_KEY",
    model_name="text-embedding-ada-002",
)

# Create the collection with the OpenAI embedding function attached.
client = chromadb.Client()
collection = client.create_collection("paul_graham_essays", embedding_function=openai_ef)

# Add one record per chunk: the id is the chunk's position as a string, and
# every chunk carries the same author metadata.
collection.add(
    ids=[str(i) for i in range(len(content))],
    documents=content,
    metadatas=[{"author": "Paul Graham"} for _ in content],
)

In [None]:
from chroma_datasets.utils import export_collection_to_hf_dataset

# Name of the Chroma collection to export and the Hub repository to upload to.
collection_name = "paul_graham_essays"
hub_repo_id = "chromadb/paul_graham_essay"

# Export the collection from the Chroma client.
# NOTE(review): the third argument ("MIT") is presumably the dataset license;
# confirm against chroma_datasets.utils.
dataset = export_collection_to_hf_dataset(client, collection_name, "MIT")

# Upload the exported dataset under the "data" split, then show a summary.
dataset.push_to_hub(hub_repo_id, split="data")
print(dataset)