## Import Misc data

This notebook demonstrates how to pull misc data (in this case `SciQ`) from HuggingFace and massage it into the format that Chroma wants. 

You can then export it using `export_collection_to_hf_dataset` to upload it to HuggingFace. Alternatively, you can embed this logic into your Dataset class. See `sciq.py` for an example.

In [None]:
! pip install chromadb --quiet
! pip install chroma_datasets --quiet

In [None]:
import chromadb
import uuid
from chroma_datasets.utils import import_into_chroma, load_huggingface_dataset, to_chroma_schema

# Pull the `test` split of SciQ from HuggingFace.
dataset = load_huggingface_dataset(dataset_name="sciq", split_name="test")

# sciq has rows with questions and supporting evidence
# we want to embed both in chroma, as separate documents tagged by `type`
# in the metadata so they can be filtered at query time
dataset_rows = []
for row in dataset:

    # add questions
    dataset_rows.append({
        "id": str(uuid.uuid4()),
        "document": row["question"],
        "metadata": {
            "type": "question",
        }
    })

    # add supporting evidence
    # SciQ does not provide a supporting paragraph for every question; an
    # empty document carries no information and would only add noise to the
    # `supporting_evidence` results, so skip missing/blank values.
    support = row["support"]
    if not support or not support.strip():
        continue

    dataset_rows.append({
        "id": str(uuid.uuid4()),
        "document": support,
        "metadata": {
            "type": "supporting_evidence"
        }
    })

# Convert the rows with `to_chroma_schema` and load them into a collection
# on a default Chroma client.
chroma_client = chromadb.Client()
sciq_coll = import_into_chroma(chroma_client=chroma_client, dataset=to_chroma_schema(dataset_rows))

# Sanity check: number of documents now stored in the collection.
print(sciq_coll.count())


In [None]:
# Example lookup: ask a SciQ-style question and restrict the search to the
# documents that were tagged as supporting evidence during import.
question_text = "What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?"
evidence_only = {"type": "supporting_evidence"}
top_k = 3

res = sciq_coll.query(query_texts=question_text, where=evidence_only, n_results=top_k)
print(res)