In [None]:
# Load a Hugging Face dataset into a search index, then use semantic search to filter for a certain kind of data
# Find datasets here: https://huggingface.co/datasets

# Relevant Documentation
# https://huggingface.co/docs/datasets/en/index 
# https://huggingface.co/docs/datasets/en/faiss_es
# https://www.sbert.net/
# https://www.sbert.net/docs/pretrained_models.html

In [None]:
from pathlib import Path

from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util

# Install FAISS with your preferred method (it will be required below)
# pip install faiss-cpu
# pip install faiss-gpu

In [None]:
# Hugging Face Hub id of the dataset we want to use
DATASET_ID = "numind/NuNER"

# Sentence-transformers embedding model we want to use.
# List of available models: https://www.sbert.net/docs/pretrained_models.html
#
# Option 1 - really fast for symmetric semantic search, with ok quality:
# EMBEDDING_MODEL = "all-MiniLM-L6-v2"
#
# Option 2 - pretty good quality, but a lot slower than "all-MiniLM-L6-v2":
EMBEDDING_MODEL = "all-mpnet-base-v2"

# Device the embedding model runs on: "cpu", "cuda" or "mps"
#   "cuda": you have a graphics unit with cuda cores (nvidia has them, for example)
#   "mps":  you are using an apple machine with an M-series chip
#   "cpu":  neither of the above is an option, your graphics unit has very limited
#           memory, or you are just curious about cpu performance
DEVICE = "mps"

In [None]:
# This cell may take a while.
# NOTE: the slice semantics used below still download the full dataset locally;
# for huge datasets this workflow would need to be changed to use streaming semantics.
EXAMPLES_TO_INDEX = 100_000

# Available splits for the NuNER dataset are: ['entity', 'full'].
# Keep only the first EXAMPLES_TO_INDEX rows of the 'full' split.
split_spec = f'full[:{EXAMPLES_TO_INDEX}]'
ds = load_dataset(DATASET_ID, split=split_spec, streaming=False)

In [None]:
# Load our embedding model. Note the "device" kwarg: it is set from DEVICE,
# so change that constant to pick where the model runs.
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)

In [None]:
# Inspect what kind of object load_dataset returned for the sliced split
type(ds)

In [None]:
# Now let's embed all inputs from our dataset and put them into a search index.
# IMPORTANT: don't get blocked by throughput on this part, reduce the size of the dataset if you need to.
#
# Sanity check first: embed a single example and look at what the embedder returns.
first_input = ds[0]['input']
test_emb = embedder.encode(first_input)

print(f'embedding type: {type(test_emb)}')
print(f'embedding shape: {test_emb.shape}')

In [None]:
# Number of dataset rows handed to the embedding function per `map` call
BATCH_SIZE = 2048


def embed_batch(batch):
    """Embed one batch of dataset rows.

    Called by `ds.map(..., batched=True)`, so `batch['input']` holds the
    'input' values for a whole batch rather than a single example. They are
    passed to `embedder.encode` in one call and the result is stored under a
    new 'embeddings' column.
    """
    return {'embeddings': embedder.encode(batch['input'])}


ds_with_embeddings = ds.map(embed_batch, batched=True, batch_size=BATCH_SIZE)

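# For reference only (not used in this notebook): an alternative per-example encoder/tokenizer call
# 'embeddings': ctx_encoder(**ctx_tokenizer(example["line"], return_tensors="pt"))[0][0].numpy()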

In [None]:
# put the embeddings in an in-memory FAISS index for fast semantic search
# NOTE(review): no metric_type is passed, so the library's default index is used. Per the
# HF datasets docs this is a flat L2 index, which would make the scores returned by later
# searches distances (smaller = more similar) — confirm against your installed version.
ds_with_embeddings.add_faiss_index(column="embeddings")

In [None]:
# K is the number of nearest neighbours to retrieve from the index
K = 1000

# Use the query to find examples in the domain that you want to explore.
# The query is embedded with the same model that embedded the dataset.
query_prompt = 'clothing, short, trousers, jacket, pants, shirt, tshirt, skirt'
query_embedding = embedder.encode(query_prompt)

# Look up the K nearest examples in the "embeddings" index
scores, samples = ds_with_embeddings.get_nearest_examples("embeddings", query_embedding, k=K)

In [None]:
import pandas as pd

# Collect the retrieved examples and their scores into one DataFrame.
#
# The FAISS index above was built without a metric_type, so it uses the default
# L2 distance: a LOWER score means a CLOSER match. Sort ascending so the most
# relevant examples are printed first (descending order would list the weakest
# of the K matches first).
samples_df = (
    pd.DataFrame.from_dict(samples)
    .assign(scores=scores)
    .sort_values("scores", ascending=True)
)

# Print the retrieved inputs, best match first, separated by blank lines
for input_text in samples_df['input']:
    print(input_text)
    print()

In [None]:
# Inspect the container type returned by get_nearest_examples
type(samples)

In [None]:
# List the keys available in the retrieved samples
samples.keys()

In [None]:
# Re-create a HF dataset from the items we retrieved.
# All keys present in `samples` are carried over into the new dataset.
sampled_dataset = Dataset.from_dict(samples)

In [None]:
# Persist the retrieved subset so later steps can load it from disk
output_path = Path('data/sampled_dataset')

# Create the target directory (and any missing parents); fine if it already exists
output_path.mkdir(parents=True, exist_ok=True)

sampled_dataset.save_to_disk(output_path)
print(f'saved dataset to {output_path}')

In [None]:
# OPTIONAL: do some additional manual filtering if needed to try to reduce your dataset down to the kinds of things you're really looking for
# https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.DatasetDict.filter
# IDEA: we could also filter with an LLM if we want to go faster / scale up 

In [None]:
# we're ready for the next step!