In [1]:
# Load a huggingface dataset into a search index, and try semantic filtering for a certain kind of data
# Find datasets here: https://huggingface.co/datasets 

# Relevant Documentation
# https://huggingface.co/docs/datasets/en/index 
# https://huggingface.co/docs/datasets/en/faiss_es
# https://www.sbert.net/
# https://www.sbert.net/docs/pretrained_models.html

In [None]:
from pathlib import Path

from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util

# Install FAISS with your preferred method (it will be required below)
# pip install faiss-cpu
# pip install faiss-gpu

In [None]:
# The dataset we want to use

DATASET_ID = "numind/NuNER"

# Choose the sentence transformers embedding model we want to use
# see here for a list of models: https://www.sbert.net/docs/pretrained_models.html
# this model is really fast for symmetric semantic search, with ok quality
# EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# this model is pretty good, but it's a lot slower than "all-MiniLM-L6-v2"
EMBEDDING_MODEL = "all-mpnet-base-v2"

DEVICE = 'cpu' # use "cuda" if you prefer and have a gpu available in your environment

In [17]:
# this cell may take a while
EXAMPLES_TO_INDEX = 100000
# note that the slice semantics below still download the full dataset locally
# for huge datasets the workflow below would need to be changed to use streaming semantics
# available splits for the NuNER dataset are: ['entity', 'full'].
ds = load_dataset(DATASET_ID, split=f'full[:{EXAMPLES_TO_INDEX}]', streaming=False)

In [18]:
# load our embedding model, noting the "device" kwarg
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)

In [19]:
type(ds)

datasets.arrow_dataset.Dataset

In [20]:
# Now lets embed all inputs from our dataset and put them into a search index
# IMPORTANT: don't get blocked by throughput on this part, reduce the size of the dataset if you need to

test_emb = embedder.encode(ds[0]['input'])
print(f'embedding type: {type(test_emb)}')
print(f'embedding shape: {test_emb.shape}')

embedding type: <class 'numpy.ndarray'>
embedding shape: (768,)


In [21]:
BATCH_SIZE = 2048

ds_with_embeddings = ds.map(
    lambda example: {
        'embeddings': embedder.encode(example['input'])
    },
    batched=True,
    batch_size=BATCH_SIZE
)

# 'embeddings': ctx_encoder(**ctx_tokenizer(example["line"], return_tensors="pt"))[0][0].numpy()

In [22]:
# put the embeddings in an in-memory FAISS index for fast semantic search
ds_with_embeddings.add_faiss_index(column="embeddings")

  0%|          | 0/100 [00:00<?, ?it/s]

Dataset({
    features: ['input', 'output', 'embeddings'],
    num_rows: 100000
})

In [23]:
# Use the query to find examples in the domain that you want to explore
query_prompt = 'clothing, short, trousers, jacket, pants, shirt, tshirt, skirt'
query_embedding = embedder.encode(query_prompt)

# K is the number of
K = 1000

scores, samples = ds_with_embeddings.get_nearest_examples(
    "embeddings", query_embedding, k=K
)

In [24]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)


for _, row in samples_df.iterrows():
    print(row['input'])
    print()

Rating: 5 My favourite thick denier stockings. Nice to wear on cooler days. Wear well, good ladder resistance, and wash well. Happy days! Rating: 3 Look great on but are short.

It comes below my hips. I bought a MIsses large because I like sweaters loose, but I should have bought a Misses medium or Petite Large.

Three body styles were tendered over the model run which were the standard coupe, sedan and station wagon variants.

Towels and bed linen are featured at Warren. Note: Map is an approximate indication ONLY. Please contact Warren for directions and precise location prior to your visit.

Scalloped lace trim at the hem. Please expect 4-6 weeks for completion. if length is not specified it will be 43"

We will gently process your garments, and, in the case of suedes and leathers, replenish any oils that may have been partially removed during the normal course of cleaning.

The iron does a pressing cloth, and 160 degrees Celsius, please. At the time of order contents input, the ad

In [25]:
type(samples)

dict

In [26]:
samples.keys()

dict_keys(['input', 'output', 'embeddings'])

In [27]:
# re-create a HF dataset from the items we retrieved
sampled_dataset = Dataset.from_dict(samples)

In [28]:
output_path = Path('data/sampled_dataset')
output_path.mkdir(exist_ok=True, parents=True)
sampled_dataset.save_to_disk(output_path)
print(f'saved dataset to {output_path}')

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

saved dataset to data/sampled_dataset


In [29]:
# OPTIONAL: do some additional manual filtering if needed to try to reduce your dataset down to the kinds of things you're really looking for
# https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.DatasetDict.filter
# IDEA: we could also filter with an LLM if we want to go faster / scale up 

In [30]:
# we're ready for the next step!