# Working with Hugging Face

# 1. Getting Started with Hugging Face

## Introduction to Hugging Face

## Transformers and the Hub

### Searching the Hub with Python

In [None]:
pip install huggingface_hub
from huggingface_hub import HfApi
list(api.list_models())

# Create the instance of the API
api = HfApi()

# Return the filtered list from the Hub
models = api.list_models(
    filter=ModelFilter(task="text-classification"),
    sort="downloads",
    direction=-1,
  	limit=1
)

# Store as a list
modelList = list(models)

print(modelList[0].modelId)

### Saving a model

In [None]:
# AutoModel is not imported by any earlier cell, so bring it into scope here
from transformers import AutoModel

modelId = "distilbert-base-uncased-finetuned-sst-2-english"

# Instantiate the AutoModel class from the Hub checkpoint
model = AutoModel.from_pretrained(modelId)

# Save the model weights and config under models/<modelId>
model.save_pretrained(save_directory=f"models/{modelId}")

## Working with datasets

In [None]:
pip install datasets

# Load the module
from datasets import load_dataset_builder

# Create the dataset builder
reviews_builder = load_dataset_builder("derenrich/wikidata-en-descriptions-small")

# Print the features
print(reviews_builder.info.features)

### Loading Datasets

In [None]:
# load_dataset is not imported by any earlier cell
from datasets import load_dataset

# Load the train portion of the dataset.
# "20231101.en" is the configuration (dump date + language) of the
# wikimedia/wikipedia dataset, so it is passed as the config `name`.
# NOTE(review): this is the full English dump — expect a large download.
wikipedia = load_dataset("wikimedia/wikipedia", name="20231101.en", split="train")

print(f"The length of the dataset is {len(wikipedia)}")

### Manipulating datasets

In [None]:
def _mentions_football(row):
    """Return True when the row's "text" field contains the substring 'football'."""
    return 'football' in row["text"]


# Keep only the documents whose text mentions football
filtered = wikipedia.filter(_mentions_football)

# Build a one-row sample dataset from the first match
example = filtered.select([0])

print(example[0]["text"])

# 2. Building Pipelines with Hugging Face

## Pipelines with Hugging Face

### Getting started with Pipelines

In [None]:
from transformers import pipeline

# Pipeline selected by task name only (no explicit model)
task_pipeline = pipeline(task="sentiment-analysis")

# Pipeline selected by an explicit model checkpoint
model_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")

# NOTE(review): `input` is not assigned in the visible cells; unless an earlier
# cell defines it, this is Python's builtin `input` function — confirm.
task_output = task_pipeline(input)
model_output = model_pipeline(input)

# Pull out the top label from each result before reporting
task_label = task_output[0]['label']
model_label = model_output[0]['label']
print(f"Sentiment from task_pipeline: {task_label}; Sentiment from model_pipeline: {model_label}")

# <script.py> output: Sentiment from task_pipeline: POSITIVE; Sentiment from model_pipeline: POSITIVE

### Using AutoClasses

In [None]:
# These names are used here before any cell imports them
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Download the model and tokenizer from the same checkpoint
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# Create the pipeline from the already-loaded model and tokenizer
sentimentAnalysis = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# Predict the sentiment
# NOTE(review): `input` is not assigned in the visible cells; unless an earlier
# cell defines it, this is Python's builtin `input` function — confirm.
output = sentimentAnalysis(input)

print(f"Sentiment using AutoClasses: {output[0]['label']}")

# <script.py> output: Sentiment using AutoClasses: POSITIVE

### Comparing models with the pipeline

In [None]:
# Sentiment pipeline backed by the DistilBERT SST-2 checkpoint
distil_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Run the pipeline on the sample text
# NOTE(review): `input` is not assigned in the visible cells — confirm it is defined upstream.
distil_output = distil_pipeline(input)

In [None]:
# First model: DistilBERT fine-tuned on SST-2
distil_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# NOTE(review): `input` is not assigned in the visible cells — confirm it is defined upstream.
distil_output = distil_pipeline(input)

# Second model: a BERT sentiment checkpoint, run on the same text
bert_pipeline = pipeline(
    task="sentiment-analysis",
    model="kwang123/bert-sentiment-analysis",
)
bert_output = bert_pipeline(input)

# Report the top label of each model, BERT first
bert_label = bert_output[0]['label']
distil_label = distil_output[0]['label']
print(f"Bert Output: {bert_label}")
print(f"Distil Output: {distil_label}")

# <script.py> output: Bert Output: Extremely Positive     Distil Output: POSITIVE

## NLP and tokenization

### Normalizing text

In [None]:
from transformers import AutoTokenizer

# Fetch the tokenizer for the uncased DistilBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Apply only the tokenizer's normalization step to the raw string
normalizer = tokenizer.backend_tokenizer.normalizer
output = normalizer.normalize_str("HOWDY, how aré yoü?")

print(output)

# <script.py> output: howdy, how are you?

### Comparing tokenizer output

In [None]:
# Neither tokenizer class is imported by any earlier cell
from transformers import DistilBertTokenizer, GPT2Tokenizer

# Download the gpt tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Tokenize the input
# NOTE(review): `input` is not assigned in the visible cells; the recorded
# output suggests it holds a short sentence about pineapple on pizza — confirm.
gpt_tokens = gpt_tokenizer.tokenize(text=input)

# Repeat for distilbert
distil_tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
distil_tokens = distil_tokenizer.tokenize(text=input)

# Compare the output
print(f"GPT tokenizer: {gpt_tokens}")
print(f"DistilBERT tokenizer: {distil_tokens}")

# <script.py> output:
# GPT tokenizer: ['P', 'ine', 'apple', 'Ġon', 'Ġpizza', 'Ġis', 'Ġpretty', 'Ġgood', ',', 'ĠI', 'Ġguess', '.']
# DistilBERT tokenizer: ['pine', '##apple', 'on', 'pizza', 'is', 'pretty', 'good', ',', 'i', 'guess', '.']

## Text classification

### Grammatical correctness

In [None]:
# Text-classification pipeline using a grammar-checking checkpoint
classifier = pipeline(task="text-classification", model="abdulmatinomotoso/English_Grammar_Checker")

# Classify a deliberately ungrammatical sentence
output = classifier("I will walk dog")

print(output)

# <script.py> output: [{'label': 'LABEL_0', 'score': 0.9956323504447937}]

### Question Natural Language Inference

In [None]:
# Create the pipeline
classifier = pipeline(task="text-classification", model="cross-encoder/qnli-electra-base")

# Predict the output.
# The question and the candidate sentence are passed as a text / text_pair
# pair so the tokenizer encodes them as two segments, instead of one string
# joined by a comma. The input is wrapped in a list so the result stays a list.
# NOTE(review): the score will differ from any output recorded with the
# single-string input.
output = classifier([
    {
        "text": "Where is the capital of France?",
        "text_pair": "Brittany is known for their kouign-amann.",
    }
])

print(output)

# <script.py> output: [{'label': 'LABEL_0', 'score': 0.005238980986177921}]

### Zero-shot classification

In [None]:
# Zero-shot classifier built on the BART MNLI checkpoint
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Labels the text may be assigned to
candidate_labels = ["politics", "science", "sports"]

# Score the text against every candidate label
# NOTE(review): `text` is not assigned in the visible cells — confirm it is defined upstream.
output = classifier(text, candidate_labels)

# Report the first label/score pair of the result
top_label = output['labels'][0]
top_score = output['scores'][0]
print(f"Top Label: {top_label} with score: {top_score}")

# <script.py> output: Top Label: science with score: 0.9030616879463196

## Summarization

### Summarizing long text

In [None]:
# Summarization pipeline using a small T5 checkpoint
summarizer = pipeline(
    task="summarization",
    model="cnicu/t5-small-booksum",
)

# NOTE(review): `original_text` is not assigned in the visible cells — confirm it is defined upstream.
summary_text = summarizer(original_text)

# Compare len() of the input with len() of the generated summary
original_length = len(original_text)
summary_length = len(summary_text[0]['summary_text'])
print(f"Original text length: {original_length}")
print(f"Summary length: {summary_length}")

# <script.py> output:
# Original text length: 829
# Summary length: 473

### Using min_length and max_length

In [None]:
# Summarizer configured with min_length=1 and max_length=10
short_summarizer = pipeline(
    task="summarization",
    model="cnicu/t5-small-booksum",
    min_length=1,
    max_length=10,
)

# NOTE(review): `original_text` is not assigned in the visible cells — confirm it is defined upstream.
short_summary_text = short_summarizer(original_text)

# Show the generated short summary
short_summary = short_summary_text[0]["summary_text"]
print(short_summary)

# <script.py> output:
# Greece has many islands, with estimates ranging

In [None]:
# Summarizer configured with min_length=1 and max_length=10
short_summarizer = pipeline(
    task="summarization",
    model="cnicu/t5-small-booksum",
    min_length=1,
    max_length=10,
)

# NOTE(review): `original_text` is not assigned in the visible cells — confirm it is defined upstream.
short_summary_text = short_summarizer(original_text)

short_summary = short_summary_text[0]["summary_text"]
print(short_summary)

# Same checkpoint, configured with min_length=50 and max_length=150
long_summarizer = pipeline(
    task="summarization",
    model="cnicu/t5-small-booksum",
    min_length=50,
    max_length=150,
)

long_summary_text = long_summarizer(original_text)

long_summary = long_summary_text[0]["summary_text"]
print(long_summary)

""" Greece has many islands, with estimates ranging from somewhere around 1,200 to 6,000 depending on the minimum size 
to take into account. The number of inhabited islands is variously cited as between 166 and 227. The Greek islands are 
traditionally grouped into the following clusters: the Argo-Saronic Islands in the Saronic Gulf near Athens; the Cyclades, 
a large but dense collection occupying the central part of the Aegean Sea; the North Aegesan islands, an loose group """

### Summarizing several inputs

In [None]:
# Create the list of article texts
# NOTE(review): `wiki` is not assigned in the visible cells — presumably an
# iterable of rows with a "text" field; confirm.
text_to_summarize = [w["text"] for w in wiki]

# Create the pipeline
summarizer = pipeline("summarization", model="cnicu/t5-small-booksum", min_length=20, max_length=50)

# Summarize the first three items; truncation=True is passed so over-long
# inputs are truncated
summaries = summarizer(text_to_summarize[:3], truncation=True)

# Print each summary. Iterating over the results (instead of a hard-coded
# range of 3) avoids an IndexError when fewer than three texts are available.
for number, summary in enumerate(summaries, start=1):
    print(f"Summary {number}: {summary['summary_text']}")

""" Summary 1: The Serapeum of Saqqara was the ancient Egyptian burial place for sacred bulls of the Apis cult at Memphis. 
It was believed that the bulls were incarnations of the god Ptah,
    Summary 2: Sauda is a town in Rogaland county, Norway. The town, which is also the administrative centre of the municipality,
is located in a river valley at the northern end of the town centre. A large part of the
    Summary 3: Luis Miguel Aparecido Alves (born May 25, 1985), known as Gugu, is a Brazilian football player currently playing 
for Iraklis Psachna F.C. External links 1985 births Living people"""

# 3. Building Pipelines for Image and Audio

## Processing and classifying images

### Processing image data

In [None]:
# Neither name is imported by any earlier cell
import numpy as np
from transformers import image_transforms

# NOTE(review): `original_image` and `plt` are not defined in the visible
# cells — presumably an image object and matplotlib.pyplot; confirm.

# Create the numpy array
image_array = np.array(original_image)

# Crop the center of the image to size=(200, 200)
cropped_image = image_transforms.center_crop(image=image_array, size=(200, 200))

# Display the cropped image
imgplot = plt.imshow(cropped_image)
plt.show()

### Creating an image classifier

In [None]:
# Create the pipeline (built once; the original cell repeated creation and
# inference verbatim, doing the model load and prediction twice)
image_classifier = pipeline(
    task="image-classification",
    model="abhishek/autotrain_fashion_mnist_vit_base",
)

# Predict the class of the image
# NOTE(review): `cropped_image` comes from another cell — confirm its type is
# one the image-classification pipeline accepts.
results = image_classifier(cropped_image)

# Print the label of the first result
print(results[0]["label"])

# <script.py> output:  Pullover

## Question answering and multi-modal tasks

### Document question and answering

In [None]:
# Document question-answering pipeline using a Donut DocVQA checkpoint
dqa = pipeline(
    task="document-question-answering",
    model="naver-clova-ix/donut-base-finetuned-docvqa",
)

# The document is referenced by file path; the question is free text
image = "document.png"
question = "Which meeting is this document about?"

# Ask the question about the document
results = dqa(image=image, question=question)

print(results)

# <script.py> output: [{'score': 0.7789, 'answer': 'takeda global risk management forum', 'start': 2, 'end': 7}]

### Visual question and answering

In [None]:
# Create pipeline
vqa = pipeline(task="visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

# Use image and question in vqa
# NOTE(review): this cell does not assign `image` or `question`; it uses
# whatever the kernel currently holds. The recorded answers (hat, beanie, ...)
# do not match the document question set in the previous cell, so both names
# were presumably reassigned elsewhere — confirm.
results = vqa(image=image, question=question)

print(results)

""" <script.py> output:  
[{'score': 0.9795706272125244, 'answer': 'hat'}, {'score': 0.5232054591178894, 'answer': 'beanie'},
{'score': 0.24782036244869232, 'answer': 'cap'}, {'score': 0.1803695112466812, 'answer': 'sweater'}, 
{'score': 0.021539464592933655, 'answer': 'hoodie'}] """ 

## Audio classification

### Resampling audio files

In [None]:
# Audio is not imported by any earlier cell
from datasets import Audio

# NOTE(review): `audio_file` is not assigned in the visible cells — presumably
# a dataset with an "audio" column; confirm.

# Save the old sampling rate (read before the column is re-cast below)
old_sampling_rate = audio_file[1]["audio"]["sampling_rate"]

# Resample the audio files by casting the column to a 16 kHz Audio feature
audio_file = audio_file.cast_column("audio", Audio(sampling_rate=16_000))

# Compare the old and new sampling rates
print("Old sampling rate:", old_sampling_rate)
print("New sampling rate:", audio_file[1]["audio"]["sampling_rate"])

# <script.py> output:

# Old sampling rate: 48000
# New sampling rate: 16000

### Filtering out audio files

In [None]:
# NOTE(review): `librosa` and `dataset` are not defined in the visible cells —
# presumably the librosa package and a dataset with a "path" column; confirm.

# Create a list of durations, one per audio file path
old_durations_list = [librosa.get_duration(path=row) for row in dataset["path"]]

# Create a new column. A stale "duration" column is dropped first so the cell
# can be re-run (or followed by a cell that adds the same column) without
# failing on a duplicate column name.
if "duration" in dataset.column_names:
    dataset = dataset.remove_columns("duration")
dataset = dataset.add_column("duration", old_durations_list)

In [None]:
# NOTE(review): `librosa` and `dataset` are not defined in the visible cells —
# presumably the librosa package and a dataset with a "path" column; confirm.

# Create a list of durations, one per audio file path
old_durations_list = [librosa.get_duration(path=row) for row in dataset["path"]]

# Create a new column. A stale "duration" column is dropped first so the cell
# can be re-run (or run after another cell that added the same column)
# without failing on a duplicate column name.
if "duration" in dataset.column_names:
    dataset = dataset.remove_columns("duration")
dataset = dataset.add_column("duration", old_durations_list)

# Filter the dataset: keep rows whose duration is below 6.0
filtered_dataset = dataset.filter(lambda d: d < 6.0, input_columns=["duration"], keep_in_memory=True)

In [None]:
# np is not imported by any earlier cell
import numpy as np

# NOTE(review): `librosa` and `dataset` are not defined in the visible cells —
# presumably the librosa package and a dataset with a "path" column; confirm.

# Create a list of durations, one per audio file path
old_durations_list = [librosa.get_duration(path=row) for row in dataset["path"]]

# Create a new column. A stale "duration" column is dropped first so the cell
# can be re-run (or run after another cell that added the same column)
# without failing on a duplicate column name.
if "duration" in dataset.column_names:
    dataset = dataset.remove_columns("duration")
dataset = dataset.add_column("duration", old_durations_list)

# Filter the dataset: keep rows whose duration is below 6.0
filtered_dataset = dataset.filter(lambda d: d < 6.0, input_columns=["duration"], keep_in_memory=True)

# Save new durations
new_durations_list = filtered_dataset["duration"]

# Compare the mean duration before and after filtering
print("Old duration:", np.mean(old_durations_list))
print("New duration:", np.mean(new_durations_list))

"""
<script.py> output:
    Old duration: 4.8
    New duration: 3.3333333333333335
"""

### Classifying audio files

In [None]:
# Audio-classification pipeline using the MMS language-identification checkpoint
classifier = pipeline(
    task="audio-classification",
    model="facebook/mms-lid-126",
)

# Take the row at index 1 and pull out its waveform and transcript
# NOTE(review): assumes `dataset` rows carry "audio" and "sentence" fields and
# that the audio is already at the sampling rate the model expects — confirm.
sample = dataset[1]
audio = sample["audio"]["array"]
sentence = sample["sentence"]

# Predict the language of the waveform
prediction = classifier(audio)

predicted_label = prediction[0]['label'].upper()
print(f"Predicted language is '{predicted_label}' for the sentence '{sentence}'")

# <script.py> output:
# Predicted language is 'DEU' for the sentence 'Deswegen ballert es mehr.'

## Automatic speech recognition

### Instantiating an ASR pipeline

In [None]:
# NOTE(review): `example` is not assigned as an audio row in the visible
# cells — presumably a dataset row with an "audio" field; confirm.
example_audio = example["audio"]["array"]

# ASR pipeline using Meta's wav2vec2 checkpoint
meta_asr = pipeline(
    task="automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)
meta_result = meta_asr(example_audio)
meta_pred = meta_result["text"].lower()

# Same task with OpenAI's Whisper (tiny) checkpoint
open_asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
open_result = open_asr(example_audio)
open_pred = open_result["text"].lower()

# Show the lower-cased transcript from each model
print("META:", meta_pred)
print("OPENAI:", open_pred)

"""
<script.py> output:
    META: it is a charity school whose feeds are calculated on a men test
    OPENAI:  it is a charity school whose fees are calculated on a means test.
"""

### Word error rate

In [None]:
# Create the word error rate metric
# NOTE(review): `load` is not imported in the visible cells — presumably
# `from evaluate import load`; confirm.
wer = load("wer")

# Save the true sentence of the example
# Lower-cased to match the lower-cased predictions it is compared against.
true_sentence = example["sentence"].lower()

# Compute the wer for each model prediction
# `meta_pred` and `open_pred` are not assigned in this cell; they must already
# hold the transcripts produced by the ASR cell.
meta_wer = wer.compute(predictions=[meta_pred], references=[true_sentence])
open_wer = wer.compute(predictions=[open_pred], references=[true_sentence])

print(f"The WER for the Meta model is {meta_wer} and for the OpenAI model is {open_wer}")


"""
<script.py> output:
    The WER for the Meta model is 0.23076923076923078 and for the OpenAI model is 0.0
"""

### Iterating over a dataset

In [None]:
# pd is not imported by any earlier cell
import pandas as pd


# Create the data function
def data(n=3):
    """Yield (audio array, lower-cased sentence) for the first ``n`` rows of ``english``.

    NOTE(review): ``english`` is not defined in the visible cells — presumably
    a dataset with "audio" and "sentence" fields; confirm.
    """
    for i in range(n):
        yield english[i]["audio"]["array"], english[i]["sentence"].lower()


# Predict the text for the audio file with both models
output = []
for audio, sentence in data():
    meta_pred = meta_asr(audio)["text"].lower()
    open_pred = open_asr(audio)["text"].lower()
    # Append to output list
    output.append({"sentence": sentence, "metaPred": meta_pred, "openPred": open_pred})

# One row per sample: reference sentence plus both transcripts
output_df = pd.DataFrame(output)

In [None]:
# pd is not imported by any earlier cell
import pandas as pd


# Create the data function
def data(n=3):
    """Yield (audio array, lower-cased sentence) for the first ``n`` rows of ``english``.

    NOTE(review): ``english`` is not defined in the visible cells — presumably
    a dataset with "audio" and "sentence" fields; confirm.
    """
    for i in range(n):
        yield english[i]["audio"]["array"], english[i]["sentence"].lower()


# Predict the text for the audio file with both models
output = []
for audio, sentence in data():
    meta_pred = meta_asr(audio)["text"].lower()
    open_pred = open_asr(audio)["text"].lower()
    # Append to output list
    output.append({"sentence": sentence, "metaPred": meta_pred, "openPred": open_pred})

# One row per sample: reference sentence plus both transcripts
output_df = pd.DataFrame(output)

# Compute the WER for both models over all collected samples
# `wer` is not created in this cell; it must already hold the loaded metric.
metaWER = wer.compute(predictions=output_df["metaPred"], references=output_df["sentence"])
openWER = wer.compute(predictions=output_df["openPred"], references=output_df["sentence"])

# Print the WER
print(f"The WER for the meta model is {metaWER} and for the open model is {openWER}")

# <script.py> output:
# The WER for the meta model is 0.6097560975609756 and for the open model is 0.24390243902439024

# 4. Fine-tuning and Embeddings

## Fine-tuning a model

### Preparing a dataset

In [None]:
# Classes used to load the checkpoint and its tokenizer
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

# Checkpoint to fine-tune
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [None]:
# Classes used to load the checkpoint and its tokenizer
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

# Checkpoint to fine-tune
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Sequence-classification model loaded from the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenizer loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Import modules
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use tokenizer on text.
# map() calls the function one row at a time here, and padding=True pads to
# the longest sequence in the call — with a single sequence that means no
# padding. padding="max_length" pads every row to max_length (512), giving
# the uniform length the max_length/truncation arguments suggest was intended.
# NOTE(review): `dataset` is not assigned in this cell — presumably a text
# dataset with a "text" column; confirm.
dataset = dataset.map(
    lambda row: tokenizer(row["text"], padding="max_length", max_length=512, truncation=True),
    keep_in_memory=True,
)

### Building the trainer

In [None]:
# Neither class is imported by any earlier cell
from transformers import Trainer, TrainingArguments

# Create training arguments; checkpoints and logs go to ./results
training_args = TrainingArguments(output_dir="./results")

# Create the trainer
# NOTE(review): `training_data` and `testing_data` are not assigned in the
# visible cells — presumably tokenized train/test splits; confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    eval_dataset=testing_data,
)

# Start the trainer
trainer.train()

# Success!

### Using the fine-tuned model

In [None]:
# Create the classifier from a local model directory
# NOTE(review): no visible cell writes "./fine_tuned_model" (the trainer's
# output_dir is "./results") — confirm the model is saved there beforehand.
classifier = pipeline(task="sentiment-analysis", model="./fine_tuned_model")

# Classify the text. The input is passed positionally, as in every other
# pipeline call in this notebook, rather than through a `text=` keyword.
# NOTE(review): `text_example` is not assigned in the visible cells — confirm.
results = classifier(text_example)

print(results)

# [{'label': 'POSITIVE', 'score': 0.9999}]

## Text generation

### Generating text from a text prompt

In [None]:
# AutoModelForCausalLM is not imported by any earlier cell
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set model name
model_name = "gpt2"

# Get the tokenizer and model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# AutoModelForCausalLM is not imported by any earlier cell
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set model name
model_name = "gpt2"

# Get the tokenizer and model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Wear sunglasses when its sunny because"

# Tokenize the input into token ids, returned as a PyTorch tensor
input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [None]:
# AutoModelForCausalLM is not imported by any earlier cell
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set model name
model_name = "gpt2"

# Get the tokenizer and model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Wear sunglasses when its sunny because"

# Tokenize the input into token ids, returned as a PyTorch tensor
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate the text output (a single continuation of the prompt)
output = model.generate(input_ids, num_return_sequences=1)

# Decode the first (and only) generated sequence back to text
generated_text = tokenizer.decode(output[0])

print(generated_text)

"""
    Wear sunglasses when its sunny because it's a hot day.
    
    The best way to get
"""

### Generating a caption for an image

In [None]:
# Neither class is imported by any earlier cell
from transformers import AutoModelForCausalLM, AutoProcessor

# Get the processor and model from the same checkpoint
processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

# Process the image into pixel values (PyTorch tensor)
# NOTE(review): `image` is not assigned in this cell — presumably an image
# object loaded elsewhere; confirm it is not the "document.png" path string
# set in the document question-answering cell.
pixels = processor(images=image, return_tensors="pt").pixel_values

# Generate the ids
output = model.generate(pixel_values=pixels)

# Decode the output ids back to text (one string per generated sequence)
caption = processor.batch_decode(output)

print(caption[0])

# [CLS] a woman wearing a black sweater and grey sweatpants. [SEP]

## Embeddings

### Generate embeddings for a sentence

In [None]:
# Create the first embedding model
# NOTE(review): `SentenceTransformer` is not imported in the visible cells —
# presumably `from sentence_transformers import SentenceTransformer`; confirm.
embedder1 = SentenceTransformer("all-MiniLM-L6-v2")

# Embed the sentence
# NOTE(review): `sentence` is not assigned in this cell; it holds whatever an
# earlier cell left in the kernel — confirm the intended text.
# The sentence is wrapped in a list, so encode() receives a one-item batch.
embedding1 = embedder1.encode([sentence])

# Create and use second embedding model
embedder2 = SentenceTransformer("sentence-transformers/paraphrase-albert-small-v2")
embedding2 = embedder2.encode([sentence])
 
# Compare the shapes
# Prints True only when both models return arrays of identical shape.
print(embedding1.shape == embedding2.shape)

# Print embedding1
# NOTE(review): this dumps the entire embedding array; consider printing the
# shape or a short slice instead.
print(embedding1)

"""
 False
    [[-9.97783169e-02  2.52459422e-02 -3.83034647e-02 -3.18860300e-02
      -2.63425820e-02  2.63856594e-02  1.59228127e-02 -4.64465283e-03
       2.83517758e-03 -3.23289521e-02  6.51238114e-02  1.30132377e-01
       9.38770995e-02 -1.41172754e-02  5.23919286e-03  9.25582573e-02
      -1.23580517e-02 -4.03374620e-02 -7.99983889e-02 -6.15836121e-02
       3.31274867e-02 -5.38319498e-02 -3.77525352e-02  2.82219537e-02
       3.79624367e-02  3.03728953e-02  3.27130929e-02  4.29569818e-02
       5.17940819e-02  5.30882627e-02 -6.03116266e-02  2.60443687e-02
       3.02512143e-02  7.42835924e-02 -6.47228435e-02  5.04810400e-02
      -1.77761260e-02  7.10309371e-02 -1.43843796e-03 -1.53305486e-03
      -1.32317245e-01 -2.39030961e-02  2.89693419e-02  4.83148843e-02
       2.22859662e-02  4.36660927e-03 -3.29253897e-02 -3.66645530e-02
      -9.52791190e-04 -3.06845773e-02 -5.96390963e-02 -3.28561775e-02
      -3.68510298e-02  1.21275438e-02  6.58628717e-02  5.20405285e-02
       5.09343781e-02  4.50430065e-03  1.59408674e-02  2.60929926e-03
       3.09519451e-02  2.70236451e-02 -4.74882945e-02  4.79927212e-02
       4.69670035e-02  1.36781754e-02 -5.51799648e-02  5.68730496e-02
      -6.44658580e-02  5.22145629e-02 -5.31799383e-02  1.09241001e-01
      -3.63433966e-03  8.24928954e-02 -2.86205094e-02  7.33143762e-02
      -1.69356037e-02  5.79553805e-02 -5.36082871e-02 -6.04650266e-02
      -7.38946572e-02 -1.75970774e-02  1.68791804e-02  5.80657125e-02
       3.73408720e-02 -3.50533053e-02  7.05841109e-02  1.13283354e-03
       6.18120730e-02  1.58435311e-02  1.09596578e-02 -4.22663316e-02
       4.71505821e-02  2.85447277e-02 -2.13031620e-02  4.17851917e-02
       8.52201320e-03 -7.16882721e-02 -8.60826462e-04  1.62748601e-02
       2.61676311e-02  6.76399767e-02  1.81852039e-02  9.94604081e-03
       3.35503519e-02 -3.20622697e-02 -4.24551442e-02  2.48599797e-02
       3.91133275e-04 -6.94354475e-02 -1.30046555e-03  5.69762699e-02
      -1.68611649e-02 -4.50349152e-02 -1.57800727e-02 -3.37041132e-02
       1.21849012e-02  1.07976552e-02  8.05590078e-02 -5.08098640e-02
       9.08086747e-02  5.18481582e-02 -3.30031812e-02  6.11459576e-02
       3.17238481e-03 -8.13124925e-02 -6.41077235e-02 -5.65072220e-33
       5.49403429e-02 -4.77581918e-02 -5.90103120e-02  6.94325045e-02
      -4.50356901e-02  3.03159505e-02 -1.01291820e-01  3.36031988e-02
      -5.72547987e-02 -4.23307642e-02  3.79419141e-02 -1.04357138e-01
       5.42505607e-02  4.75572832e-02 -5.57749756e-02 -2.26175506e-02
       2.16718987e-02 -4.60104868e-02 -4.85511571e-02 -3.09738424e-02
       2.51442622e-02 -4.17656377e-02 -5.00752293e-02  7.94275105e-02
       2.81374343e-02  6.27281144e-02  2.36685146e-02 -5.19433506e-02
      -9.22212098e-03  2.62258593e-02 -3.77551988e-02 -3.53067890e-02
      -7.92817920e-02 -3.10328919e-02  2.87927911e-02  8.80100429e-02
       3.60820740e-02 -4.85621840e-02  5.01015335e-02 -1.73070673e-02
      -2.21038684e-02  4.00975794e-02 -3.99311185e-02 -7.84583949e-03
       3.78126353e-02 -2.85196006e-02 -1.76647175e-02  1.87056698e-02
       8.93504079e-03  1.75491534e-02 -1.15821296e-02  4.05076779e-02
       8.65147561e-02 -3.49141611e-03  8.90456419e-03 -7.44589046e-02
      -6.05441048e-04 -8.67986903e-02 -2.64925789e-02  3.04380972e-02
      -4.57907282e-02  6.03588810e-03  3.32466629e-03 -3.11117992e-02
       7.98699632e-03  8.80055130e-03 -1.57751311e-02  5.53784519e-02
       1.39530167e-01 -8.32842961e-02 -5.11823185e-02  6.75496981e-02
      -2.05617603e-02  3.17197293e-02 -5.99950403e-02 -7.21503934e-03
      -7.75092095e-02  1.94495786e-02 -1.57112237e-02 -1.00682855e-01
       5.32578863e-02  3.68943587e-02 -1.50739886e-02  3.64669710e-02
      -1.66182453e-03 -1.37150940e-02  6.17486611e-02 -3.34757157e-02
      -1.42693585e-02 -2.81503820e-03 -2.68557435e-03 -1.34696867e-02
       8.08540210e-02 -1.67186167e-02 -4.53580171e-02  3.81941771e-33
       3.51107158e-02 -7.32830586e-03  1.55316852e-03 -7.34632388e-02
      -8.12715441e-02  3.36477719e-02  4.78136800e-02 -6.79817945e-02
       4.49076332e-02  3.69781815e-02 -1.04608918e-02 -2.66188495e-02
      -7.78917745e-02 -5.20985723e-02  4.12008911e-02  6.24233633e-02
      -1.02880567e-01 -8.10468849e-03 -6.27486128e-03 -2.78272890e-02
      -7.65276551e-02 -4.75244112e-02  5.51298968e-02  1.02863833e-02
       2.45338436e-02  2.49001980e-02  1.16799213e-01  1.02445275e-01
      -4.78395782e-02 -6.11801185e-02  4.37457450e-02 -3.72903571e-02
      -4.15750444e-02 -9.71988365e-02 -1.97156565e-03  3.29912454e-02
      -2.13280264e-02 -1.02769576e-01  3.61079946e-02 -2.64145657e-02
       9.04297605e-02  3.54486369e-02 -2.91568544e-02  1.03625089e-01
      -9.41040218e-02 -5.67003898e-02  3.77397910e-02 -4.47377227e-02
       7.18963612e-03 -3.55827287e-02 -1.89791229e-02 -9.02059227e-02
       1.89802162e-02 -5.60935773e-02  2.47590560e-02  1.01045854e-02
       8.14194828e-02  6.65857196e-02  4.16135006e-02 -3.90387699e-02
      -3.44039383e-03  3.06885336e-02  6.70636967e-02 -8.55752304e-02
      -3.35187986e-02 -5.26400506e-02 -2.06645206e-02  3.60688232e-02
       2.78380476e-02 -6.59141466e-02  3.60011384e-02  1.97536927e-02
      -1.11820258e-01 -2.30906461e-03  1.15684690e-02  1.65773823e-03
       1.37196228e-01  3.28815193e-03 -7.74928108e-02  2.85647027e-02
      -6.02809153e-02  2.37368862e-03 -1.43561754e-02  7.00630173e-02
      -5.43874614e-02 -8.03274885e-02  2.93609612e-02  1.94245726e-02
      -2.18332093e-02 -1.22052524e-02 -7.11195022e-02  6.99475408e-02
       1.22621581e-01 -5.81212305e-02  2.11172048e-02 -1.87510594e-08
      -3.00814621e-02 -8.34934879e-03  5.99007457e-02  4.17810455e-02
       9.00554005e-03  5.20310998e-02  8.32400378e-03  2.90580355e-02
      -1.17555670e-01 -2.68037505e-02  1.09415941e-01  7.31720030e-02
      -1.81163605e-02 -9.15972400e-04  1.19827932e-03 -4.20594066e-02
       6.66432232e-02 -9.15124714e-02 -3.80621478e-02  1.68342795e-02
      -5.94501756e-03  6.94455504e-02 -2.29016561e-02  5.60755245e-02
       4.38208692e-02  2.39182962e-03  8.04815143e-02  6.93722144e-02
      -5.77484295e-02 -1.89536773e-02  2.46403869e-02  9.27565023e-02
      -2.64064949e-02 -4.52436656e-02  8.95928312e-03 -3.07379849e-02
       1.30720526e-01  2.94232219e-02  5.37222512e-02 -2.19165273e-02
      -2.31754389e-02 -4.73286919e-02 -2.98651028e-02  4.21220288e-02
       5.72138652e-02 -3.41255691e-05  4.10156511e-02 -6.76238164e-02
      -4.65301275e-02 -1.32226139e-01 -1.01639435e-01  4.56268806e-03
      -9.53335315e-03  2.67508160e-02 -2.86023458e-03 -9.76563618e-03
       5.95268910e-04 -3.51175256e-02  4.04534815e-03  6.09354451e-02
       5.79103269e-03  1.06247805e-01  4.81397398e-02 -1.53533667e-02]]
"""

## Semantic search

### Using semantic search

In [None]:
query = "I need a desktop book reader for Mac"

# Embed the query; encode() gets a one-item list, so take the first vector
# NOTE(review): `embedder`, `util`, `sentence_embeddings` and `sentences` are
# not defined in the visible cells — confirm they are set up upstream.
query_embedding = embedder.encode([query])[0]

# Find the two corpus entries closest to the query
hits = util.semantic_search(query_embedding, sentence_embeddings, top_k=2)

# Show each matched sentence with its score to four decimals
for hit in hits[0]:
    matched_sentence = sentences[hit["corpus_id"]]
    print(matched_sentence, f"(Score: {hit['score']:.4f})")
    
"""
Can anyone suggest a desktop book reader for Mac that works similar to Stanza on the iPhone? (Score: 0.8011)
I'm looking for a good quality headset that doesn't cost too much. Any recommendations? (Score: 0.1437)
"""