
# TextToEmbedding Pipeline Demo

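This notebook demonstrates how to use the TextToEmbedding pipeline to convert text data into embeddings using the SONAR model, decode those embeddings back into text with the EmbeddingToText pipeline, and score the reconstructions.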

### Setup
First, let's import the necessary libraries and set up logging.


In [1]:
import logging

import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

from huggingface_pipelines.dataset import DatasetConfig, DatasetOverwrites
from huggingface_pipelines.metric_analyzer import (
    MetricAnalyzerPipeline,
    MetricPipelineConfig,
)
from huggingface_pipelines.text import (
    EmbeddingToTextPipelineConfig,
    HFEmbeddingToTextPipeline,
    HFTextToEmbeddingPipeline,
    TextToEmbeddingPipelineConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset
We'll use the test split of the AG News dataset for this demonstration.



In [2]:

# Describe the data source (the "ag_news" dataset, "test" split) and let the
# config object perform the actual loading.
dataset_config = DatasetConfig(dataset_name="ag_news", dataset_split="test")
dataset = dataset_config.load_dataset()

# Print the loaded dataset's summary as a quick sanity check.
print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 7600
})


## Configure and Run the Pipeline
Now, let's set up the TextToEmbedding pipeline configuration, then build the pipeline and run it on the dataset.


In [3]:
# Encoder-side settings applied on top of the TextToEmbeddingPipelineConfig
# defaults via `with_overwrites`.
text_to_embedding_overwrites = {
    "encoder_model": "text_sonar_basic_encoder",
    "source_lang": "eng_Latn",
    "output_file_name": "ag_news_results",
    # NOTE(review): the recorded run of this cell maps 5 examples, so "take"
    # is presumably not a per-sample count -- confirm its unit against the
    # pipeline configuration documentation.
    "take": 1,
}
config = TextToEmbeddingPipelineConfig(columns=["text"]).with_overwrites(
    text_to_embedding_overwrites
)

# Build the pipeline from the config and run it; the result is rebound to
# `dataset`, replacing the raw dataset loaded earlier.
pipeline = HFTextToEmbeddingPipeline(config)
dataset = pipeline(dataset)

# Show the first five rows of the processed dataset.
print(dataset[:5])




INFO:huggingface_pipelines.text:Initializing text to embedding model...
INFO:huggingface_pipelines.text:Initializing spaCy model for language: eng_Latn
INFO:huggingface_pipelines.text:Models initialized.
INFO:huggingface_pipelines.pipeline:Starting to process dataset...
Map:   0%|          | 0/5 [00:00<?, ? examples/s]INFO:huggingface_pipelines.text:Sample of text_sentences: [["Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."], ['The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket.'], ['Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisv

{'text': ["Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.", 'The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket.', 'Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.', "Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike




## Configure the EmbeddingToText Pipeline
We'll use the dataset with embeddings produced by the previous TextToEmbedding pipeline.


In [8]:
# Decoder-side settings applied on top of the EmbeddingToTextPipelineConfig
# defaults via `with_overwrites`.
embedding_to_text_overwrites = {
    "decoder_model": "text_sonar_basic_decoder",
    "target_lang": "eng_Latn",
    "output_file_name": "ag_news_results",
    "take": 1,
}

# Note: this rebinds the notebook-level name `config`.
config = EmbeddingToTextPipelineConfig(
    columns=["text"], output_dir="results"
).with_overwrites(embedding_to_text_overwrites)

## Initialize and Run Pipeline

In [9]:
# Build the embedding-to-text pipeline from `config` and run it on the dataset.
# Both `pipeline` and `dataset` are rebound here, so after this cell `dataset`
# refers to the pipeline's output rather than its input.
pipeline = HFEmbeddingToTextPipeline(config)
dataset = pipeline(dataset)


INFO:huggingface_pipelines.text:Initializing embedding to text model...
INFO:huggingface_pipelines.text:Model initialized.
INFO:huggingface_pipelines.pipeline:Starting to process dataset...
Map:   0%|          | 0/5 [00:00<?, ? examples/s]INFO:huggingface_pipelines.text:Embeddings: [[[0.00048442441038787365, 0.006896345876157284, 0.0046835229732096195, -0.003770195646211505, 0.0014335550367832184, -0.0037776129320263863, -0.00907781533896923, 0.002556778956204653, 0.0034300058614462614, -0.006269044242799282, 2.9267645004438236e-05, -0.005167630035430193, -0.002798454836010933, -0.008851427584886551, -0.0063611348159611225, -0.009358249604701996, 0.004702939186245203, 0.0059711565263569355, 0.007850674912333488, -0.012632885947823524, 0.014553138986229897, 0.004772466141730547, 0.01385858841240406, 0.006296239327639341, -0.0027604468632489443, 0.00026610083295963705, -0.0047318520955741405, -0.0033269431442022324, 0.004268632270395756, -0.008269382640719414, 0.0012290494050830603, -0.0

## Display Results

In [None]:
# Take the first five originals and their reconstructions, then print each
# pair followed by a blank separator line.
originals = dataset['text'][:5]
reconstructions = dataset['text_reconstructed'][:5]

for original, reconstructed in zip(originals, reconstructions):
    print(f"Original: {original}")
    print(f"Reconstructed: {reconstructed}")
    print()

## Analyze Reconstruction Quality
Let's score the reconstructed texts against the originals using BLEU and look at the distribution of the scores.

In [None]:
# Configure the metric step on top of the MetricPipelineConfig defaults.
metric_analyzer_config = MetricPipelineConfig(
    columns=["text"]
).with_overwrites({
    "metric_name": "bleu",
    "low_score_threshold": 0.5,
    "output_file_name": "ag_news_results",
    "take": 1
})

# Build the metric pipeline and run it; the result is rebound to `dataset`.
pipeline = MetricAnalyzerPipeline(metric_analyzer_config)

dataset = pipeline(dataset)

# Read the threshold from the metric configuration. The bare name `config` is
# bound to the EmbeddingToText configuration at this point in the notebook,
# which is not the object "low_score_threshold" was set on.
low_score_threshold = metric_analyzer_config.low_score_threshold

# Histogram of the scores with the low-score threshold marked as a dashed line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(dataset['bleu_score'], kde=True, ax=ax)
ax.set_title('Distribution of BLEU Scores')
ax.set_xlabel('BLEU Score')
ax.set_ylabel('Frequency')
ax.axvline(x=low_score_threshold, color='r', linestyle='--', label='Low Score Threshold')
ax.legend()
plt.show()


# Keep only the rows whose score falls below the threshold.
low_scoring = dataset.filter(lambda x: x['bleu_score'] < low_score_threshold)
print(f"Number of low-scoring samples: {len(low_scoring)}")

# Slicing a Dataset (`low_scoring[:5]`) returns a dict of columns, so iterating
# over it would yield column names rather than rows. `select` returns a Dataset,
# whose iteration yields one dict per row; `min` guards against fewer than
# five low-scoring rows.
for sample in low_scoring.select(range(min(5, len(low_scoring)))):
    print(f"Original: {sample['text']}")
    print(f"Reconstructed: {sample['text_reconstructed']}")
    print(f"BLEU Score: {sample['bleu_score']}")
    print()
