In [0]:
import json
from sentence_transformers import SentenceTransformer
from azure.storage.blob import BlobServiceClient

# Azure Blob Storage setup: one service client plus two blob clients in the
# same container -- one for the source articles, one for the embeddings output.
# NOTE(review): the connection string is embedded in the source; consider
# loading it from configuration instead.
container_name = "*****"
input_blob_name = "articles.json"
embeddings_blob_name = "articles_embeddings_sbert_cls_ALL_roberta.json"

blob_service_client = BlobServiceClient.from_connection_string(
    "***************************************"
)
input_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=input_blob_name,
)
embeddings_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=embeddings_blob_name,
)

# Download the articles blob and parse it as JSON.
# Any failure (download, parsing, or counting the result) is reported on
# stdout and then re-raised, so the script stops here instead of continuing
# without input data.
try:
    blob_stream = input_blob_client.download_blob()
    downloaded_blob = blob_stream.readall()
    articles_data = json.loads(downloaded_blob)
    print(f"Downloaded {len(articles_data)} articles.")
except Exception as download_error:
    print(f"Error al descargar el archivo JSON: {download_error}")
    raise

# Load the all-roberta-large-v1 sentence-embedding model once, at module
# level, so every call to the embedding helper reuses the same instance.
model_name = "all-roberta-large-v1"
model = SentenceTransformer(model_name_or_path=model_name)

# Helper that converts a text into a JSON-serialisable embedding.
def generate_cls_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return plain lists.

    The input is passed straight to ``model.encode`` with
    ``convert_to_tensor=True``; the resulting tensor is moved to the CPU and
    converted with ``tolist()`` so it can be serialised with ``json``.

    NOTE(review): the function name says "CLS", but nothing here selects a
    [CLS] token -- the pooling is whatever the loaded model is configured
    to use. Confirm against the model's configuration.

    Args:
        text: The text to encode (presumably a single string; the caller
            passes each article's ``'text'`` field -- TODO confirm).

    Returns:
        The embedding produced by ``model.encode``, as nested Python lists
        of numbers.
    """
    encoded = model.encode(text, convert_to_tensor=True)
    on_cpu = encoded.cpu()
    return on_cpu.tolist()

# Build one embedding record per article.
#
# Each successfully processed article appends a dict of the form
# {'contentId': <article id>, 'embeddings': <embedding>} to
# ``embeddings_data``. Processing is best-effort: an article that raises
# (missing 'contentId' / 'text' key, encoding failure, ...) is reported on
# stdout and skipped, so a single bad record does not abort the whole run.
embeddings_data = []

for idx, article in enumerate(articles_data):
    # Seed the label used in the error message with the article's position.
    # Without this, a failure while reading 'contentId' would either raise
    # NameError inside the handler (first article) or report the ID of the
    # previously processed article (any later one).
    content_id = f"<index {idx}>"
    try:
        content_id = article['contentId']
        text = article['text']
        embeddings = generate_cls_embedding(text)
        embeddings_data.append({
            'contentId': content_id,
            'embeddings': embeddings
        })

        # Progress heartbeat on the 1st, 101st, 201st, ... article.
        if idx % 100 == 0:
            print(f"Processed {idx + 1} articles")

    except Exception as e:
        # Deliberately broad: report the failing article and keep going.
        print(f"Error al procesar el artículo {content_id}: {e}")

# Serialise the collected embeddings to JSON and upload them to the output
# blob, replacing any existing blob of the same name. Failures are reported
# on stdout and then re-raised.
try:
    embeddings_json = json.dumps(
        embeddings_data,
        indent=4,
        ensure_ascii=False,
    )
    embeddings_blob_client.upload_blob(data=embeddings_json, overwrite=True)
    print(f"Uploaded embeddings for {len(embeddings_data)} articles to {embeddings_blob_name}.")
except Exception as upload_error:
    print(f"Error al subir los embeddings al Blob Storage: {upload_error}")
    raise
