In [0]:
import json
import torch
from transformers import RobertaModel
from azure.storage.blob import BlobServiceClient

# Embeddings for the token fragments

# Blob names used by this cell: tokenized input and per-fragment embeddings output
container_name = "*************"
input_blob_name = "articles_tokens_roberta.json"
embeddings_blob_name = "articles_fragments_embeddings_roberta.json"

# Connect to Blob Storage and build one client per blob
# NOTE(review): the connection string is inlined (redacted here); consider
# loading it from the environment or a secrets store instead.
blob_service_client = BlobServiceClient.from_connection_string("*************")
input_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=input_blob_name,
)
embeddings_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=embeddings_blob_name,
)

# Download and parse the tokenized-articles JSON; report and re-raise on failure
try:
    downloaded_blob = input_blob_client.download_blob().readall()
    tokenized_articles = json.loads(downloaded_blob)
    print(f"Downloaded {len(tokenized_articles)} tokenized articles.")
except Exception as e:
    print(f"Error al descargar el archivo JSON: {e}")
    raise

# Load the RoBERTa encoder onto the GPU when one is available, otherwise the CPU
model_name = "roberta-large"
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = RobertaModel.from_pretrained(model_name)
model = model.to(device)

# Función para generar embeddings
def generate_embeddings(tokens):
    """Embed one tokenized fragment as the mean of its non-padding token vectors.

    Uses the module-level ``model`` and ``device``.

    Args:
        tokens: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries
            that ``torch.tensor`` can convert to integer tensors. Either a
            flat ``(seq_len,)`` list or a batched ``(batch, seq_len)`` nested
            list is accepted; a flat list is treated as a batch of one.

    Returns:
        The pooled embedding as a Python list. With a batch of one this is a
        flat list of hidden-size floats; with a larger batch ``squeeze(0)`` is
        a no-op and a nested list (one vector per row) is returned.
    """
    input_ids = torch.tensor(tokens['input_ids'], dtype=torch.long).to(device)
    attention_mask = torch.tensor(tokens['attention_mask'], dtype=torch.long).to(device)

    # The model call below is given (batch, seq_len) inputs; add the batch
    # axis when the fragment was stored as a flat list.
    if input_ids.dim() == 1:
        input_ids = input_ids.unsqueeze(0)
    if attention_mask.dim() == 1:
        attention_mask = attention_mask.unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    last_hidden_state = outputs.last_hidden_state

    # Masked mean: padding positions (mask == 0) must not contribute, otherwise
    # short fragments are dominated by the padding token's hidden state.
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    token_sum = (last_hidden_state * mask).sum(dim=1)
    # Guard against an all-zero mask so the division never produces NaN.
    token_count = mask.sum(dim=1).clamp(min=1.0)
    return (token_sum / token_count).squeeze(0).tolist()

# Embed every tokenized fragment and collect one record per fragment
embeddings_data = []

for idx, article in enumerate(tokenized_articles):
    # Fallback label for the error message: without it, a failure while reading
    # 'contentId' would report the previous article's id (or raise NameError on
    # the very first iteration).
    content_id = f"<index {idx}>"
    try:
        content_id = article['contentId']
        content_token_id = article['content_token_id']
        tokens = {'input_ids': article['input_ids'], 'attention_mask': article['attention_mask']}
        embeddings = generate_embeddings(tokens)
        embeddings_data.append({
            'contentId': content_id,
            'content_token_id': content_token_id,
            'embeddings': embeddings
        })

        # Progress message on the 1st, 101st, 201st, ... fragment
        if idx % 100 == 0:
            print(f"Processed {idx + 1} articles")

        # Release cached GPU memory after each fragment
        torch.cuda.empty_cache()

    except Exception as e:
        # Best effort: report the failing fragment and continue with the rest
        print(f"Error al procesar el artículo {content_id}: {e}")

# Serialize the collected embeddings and upload them, replacing any existing blob
try:
    embeddings_json = json.dumps(embeddings_data)
    embeddings_blob_client.upload_blob(embeddings_json, overwrite=True)
    print(f"Uploaded embeddings for {len(embeddings_data)} fragments to {embeddings_blob_name}.")
except Exception as e:
    print(f"Error al subir los embeddings al Blob Storage: {e}")
    raise


In [0]:
import json
import torch
from azure.storage.blob import BlobServiceClient

# Técnica MEAN Pooling

# Blob names: per-fragment embeddings in, mean-pooled article embeddings out
container_name = "***************"
embeddings_blob_name = "articles_fragments_embeddings_roberta.json"
output_blob_name = "articles_embeddings_meanpooled_roberta.json"  # Output file name for this technique

# Connect to Blob Storage and build one client per blob
# NOTE(review): the connection string is inlined (redacted here); consider
# loading it from the environment or a secrets store instead.
blob_service_client = BlobServiceClient.from_connection_string("***************")
embeddings_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=embeddings_blob_name,
)
output_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=output_blob_name,
)

# Download and parse the fragment-embeddings JSON; report and re-raise on failure
try:
    downloaded_blob = embeddings_blob_client.download_blob().readall()
    embeddings_data = json.loads(downloaded_blob)
    print(f"Downloaded {len(embeddings_data)} embeddings fragments.")
except Exception as e:
    print(f"Error al descargar el archivo JSON: {e}")
    raise

# Collect the fragment embeddings of each article under its contentId
article_embeddings = {}

for fragment in embeddings_data:
    article_id, vector = fragment['contentId'], fragment['embeddings']
    article_embeddings.setdefault(article_id, []).append(vector)

# One record per article: element-wise mean over that article's fragment vectors
# (dim 0 indexes the fragments once the list is stacked into a tensor)
meanpooled_embeddings_data = [
    {
        'contentId': article_id,
        'embeddings': torch.tensor(vectors).mean(dim=0).tolist(),
    }
    for article_id, vectors in article_embeddings.items()
]

# Serialize the mean-pooled embeddings and upload them, replacing any existing blob
try:
    meanpooled_embeddings_json = json.dumps(meanpooled_embeddings_data)
    output_blob_client.upload_blob(
        meanpooled_embeddings_json,
        blob_type="BlockBlob",
        overwrite=True,
    )
    print(f"Uploaded meanpooled embeddings for {len(meanpooled_embeddings_data)} articles to {output_blob_name}.")
except Exception as e:
    print(f"Error al subir los embeddings meanpooled al Blob Storage: {e}")
    raise


In [0]:
import json
import torch
from azure.storage.blob import BlobServiceClient

# Técnica MAX Pooling

# Blob names: per-fragment embeddings in, max-pooled article embeddings out
container_name = "*****************"
embeddings_blob_name = "articles_fragments_embeddings_roberta.json"
output_blob_name = "articles_embeddings_maxpooled_roberta.json"

# Connect to Blob Storage and build one client per blob
# NOTE(review): the connection string is inlined (redacted here); consider
# loading it from the environment or a secrets store instead.
blob_service_client = BlobServiceClient.from_connection_string("*****************")
embeddings_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=embeddings_blob_name,
)
output_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=output_blob_name,
)

# Download and parse the fragment-embeddings JSON; report and re-raise on failure
try:
    downloaded_blob = embeddings_blob_client.download_blob().readall()
    embeddings_data = json.loads(downloaded_blob)
    print(f"Downloaded {len(embeddings_data)} embeddings fragments.")
except Exception as e:
    print(f"Error al descargar el archivo JSON: {e}")
    raise

# Collect the fragment embeddings of each article under its contentId
article_embeddings = {}

for fragment in embeddings_data:
    article_id, vector = fragment['contentId'], fragment['embeddings']
    article_embeddings.setdefault(article_id, []).append(vector)

# One record per article: element-wise maximum over that article's fragment
# vectors (dim 0 indexes the fragments once the list is stacked into a tensor)
maxpooled_embeddings_data = [
    {
        'contentId': article_id,
        'embeddings': torch.tensor(vectors).max(dim=0).values.tolist(),
    }
    for article_id, vectors in article_embeddings.items()
]

# Serialize the max-pooled embeddings and upload them, replacing any existing blob
try:
    maxpooled_embeddings_json = json.dumps(maxpooled_embeddings_data)
    output_blob_client.upload_blob(
        maxpooled_embeddings_json,
        blob_type="BlockBlob",
        overwrite=True,
    )
    print(f"Uploaded maxpooled embeddings for {len(maxpooled_embeddings_data)} articles to {output_blob_name}.")
except Exception as e:
    print(f"Error al subir los embeddings maxpooled al Blob Storage: {e}")
    raise


In [0]:
import json
import torch
from azure.storage.blob import BlobServiceClient

# Técnica Concatenación

# Blob names: per-fragment embeddings in, concatenated article embeddings out
container_name = "*************"
embeddings_blob_name = "articles_fragments_embeddings_roberta.json"
output_blob_name = "articles_embeddings_concatenated_roberta.json"

# Connect to Blob Storage and build one client per blob
# NOTE(review): the connection string is inlined (redacted here); consider
# loading it from the environment or a secrets store instead.
blob_service_client = BlobServiceClient.from_connection_string("*************")
embeddings_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=embeddings_blob_name,
)
output_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=output_blob_name,
)

# Download and parse the fragment-embeddings JSON; report and re-raise on failure
try:
    downloaded_blob = embeddings_blob_client.download_blob().readall()
    embeddings_data = json.loads(downloaded_blob)
    print(f"Downloaded {len(embeddings_data)} embeddings fragments.")
except Exception as e:
    print(f"Error al descargar el archivo JSON: {e}")
    raise

def fragment_sort_key(content_token_id):
    """Return a sort key that orders fragment ids by their numeric position.

    String ids ending in ``_<digits>`` (or consisting only of digits) are
    ordered by the integer value of that suffix, so ``"a_2"`` sorts before
    ``"a_10"``; a plain string comparison would put ``"a_10"`` first. Other
    strings keep lexicographic order and non-string ids keep their natural
    order.

    Args:
        content_token_id: Fragment identifier as stored in the embeddings blob.

    Returns:
        A 2-tuple usable as a ``sort`` key.
    """
    if isinstance(content_token_id, str):
        prefix, _, suffix = content_token_id.rpartition('_')
        if suffix.isdecimal():
            return (prefix, int(suffix))
        return (content_token_id, -1)
    return ("", content_token_id)


# Group fragment embeddings by article, keeping each fragment's id for ordering
article_embeddings = {}

for item in embeddings_data:
    content_id = item['contentId']
    content_token_id = item['content_token_id']
    embeddings = item['embeddings']
    article_embeddings.setdefault(content_id, []).append((content_token_id, embeddings))

concatenated_embeddings_data = []

for content_id, embeddings_list in article_embeddings.items():
    # Put the fragments in reading order. The key compares the numeric fragment
    # index rather than the raw id, so fragment 10 does not land before fragment 2.
    embeddings_list.sort(key=lambda pair: fragment_sort_key(pair[0]))
    embeddings = [embedding for _, embedding in embeddings_list]

    # Stack the ordered fragment vectors into a single tensor
    embeddings_tensor = torch.tensor(embeddings)

    # Warn (but continue) when the stacked tensor is not 2-D, i.e. not one flat
    # vector per fragment
    if embeddings_tensor.ndimension() != 2:
        print(f"Warning: Embeddings for contentId {content_id} have unexpected dimensions: {embeddings_tensor.shape}")

    # Flatten fragment after fragment into one long vector; its length therefore
    # grows with the number of fragments the article has
    concatenated_embeddings = embeddings_tensor.view(-1).tolist()
    concatenated_embeddings_data.append({
        'contentId': content_id,
        'embeddings': concatenated_embeddings
    })

# Serialize the concatenated embeddings and upload them, replacing any existing blob
try:
    concatenated_embeddings_json = json.dumps(concatenated_embeddings_data)
    output_blob_client.upload_blob(concatenated_embeddings_json, blob_type="BlockBlob", overwrite=True)
    print(f"Uploaded concatenated embeddings for {len(concatenated_embeddings_data)} articles to {output_blob_name}.")
except Exception as e:
    print(f"Error al subir los embeddings concatenated al Blob Storage: {e}")
    raise


In [0]:
import json
from azure.storage.blob import BlobServiceClient

# Técnica First [CLS]

# Blob names: per-fragment embeddings in, first-fragment article embeddings out
container_name = "****************"
embeddings_blob_name = "articles_fragments_embeddings_roberta.json"
output_blob_name = "articles_first_cls_embeddings_roberta.json"

# Connect to Blob Storage and build one client per blob
# NOTE(review): the connection string is inlined (redacted here); consider
# loading it from the environment or a secrets store instead.
blob_service_client = BlobServiceClient.from_connection_string("****************")
embeddings_blob_client = blob_service_client.get_blob_client(container_name, embeddings_blob_name)
output_blob_client = blob_service_client.get_blob_client(container_name, output_blob_name)

# Download and parse the fragment-embeddings JSON; report and re-raise on failure
try:
    downloaded_blob = embeddings_blob_client.download_blob().readall()
    embeddings_data = json.loads(downloaded_blob)
    print(f"Downloaded {len(embeddings_data)} embeddings fragments.")
except Exception as e:
    print(f"Error al descargar el archivo JSON: {e}")
    raise

# Collect (fragment id, embedding) pairs of each article under its contentId
article_embeddings = {}

for fragment in embeddings_data:
    article_id = fragment['contentId']
    fragment_id = fragment['content_token_id']
    vector = fragment['embeddings']
    article_embeddings.setdefault(article_id, []).append((fragment_id, vector))

# Keep only the embedding of each article's first fragment.
# NOTE(review): this takes whatever vector is stored for the first fragment;
# if the fragments blob holds token-mean-pooled vectors, this is not literally
# the [CLS] hidden state — confirm the naming.
# NOTE(review): the output key is 'embedding' (singular) while the pooled and
# concatenated outputs in this notebook use 'embeddings' — confirm consumers
# expect that.
first_cls_embeddings_data = []

for article_id, fragments in article_embeddings.items():
    # Sort in place by fragment id so the lowest id comes first
    fragments.sort(key=lambda pair: pair[0])
    _, leading_embedding = fragments[0]
    first_cls_embeddings_data.append({
        'contentId': article_id,
        'embedding': leading_embedding,
    })

# Serialize the selected embeddings and upload them, replacing any existing blob
try:
    first_cls_embeddings_json = json.dumps(first_cls_embeddings_data)
    output_blob_client.upload_blob(
        first_cls_embeddings_json,
        blob_type="BlockBlob",
        overwrite=True,
    )
    print(f"Uploaded first CLS embeddings for {len(first_cls_embeddings_data)} articles to {output_blob_name}.")
except Exception as e:
    print(f"Error al subir los embeddings al Blob Storage: {e}")
    raise


In [0]:
import json
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, models
import torch

# Técnica Sentence-BERT

# Azure Blob Storage settings
# NOTE(review): the account key is inlined (redacted here); consider loading it
# from the environment or a secrets store instead.
storage_account_name = "*************"
container_name = "*************"
storage_account_key = "*************"

# Build the service client from the account URL and key
account_url = f"https://{storage_account_name}.blob.core.windows.net"
blob_service_client = BlobServiceClient(
    account_url=account_url,
    credential=storage_account_key,
)

# Download and parse the raw articles JSON; report and re-raise on failure
try:
    blob_client = blob_service_client.get_blob_client(
        container=container_name,
        blob="articles.json",
    )
    articles_data = json.loads(blob_client.download_blob().readall())
except Exception as e:
    print(f"Error al descargar el archivo de artículos: {e}")
    raise

# Assemble a SentenceTransformer from a roberta-large transformer module
# followed by a pooling module sized to its word-embedding dimension
model_name = "roberta-large" 
# NOTE(review): `tokenizer` is loaded here but not referenced again in this cell
tokenizer = AutoTokenizer.from_pretrained(model_name)
word_embedding_model = models.Transformer(model_name)
embedding_dimension = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dimension)
sentence_transformer_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)

# Función para dividir el texto en fragmentos 
def split_text(text, max_length=4096):
    """Split ``text`` into consecutive fragments of at most ``max_length`` words.

    Words are whitespace-separated runs (``str.split()``); each fragment is
    rebuilt by joining its words with single spaces, so original whitespace
    (newlines, repeated spaces) is normalized.

    NOTE(review): ``max_length`` counts whitespace-separated words, not model
    tokens — confirm that the encoder used downstream does not truncate
    fragments of this size.

    Args:
        text: The text to split.
        max_length: Maximum number of words per fragment; must be >= 1.

    Returns:
        List of fragment strings in order; empty when ``text`` has no words.

    Raises:
        ValueError: If ``max_length`` is less than 1. Without this check a
            negative value would silently yield no fragments at all.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]

# Generate embeddings for the news articles
def generate_embeddings(articles):
    """Encode every article fragment and return one record per fragment.

    Each article's ``'text'`` is split with ``split_text`` (default fragment
    size) and every fragment is encoded with the module-level
    ``sentence_transformer_model``. Two summary lines are printed with how many
    articles yielded more than one fragment and how many did not.

    NOTE(review): fragments are sized in whitespace-separated words by
    ``split_text``; confirm the encoder's maximum sequence length does not
    truncate them.

    Args:
        articles: Iterable of mappings with ``'text'``, ``'contentId'`` and
            ``'topicName'`` entries.

    Returns:
        List of dicts with ``'contentId'``, ``'content_token_id'``
        (``"<contentId>_<fragment index>"``), ``'embedding'`` (list) and
        ``'topicName'``.
    """
    records = []
    needs_split_count = 0
    no_split_count = 0

    for article in articles:
        fragments = split_text(article['text'])

        # Tally whether this article produced more than one fragment
        was_split = len(fragments) > 1
        needs_split_count += int(was_split)
        no_split_count += int(not was_split)

        for idx, fragment in enumerate(fragments):
            vector = sentence_transformer_model.encode(fragment, convert_to_tensor=True)
            record = {
                'contentId': article['contentId'],  # Keep the original contentId
                'content_token_id': f"{article['contentId']}_{idx}",  # Identify the fragment
                'embedding': vector.cpu().tolist(),  # Tensor -> plain list for JSON
                'topicName': article['topicName'],
            }
            records.append(record)

    print(f"Textos que necesitan ser divididos: {needs_split_count}")
    print(f"Textos que no necesitan ser divididos: {no_split_count}")

    return records

# Encode all articles and serialize the records; report and re-raise on failure
try:
    embeddings = generate_embeddings(articles_data)
    # Pretty-printed JSON, keeping non-ASCII characters unescaped
    embeddings_json = json.dumps(
        embeddings,
        ensure_ascii=False,
        indent=4,
    )
except Exception as e:
    print(f"Error durante la generación de embeddings: {e}")
    raise

# Upload the serialized embeddings, replacing any existing blob
try:
    blob_client = blob_service_client.get_blob_client(
        container=container_name,
        blob="articles_embeddings_sbert_roberta.json",
    )
    blob_client.upload_blob(
        embeddings_json,
        blob_type="BlockBlob",
        overwrite=True,
    )
    print(f"Embeddings de noticias generados y guardados en Azure Blob Storage. Total de fragmentos procesados: {len(embeddings)}")
except Exception as e:
    print(f"Error al subir los embeddings al Blob Storage: {e}")
    raise
