In [0]:
import json
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from transformers import RobertaTokenizer

# Azure Blob Storage settings
# NOTE(review): account name, container and key are hard-coded here — consider
# loading them from the environment or a secret store instead.
storage_account_name = "*************"
container_name = "*************"
storage_account_key = "*************"

# Build the service client from the account's blob endpoint and its access key
account_url = f"https://{storage_account_name}.blob.core.windows.net"
blob_service_client = BlobServiceClient(
    account_url=account_url,
    credential=storage_account_key,
)

# Fetch the articles file from Blob Storage and parse it as JSON
try:
    blob_client = blob_service_client.get_blob_client(
        container=container_name,
        blob="articles.json",
    )
    download_stream = blob_client.download_blob()
    articles_data = json.loads(download_stream.readall())
except Exception as e:
    # Report the failure, then re-raise so the script stops at this step
    print(f"Error al descargar el archivo de artículos: {e}")
    raise

# Initialize the RoBERTa Large CA tokenizer from its pretrained checkpoint
pretrained_name = 'projecte-aina/roberta-large-ca-v2'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_name)

# Split a text into fragments that each fit the model's input window
def split_text(text, max_length=512):
    """Split *text* into consecutive fragments of at most ``max_length`` tokens.

    The text is tokenized with the module-level ``tokenizer``, the resulting
    token sequence is cut into consecutive windows, and every window is
    decoded back into a plain string so it can be encoded again later.

    Args:
        text: Raw text to split.
        max_length: Maximum encoded length of one fragment, counting the
            special tokens the tokenizer adds when a single sequence is
            encoded.

    Returns:
        The text fragments in their original order. The list is empty when
        ``text`` produces no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for any content token.
    """
    # Reserve the slots taken by the special tokens added at encoding time;
    # otherwise a full window would be truncated when it is encoded with the
    # same max_length and its last tokens would be silently lost.
    window = max_length - tokenizer.num_special_tokens_to_add(pair=False)
    if window <= 0:
        raise ValueError(
            f"max_length={max_length} leaves no room for content tokens"
        )

    tokens = tokenizer.tokenize(text)
    # Decode each window with the tokenizer rather than ' '.join(...): the
    # sub-word tokens are internal pieces, so gluing them with spaces does not
    # reproduce the original text and re-tokenizing that string would yield
    # different tokens.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + window])
        for start in range(0, len(tokens), window)
    ]

# Tokenize the news articles
def tokenize_articles(articles):
    """Encode every article fragment and collect one record per fragment.

    Each article's ``'text'`` is split with ``split_text`` and every fragment
    is encoded with the module-level ``tokenizer`` (padded and truncated to
    512 tokens, returned as PyTorch tensors).

    Args:
        articles: Iterable of mappings that provide the keys ``'text'``,
            ``'contentId'`` and ``'topicName'``.

    Returns:
        A list of dicts with the keys ``'contentId'``, ``'content_token_id'``
        (``"<contentId>_<fragment index>"``), ``'input_ids'``,
        ``'attention_mask'`` and ``'topicName'``. The two encoded fields are
        the tensors converted with ``.tolist()`` — presumably nested lists
        with a leading batch dimension of 1; confirm against the consumer.
        Articles whose text yields no fragments contribute no records.
    """
    # Encoding options shared by every fragment (512 = RoBERTa's token limit)
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }

    records = []
    for article in articles:
        for idx, piece in enumerate(split_text(article['text'])):
            encoded = tokenizer(piece, **encode_options)
            record = {
                # Keep the original id so fragments can be grouped by article
                'contentId': article['contentId'],
                # Id that is distinct for each fragment of the same article
                'content_token_id': f"{article['contentId']}_{idx}",
                'input_ids': encoded['input_ids'].tolist(),
                'attention_mask': encoded['attention_mask'].tolist(),
                'topicName': article['topicName'],
            }
            records.append(record)
    return records

try:
    tokenized_articles = tokenize_articles(articles_data)
    # Serialize the records to a JSON string (non-ASCII kept as-is, 4-space indent)
    tokenized_articles_json = json.dumps(
        tokenized_articles,
        ensure_ascii=False,
        indent=4,
    )
except Exception as e:
    # Report the failure, then re-raise so the script stops at this step
    print(f"Error durante la tokenización: {e}")
    raise

# Upload the serialized tokens to their own blob, replacing any existing one
try:
    blob_client = blob_service_client.get_blob_client(
        container=container_name,
        blob="articles_tokens_roberta_CA.json",
    )
    blob_client.upload_blob(
        tokenized_articles_json,
        blob_type="BlockBlob",
        overwrite=True,
    )
    print(f"Tokens de noticias generados y guardados en Azure Blob Storage. Total de fragmentos tokenizados: {len(tokenized_articles)}")
except Exception as e:
    # Report the failure, then re-raise so the script stops at this step
    print(f"Error al subir los tokens al Blob Storage: {e}")
    raise
