In [None]:
import torch

# Run configuration for this notebook, gathered in one place so the cells below
# read their settings from `args` instead of hard-coding them.
args = {
    # --- Model loading: read by the pipeline(...) cell ---
    'model_path': 'grazh/llama-3.1-8b-clinical-es-inst-v3',
    'task': 'text-generation',
    'torch_dtype': torch.bfloat16,
    'device_map': 'auto',
    # --- Generation settings: forwarded to pipeline(...) ---
    # NOTE(review): do_sample=True means sampling is enabled and no seed is set
    # in the cells visible here, so repeated runs may not be identical.
    'do_sample': True,
    'temperature': 0.1,
    'max_new_tokens': 700,
    # --- Not read by any cell visible in this notebook ---
    'num_gpus': 1,
    'load_8bit': False,
    'cpu_offloading': False,
    'max_gpu_memory': None,
    'revision': 'main',
    'debug': False,
    # Generation setting, also forwarded to pipeline(...).
    'top_p': 1,
    # --- Input / output locations: read by the summarization loop ---
    # Directory whose *.txt files are the full case reports to summarize.
    'full_texts_dir': '/content/drive/MyDrive/AI/MultiClinSum 2025/Data/test/multiclinsum_test_es',
    # Directory that receives one "<input name>_sum.txt" file per report.
    'output_dir': '/content/drive/MyDrive/AI/MultiClinSum 2025/Results/LlamaGRPO_v3_test_set'
}

class DotDict:
    """Attribute-style view over a dictionary.

    The wrapped dictionary is stored by reference (not copied) in ``_data``.
    Reading, assigning or deleting any other attribute on the instance reads,
    writes or removes the matching key of that dictionary.
    """

    def __init__(self, dictionary):
        """Wrap ``dictionary``; later attribute writes mutate it in place."""
        self._data = dictionary

    def __getattr__(self, key):
        """Return the value stored under ``key`` in the wrapped dictionary.

        Raises:
            AttributeError: if ``key`` is not a key of the wrapped dictionary,
                or if the instance has no ``_data`` yet.
        """
        # __getattr__ only runs after normal attribute lookup has failed, so
        # being asked for '_data' means the instance was created without
        # __init__ (copy.copy, copy.deepcopy, pickle, __new__). Reading
        # self._data here would re-enter this method until RecursionError,
        # which hasattr()/getattr(default) do not swallow. Fail cleanly instead.
        if key == '_data':
            raise AttributeError(f"'DotDict' object has no attribute '{key}'")
        if key in self._data:
            return self._data[key]
        raise AttributeError(f"'DotDict' object has no attribute '{key}'")

    def __setattr__(self, key, value):
        """Store ``value`` under ``key`` in the wrapped dictionary.

        ``_data`` itself is the one real instance attribute and is set normally.
        """
        if key == '_data':
            super().__setattr__(key, value)
        else:
            self._data[key] = value

    def __delattr__(self, key):
        """Remove ``key`` from the wrapped dictionary.

        Raises:
            AttributeError: if ``key`` is not a key of the wrapped dictionary.
        """
        if key in self._data:
            del self._data[key]
        else:
            raise AttributeError(f"'DotDict' object has no attribute '{key}'")

# Rebind `args` to a DotDict wrapper around the dict above so later cells can
# use attribute access (args.model_path, args.output_dir, ...).
args = DotDict(args)

In [None]:
from transformers import pipeline

# Build the generation pipeline once; every setting comes from `args`.
# The four generation options are passed through under the same names they
# have in the configuration.
summarization_pipeline = pipeline(
    task=args.task,
    model=args.model_path,
    model_kwargs=dict(torch_dtype=args.torch_dtype),
    device_map=args.device_map,
    **{
        option: getattr(args, option)
        for option in ("max_new_tokens", "do_sample", "temperature", "top_p")
    },
)

In [None]:
def extract_xml_summary(text: str) -> str:
    """Pull the summary out of a ``<summary>...</summary>`` wrapped response.

    Keeps what follows the last ``<summary>`` tag (the whole text when the tag
    is absent), cuts it at the first ``</summary>`` found there (no cut when
    absent), and returns the result with surrounding whitespace removed.
    """
    _, _, after_open_tag = text.rpartition("<summary>")
    body, _, _ = after_open_tag.partition("</summary>")
    return body.strip()

In [None]:
# System message sent with every summarization request (see get_summary).
# It is written in Spanish and tells the model to put the final summary between
# <summary> and </summary>, the tags extract_xml_summary looks for.
# The literal begins and ends with a newline, which is part of the prompt as sent.
SYSTEM_PROMPT = """
Eres el asistente de inteligencia artificial de un médico. Se te proporciona el texto completo de un informe de caso clínico en español. Tu tarea es resumir el informe de caso clínico en español.
Para ello, primero identifica y extrae las partes más importantes del informe completo, tales como la información clínica relevante, el diagnóstico, las intervenciones, los desenlaces y otros aspectos fundamentales del caso. Basándote en esta información, crea un resumen claro, preciso y coherente que incluya toda la información relevante del informe.
Escribe el resumen final entre las etiquetas <summary> y </summary>.

Respuesta:
<summary>...</summary>
"""

def get_summary(text_to_summarize):
  """Summarize one clinical case report using the notebook-level pipeline.

  Sends SYSTEM_PROMPT as the system message and the report, prefixed with a
  Spanish instruction, as the user message, then returns the summary that
  extract_xml_summary pulls out of the reply.
  """
  user_prompt = f"Por favor, resume el siguiente informe de caso clínico en español.\n{text_to_summarize}"
  conversation = [
      {'role': 'system', 'content': SYSTEM_PROMPT},
      {'role': 'user', 'content': user_prompt},
  ]

  outputs = summarization_pipeline(conversation)
  # Last entry of the first result's "generated_text"; presumably the
  # assistant's reply in the chat-style output (confirm against transformers docs).
  reply = outputs[0]["generated_text"][-1]
  return extract_xml_summary(reply["content"])

In [None]:
def get_output_file_name(full_text_file_name):
  """Map an input file name to its summary file name.

  A single trailing '.txt' is dropped if present, then '_sum.txt' is appended.
  """
  stem = full_text_file_name
  if stem.endswith('.txt'):
    stem = stem[:-len('.txt')]
  return stem + "_sum.txt"

In [None]:
import os
from tqdm.notebook import tqdm

# Resume support: every file already present in the output directory counts as
# finished. Create the directory first so a fresh run does not fail on listdir.
os.makedirs(args.output_dir, exist_ok=True)
done = set(os.listdir(args.output_dir))

# Process the .txt inputs in numeric order of the second-to-last
# "_"-separated field of the file name (parsed as an int).
full_text_file_names_sorted = sorted(
    [f for f in os.listdir(args.full_texts_dir) if f.endswith(".txt")],
    key=lambda x: int(x.split('_')[-2].removesuffix('.txt'))
)

print("File names", full_text_file_names_sorted)

for file_name in tqdm(full_text_file_names_sorted):
  output_file_name = get_output_file_name(file_name)
  if output_file_name in done:
    print("Skipping", file_name)
    continue

  print("Summarizing", file_name, "with output", output_file_name)
  with open(os.path.join(args.full_texts_dir, file_name), "r", encoding='utf8') as in_file:
    full_text = in_file.read()

  # Generate BEFORE opening the output file. Opening it first would leave an
  # empty *_sum.txt behind if generation fails or the run is interrupted, and
  # the resume check above would then skip that report on every later run.
  summary = get_summary(full_text)

  with open(os.path.join(args.output_dir, output_file_name), "w", encoding='utf8') as out_file:
    out_file.write(summary)

In [None]:
# Final step once all summaries are written: hand the Colab runtime back.
# NOTE(review): runtime.unassign() is understood to disconnect and delete the
# current Colab VM, so any kernel state not saved to Drive is lost — confirm
# against the google.colab documentation before relying on it.
from google.colab import runtime
runtime.unassign()