In [1]:
# 1) Instalar dependencias
!pip install -U transformers accelerate bitsandbytes
!pip install huggingface_hub

Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accele

In [2]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

In [None]:
def generar_historiasPorLote(
    json_input_file, 
    json_output_file, 
    prompt_base,
    model_name="deepseek-ai/deepseek-llm-7b-chat",
    batch_size=4,       # Cambia este valor según tu memoria GPU
    max_new_tokens=200
):
    # Cargar modelo y tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Leer issues
    with open(json_input_file, "r", encoding="utf-8") as f:
        issues = json.load(f)

    all_prompts = []
    index_map = []

    # Preparar todos los prompts
    for idx, issue in enumerate(issues):
        title = issue.get("title", "")
        body = issue.get("body", "")
        comments = "\n".join(c.get("body", "") for c in issue.get("all_comments", []))
        contenido = f"\nTítulo: {title}\nDescripción:\n{body}\nComentarios:\n{comments}"
        full_prompt = prompt_base + contenido
        all_prompts.append(full_prompt)
        index_map.append(idx)

    # Procesar en batches
    outputs_text = [""] * len(issues)

    for i in tqdm(range(0, len(all_prompts), batch_size)):
        batch_prompts = all_prompts[i:i+batch_size]

        # Tokenizar en batch
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048
        ).to("cuda")

        # Generar
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.3
        )

        # Decodificar
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Guardar en la lista
        for j, text in enumerate(decoded_outputs):
            outputs_text[i + j] = text

    # Asignar las historias a cada issue
    for idx, text in enumerate(outputs_text):
        # Si tu prompt incluía el prompt_base al inicio,
        # puedes recortar esa parte si quieres.
        issue_text = text[len(prompt_base):].strip()
        issues[idx]["historia_usuario"] = issue_text

    # Guardar el JSON de salida
    with open(json_output_file, "w", encoding="utf-8") as f:
        json.dump(issues, f, indent=2, ensure_ascii=False)

    print(f"✅ Archivo generado: {json_output_file}")


In [3]:
# Cargar modelo y tokenizer 
model_name = "deepseek-ai/deepseek-llm-7b-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True   # usa 8 bits para ahorrar memoria
)

def generar_historias(json_input_file, json_output_file, prompt_base):
    #Genera historias de usuario a partir de issues en un JSON.
   
    # Leer el archivo de entrada
    with open(json_input_file, "r", encoding="utf-8") as f:
        issues = json.load(f)

    for issue in issues:
        title = issue.get("title", "")
        body = issue.get("body", "")
        state = issue.get("state", "")
        author = issue.get("user", "")
        created = issue.get("created_at", "")
        issue_number = issue.get("number", "")

        # armado del contexto
        context_lines = []
        context_lines.append(f"Issue #{issue_number} reportado por {author} (creado el {created}):\n")
        context_lines.append(f"Título: {title}\n")
        context_lines.append(f"Descripción:\n{body}\n")
        context_lines.append(f"Estado: {state}\n")
        context_lines.append("Comentarios:")

        comments = issue.get("comments", [])
        if comments:
            for comment in comments:
                c_author = comment.get("user", "")
                c_date = comment.get("created_at", "")
                c_body = comment.get("body", "").strip()
                context_lines.append(f"- {c_author} ({c_date}): {c_body}")
        else:
            context_lines.append("- No hay comentarios.")

        # Prompt final
        full_prompt = "\n".join(context_lines) + "\n\n" + prompt_base

        print(f"\n\nGenerando historia de usuario para issue #{issue_number}...")

        # Tokenizar entrada
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

        # Generar texto
        output = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.3,
            do_sample=True
        )

        # Decodificar
        respuesta = tokenizer.decode(output[0], skip_special_tokens=True)

        # Opcional: quitar el prompt inicial si quieres solo el contenido generado
        respuesta_generada = respuesta[len(full_prompt):].strip()

        # Guardar en el dict
        issue["historia_usuario"] = respuesta_generada

    # Guardar resultados
    with open(json_output_file, "w", encoding="utf-8") as f:
        json.dump(issues, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Historias generadas y guardadas en '{json_output_file}'.")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [5]:
# Ejemplo de uso",
prompt_base = "Genera una historia de usuario detallada basada en este issue."
generar_historias("../input/issuescomments-json/issuesComments.json", "issues_con_historias.json", prompt_base)

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2827...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2826...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2825...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2824...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2823...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2822...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2821...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2820...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2819...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2818...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2817...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2816...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2815...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2814...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2813...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2812...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2811...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2810...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2809...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2808...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2807...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2806...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2805...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2804...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2803...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2802...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2801...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2800...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2799...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2798...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2797...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2796...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2795...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2794...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.




Generando historia de usuario para issue #2792...

✅ Historias generadas y guardadas en 'issues_con_historias.json'.


In [6]:
!ls -lh /kaggle/working


total 48K
-rw-r--r-- 1 root root 48K Jul 16 22:36 issues_con_historias.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
