<a href="https://colab.research.google.com/github/francotejada/Automatic-Traceability/blob/main/Prediccion/3_Prediccion_componentes_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets evaluate accelerate openpyxl scikit-learn

In [2]:
#!pip install -U transformers

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset
import evaluate
from sklearn.preprocessing import LabelEncoder


In [4]:
# ✅ Read the Excel file into a DataFrame (replace the path with the correct
# one when running locally instead of on Colab).
df = pd.read_excel('/content/data_bugzilla.xlsx')


In [5]:
# ✅ Assumed sheet layout: column A = component, B = title, C = description.
# The assignment below requires the sheet to have exactly three columns.
df.columns = ['component', 'title', 'description']
# Drop, in place, every row that has a missing value in any of the columns.
df.dropna(inplace=True)

In [6]:
# ✅ Build the model input: title and description joined by a single space.
title_text = df['title'].astype(str)
description_text = df['description'].astype(str)
df['text'] = title_text + ' ' + description_text


In [7]:
# ✅ Encode the component names as integer class ids.
label_encoder = LabelEncoder()
encoded_components = label_encoder.fit_transform(df['component'])
df['label'] = encoded_components
# One output class per distinct component seen by the encoder.
num_labels = len(label_encoder.classes_)


In [8]:
# ✅ Split into train and test sets: 20% held out, fixed seed for repeatability.
train_df, test_df = train_test_split(
    df[['text', 'label']],
    test_size=0.2,
    random_state=42,
)

In [9]:
# ✅ Convert the pandas splits to Hugging Face Datasets.
# preserve_index=False stops the shuffled pandas index left behind by
# train_test_split from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Load the extra tokens (one per line) that will be added to the tokenizer.
# Iterating over the file keeps each line's trailing newline, so every line is
# stripped before use; otherwise the newline would become part of the token.
# Blank lines are skipped. The `with` block closes the file even on error.
with open("vocab.txt", "r", encoding="utf-8") as text_file:
    new_tokens = [line.strip() for line in text_file if line.strip()]
print(new_tokens)
print(len(new_tokens))

In [None]:
# ✅ Load the DeBERTa v3 tokenizer and a sequence-classification model
# with one output per encoded component label.
model_checkpoint = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

In [None]:
# Extend the tokenizer with the tokens read from vocab.txt and report the
# vocabulary size before and after.
print("[ BEFORE ] tokenizer vocab size:", len(tokenizer))
# `added_tokens` holds the value returned by add_tokens (printed below).
added_tokens = tokenizer.add_tokens(new_tokens)

print("[ AFTER ] tokenizer vocab size:", len(tokenizer))
print()
print('added_tokens:',added_tokens)
print()

# Resize the model's embedding matrix to match the new tokenizer length.
# As the last expression of the cell, the returned object is displayed.
model.resize_token_embeddings(len(tokenizer))

In [11]:
# ✅ Tokenization
def tokenize_function(example):
    """Tokenize the "text" field of `example` with the notebook's tokenizer.

    Inputs are truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded

In [12]:
# Tokenize both splits in batches, then drop the raw "text" column, which is
# no longer needed once the token ids are in place.
train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
test_dataset = test_dataset.map(tokenize_function, batched=True).remove_columns(["text"])


Map:   0%|          | 0/5547 [00:00<?, ? examples/s]

Map:   0%|          | 0/1387 [00:00<?, ? examples/s]

In [13]:
# ✅ Accuracy metric: loaded once here and used by compute_metrics below.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    arg-max over the last axis of the logits; the result of
    `accuracy.compute` is returned unchanged.
    """
    logits, labels = eval_pred
    logits_tensor = torch.tensor(logits)
    predictions = logits_tensor.argmax(dim=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [14]:
# ✅ Training configuration: 15 epochs, checkpoint and evaluation every 500 steps.
training_args = TrainingArguments(
    output_dir="./results",
    do_eval=True,
    # `eval_steps` is only honored when the evaluation strategy is "steps";
    # without it no evaluation is scheduled during training.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    # Turn off external experiment trackers (e.g. Weights & Biases) here, so
    # the setting is in place before the Trainer is built rather than relying
    # on an environment variable set in a later cell.
    report_to="none",
)

In [15]:
# ✅ Build the Trainer from the model, arguments, datasets and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [18]:
# Set the WANDB_DISABLED environment variable, presumably to switch off
# Weights & Biases logging during training.
# NOTE(review): this cell comes after TrainingArguments and Trainer have
# already been created — confirm the variable is still honored at that point,
# or move this cell above them.
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:

# ✅ Training
trainer.train()

# ✅ Evaluation: keep the metrics and print them. A bare call in the middle of
# a cell is not displayed, so the result would otherwise be lost.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# ✅ Save the fine-tuned model and tokenizer, plus the label classes in encoder
# order so predicted class ids can be mapped back to component names.
model.save_pretrained("./deberta-bugzilla")
tokenizer.save_pretrained("./deberta-bugzilla")
pd.Series(label_encoder.classes_).to_csv("label_classes.csv", index=False)

In [None]:
# Call torch.cuda.empty_cache(), presumably to release cached GPU memory
# once training has finished.
from torch.cuda import empty_cache
empty_cache()