<a href="https://colab.research.google.com/github/daramireh/textRegression/blob/main/regresionTextosBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Carga de datos
data = pd.read_csv('data.txt', sep='\t')
data.head()

Unnamed: 0,Client,Product,Project,Epic,Feature,ProductBacklog,ProductBacklogState,Task,TaskActivity,TaskDescription,TaskState,AssignedTo,IterationDate,Tareas,Effort
0,KON,KON,KON,KON,KON_Infraestructura,KON_Soporte,Approved,Actualización de manual de instalación: format...,,https://skitconsultingltda.sharepoint.com/Proy...,Done,Oscar Andres Mancera Garzon,6/11/2023,1,20
1,KON,KON,KON,KON,KON_Infraestructura,KON_Soporte,Approved,Actualización de Documentos de maquinas virtua...,,,Done,Rafael Hernan Peroza Arevalo,30/10/2023,1,4
2,KON,KON,KON,KON,KON_Infraestructura,KON_Soporte,Approved,comité de soporte,,,Done,Diana Carolina Nieto Sosa,4/12/2023,1,1
3,KON,KON,KON,KON,KON_Infraestructura,KON_Soporte,Approved,Migración bases de datos BPP KTX y BPP KPF a l...,Deployment,Se requiere migrar las bases de datos de los p...,Done,Carlos Camargo Suan,9/1/2023,1,4
4,KON,KON,KON,KON,KON_Infraestructura,KON_Soporte,Approved,Revision de suscripciones,,,Done,Rafael Hernan Peroza Arevalo,30/10/2023,1,6


In [4]:
# Concatenar las columnas de texto
data['text_features'] = data['ProductBacklog'] + " " + data['Task'] + " " + data['TaskDescription'].fillna('')

In [5]:
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.model_selection import train_test_split

In [6]:
# Tokenizador de BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Función para tokenizar un conjunto de datos
def tokenize_data(texts, tokenizer, max_len=512):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [8]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(data['text_features'], data['Effort'], test_size=0.2)

In [9]:
# Tokenizar los datos de entrenamiento y prueba
train_inputs, train_masks = tokenize_data(X_train, tokenizer)
test_inputs, test_masks = tokenize_data(X_test, tokenizer)

# Convertir los esfuerzos a tensores de PyTorch
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

In [10]:
# Definir un dataset personalizado
class EffortDataset(Dataset):
    def __init__(self, inputs, masks, labels):
        self.inputs = inputs
        self.masks = masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.masks[idx], self.labels[idx]

In [11]:
# Crear DataLoaders
train_dataset = EffortDataset(train_inputs, train_masks, train_labels)
test_dataset = EffortDataset(test_inputs, test_masks, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [12]:
# Definir el modelo
class EffortPredictor(nn.Module):
    def __init__(self):
        super(EffortPredictor, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 1)  # 768 es la dimensión de los embeddings de BERT

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        dropped = self.dropout(pooled_output)
        return self.linear(dropped)


In [13]:
# Instanciar el modelo, definir la función de pérdida y el optimizador
model = EffortPredictor()
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
# Configuración inicial
epochs = 3

# Asegúrate de que el modelo esté en el dispositivo correcto (CPU o GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

EffortPredictor(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:

# Ciclo de entrenamiento
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Paso 1: Cargar los datos del lote al dispositivo adecuado
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device).float().view(-1, 1)  # Ajuste para la forma esperada de las etiquetas

        # Paso 2: Limpia los gradientes
        optimizer.zero_grad()

        # Paso 3: Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)

        # Paso 4: Calcular la pérdida
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()

        # Paso 5: Backward pass (calcular gradientes)
        loss.backward()

        # Paso 6: Actualizar los parámetros del modelo
        optimizer.step()

    # Mostrar la pérdida promedio después de cada época
    avg_train_loss = total_loss / len(train_loader)
    print(f'Época {epoch+1}/{epochs}, Pérdida promedio en entrenamiento: {avg_train_loss}')

    # Evaluación del modelo en el conjunto de prueba
    model.eval()
    total_eval_loss = 0

    # Desactivar el cálculo de gradientes para la evaluación
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device).float().view(-1, 1)

            outputs = model(input_ids, attention_mask=attention_mask)

            loss = loss_fn(outputs, labels)
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(test_loader)
    print(f'Época {epoch+1}/{epochs}, Pérdida promedio en evaluación: {avg_eval_loss}')