## Declare variables for data paths

In [None]:
# Input CSV that the loading step reads with pandas.
DATASET_PATH = "../data/raw/data_carrers.csv"
# Path reserved for the processed dataset (not referenced in the visible code).
OUTPUT_PATH = "../data/processed/processed-dataset.csv"

In [None]:
import pandas as pd

# Load the dataset from the CSV file at DATASET_PATH, decoding it as UTF-8.
df = pd.read_csv(DATASET_PATH, encoding="UTF-8")

# Show the first rows to see what data we have. This must stay the last
# expression of the cell for the notebook to render it.
df.head()

## Clean the data and verify cells

In [None]:
# Report how many null values each column holds before anything is dropped.
# The counts are printed explicitly: a bare expression in the middle of a
# notebook cell is evaluated and then discarded without being displayed.
print(df.isnull().sum())

# Discard every row that contains at least one null value.
df = df.dropna()

# Remove duplicated rows.
df = df.drop_duplicates()

# Transformando datos para análisis

In [None]:
# Distinct values of the 'CARRERA' column, collected as a plain Python list.
categories = df['CARRERA'].unique().tolist()

print(categories)


In [None]:
# Pair every category with its position in `categories` (category -> integer index).
category_to_index = dict(zip(categories, range(len(categories))))

# Optional: normalize text (lowercase it and strip unneeded characters).
def preprocess_text(text):
    """Return *text* lowercased, keeping only alphanumeric and whitespace characters."""
    lowered = text.lower()
    # Filtering runs on the already-lowercased string, so it sees the
    # characters produced by lower() rather than the original ones.
    kept = [ch for ch in lowered if ch.isalnum() or ch.isspace()]
    return ''.join(kept)

# Replace the 'TEXTO' column with its preprocess_text-normalized version.
df['TEXTO'] = df['TEXTO'].apply(preprocess_text)
# Numeric label per row: the 'CARRERA' value looked up in category_to_index.
df['LABEL'] = df['CARRERA'].map(category_to_index)

# Preprocessed texts as a plain Python list.
texts = df['TEXTO'].tolist()



In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
# NOTE(review): the transformers AdamW implementation is reported as deprecated
# upstream in favor of torch.optim.AdamW — confirm against the installed version.
from transformers import AdamW
from sklearn.model_selection import train_test_split
import accelerate
import transformers

# Print the installed transformers and accelerate versions.
print(transformers.__version__, accelerate.__version__)
# NOTE(review): pandas, torch, BertTokenizer, BertForSequenceClassification and
# train_test_split are imported a second time below; the repeats are harmless.
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support


# Tokenizer loaded from the 'dccuchile/bert-base-spanish-wwm-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')


In [None]:
# train_texts, test_texts, train_labels, test_labels = train_test_split(
#     df['TEXTO'].tolist(),
#     df['LABEL'].tolist(),
#     test_size=0.2,
#     random_state=42
# )

# print(train_texts)

# print(test_texts)

# print(train_labels)

# print(test_labels)

# train_encodings = tokenizer(
#     train_texts,
#     truncation=True,
#     padding=True,
#     max_length=128
# )

# test_encodings = tokenizer(
#     test_texts,
#     truncation=True,
#     padding=True,
#     max_length=128
# )

# class CarreraDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels
#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx])
#         return item
#     def __len__(self):
#         return len(self.labels)

# train_dataset = CarreraDataset(train_encodings, train_labels)
# test_dataset = CarreraDataset(test_encodings, test_labels)


# # Argumentos de Entrenamiento Ajustados
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=10,            # Comienza con 10 épocas
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=0,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     logging_dir='./logs',
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
#     learning_rate=2e-5  # Tasa de aprendizaje ajustada
# )

# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

# model = BertForSequenceClassification.from_pretrained(
#     'bert-base-uncased',
#     num_labels=len(categories)
# )


# from sklearn.utils.class_weight import compute_class_weight
# import numpy as np

# class_weights = compute_class_weight(
#     'balanced', 
#     classes=np.unique(train_labels), 
#     y=train_labels
# )
# class_weights = torch.tensor(class_weights, dtype=torch.float)

# # Actualizar la configuración del modelo
# model.config.class_weights = class_weights

# from transformers import EarlyStoppingCallback

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
# )

# trainer.train()


# # saving models
# # trainer.save_model("./models/bert-base-uncased-carrera")
# # evaluating models
# trainer.evaluate()

# Map each row's category to its numeric index.
labels = df['CARRERA'].map(category_to_index).tolist()

print(labels)

# Tokenize every text in one call, with padding enabled and truncation at
# 128 tokens, returning PyTorch tensors.
inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Tokenized inputs consumed by the model.
input_ids = inputs['input_ids']

attention_mask = inputs['attention_mask']

# Convert the labels to a PyTorch tensor.
labels = torch.tensor(labels)

print('labels tensor',labels)

# One sample = (input_ids, attention_mask, label).
dataset = TensorDataset(input_ids, attention_mask, labels)

# Split the dataset into training and test sets.
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

# Split sample *positions* rather than the TensorDataset itself:
# train_test_split is documented for lists/arrays/dataframes, and indexing a
# Dataset through it materializes every sample into a Python list. Subset
# keeps both splits as views over the single underlying dataset.
train_indices, test_indices = train_test_split(
    list(range(len(dataset))), test_size=0.2, random_state=42
)
train_data = Subset(dataset, train_indices)
test_data = Subset(dataset, test_indices)

# DataLoaders that batch the samples; only the training split is shuffled.
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=16)



from transformers import BertForSequenceClassification, BertConfig

# Configuration taken from the pretrained checkpoint, with one output label per
# category and both dropout probabilities set to 0.3.
config = BertConfig.from_pretrained('dccuchile/bert-base-spanish-wwm-cased',
                                    num_labels=len(category_to_index),
                                    hidden_dropout_prob=0.3,
                                    attention_probs_dropout_prob=0.3)

# Load the pretrained BERT model for sequence classification.
model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-cased', config=config)

# Show the loaded model.
print(model)

# Use PyTorch's own AdamW instead of the deprecated transformers implementation.
# eps and weight_decay are passed explicitly because the two implementations
# ship different defaults; these values are the ones the transformers version
# used (NOTE(review): confirm against the transformers release in use).
# Learning rate 2e-5 is a value commonly used with BERT.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)


In [None]:
# from sklearn.metrics import classification_report

# # Poner el modelo en modo de evaluación
# model.eval()

# # Inicializar las listas para almacenar las predicciones y las etiquetas verdaderas
# predictions = []
# true_labels = []

# # Evaluar el modelo sin calcular gradientes
# with torch.no_grad():
#     for batch in test_dataloader:
#         input_ids, attention_mask, labels = batch
        
#         # Realizar la predicción
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         logits = outputs.logits
        
#         # Obtener la predicción más probable (la que tiene el valor más alto)
#         preds = torch.argmax(logits, dim=1).tolist()
        
#         # Almacenar las predicciones y las etiquetas verdaderas
#         predictions.extend(preds)
#         true_labels.extend(labels.tolist())

# # Mostrar un reporte de clasificación
# print(classification_report(true_labels, predictions, target_names=category_to_index.keys()))


# Entrenamiento del modelo 

In [None]:
from tqdm import tqdm  # progress bar for the batch loop

# Number of passes over the training data; adjust as needed.
epochs = 17
# The scheduler is stepped once per batch, so its schedule spans every batch
# of every epoch.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Put the model in training mode.
model.train()

for epoch in range(epochs):
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        # Move the three tensors of the batch to the selected device.
        input_ids, attention_mask, labels = (b.to(device) for b in batch)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model return the loss with its outputs.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate, then advance the optimizer and the LR schedule.
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Mean loss per batch for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Pérdida Promedio: {avg_loss:.4f}")

## Evaluación del modelo

In [None]:
# from sklearn.metrics import classification_report

# # Poner el modelo en modo de evaluación
# model.eval()

# # Inicializar las listas para almacenar las predicciones y las etiquetas verdaderas
# predictions = []
# true_labels = []

# # Evaluar el modelo sin calcular gradientes
# with torch.no_grad():
#     for batch in test_dataloader:
#         input_ids, attention_mask, labels = batch
        
#         # Realizar la predicción
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         logits = outputs.logits
        
#         # Obtener la predicción más probable (la que tiene el valor más alto)
#         preds = torch.argmax(logits, dim=1).tolist()
        
#         # Almacenar las predicciones y las etiquetas verdaderas
#         predictions.extend(preds)
#         true_labels.extend(labels.tolist())

# # Mostrar un reporte de clasificación
# print(classification_report(true_labels, predictions, target_names=category_to_index.keys()))



from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Evaluation on the test set.
model.eval()
test_predictions = []
test_true_labels = []

# No gradients are needed while evaluating.
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluación en Conjunto de Prueba"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Predicted class = index of the highest logit.
        preds = torch.argmax(logits, dim=1)
        # Store plain Python ints taken from the CPU so scikit-learn receives
        # ordinary sequences rather than (possibly GPU-resident) tensors.
        test_predictions.extend(preds.cpu().tolist())
        test_true_labels.extend(labels.cpu().tolist())

# Same results under the shorter names this cell has always defined.
predictions = list(test_predictions)
true_labels = list(test_true_labels)

# Class names and their numeric ids, in matching order. Passing the ids
# explicitly keeps the report and the matrix aligned with target_names even
# when a class has no samples in the test split.
target_names = list(category_to_index.keys())
label_ids = list(category_to_index.values())

# Classification report.
print(classification_report(test_true_labels, test_predictions,
                            labels=label_ids, target_names=target_names))


# Confusion matrix.
cm = confusion_matrix(test_true_labels, test_predictions, labels=label_ids)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=target_names, yticklabels=target_names, cmap='Blues')
plt.ylabel('Etiqueta Verdadera')
plt.xlabel('Predicción')
plt.title('Matriz de Confusión')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Put the model in evaluation mode.
model.eval()
predictions = []
true_labels = []

# No gradients are needed while evaluating.
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the highest logit.
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Classification report. The numeric ids are passed explicitly so the report
# stays aligned with target_names even when some class never appears in the
# test split (otherwise the class count would not match target_names).
target_names = list(category_to_index.keys())
label_ids = list(category_to_index.values())
print(classification_report(true_labels, predictions, labels=label_ids, target_names=target_names))