In [None]:
# https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne
!pip install torchinfo

In [None]:

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [None]:
# Read 'df_balanced.csv' into a DataFrame.
df = pd.read_csv('df_balanced.csv')
# Model inputs: the 'Text' column as a plain Python list.
texts = df['Text'].tolist()
# Targets: the two score columns as a 2-D array with one row per text.
labels = df[['libertad_personal_score', 'libertad_economica_score']].values


In [None]:
# Split the data into training and test sets (20% held out for testing;
# random_state is fixed so the split is the same on every run).
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [None]:
# Custom Dataset class
class LibertadDataset(Dataset):
    """Tokenise texts on access and pair each one with its label row.

    Args:
        texts: Indexable sequence of input texts. Entries that are not strings
            (for example NaN cells coming from a DataFrame column) are converted
            with ``str()`` before tokenisation.
        labels: Indexable sequence of label rows, aligned with ``texts``.
        tokenizer: Hugging Face-style tokenizer; it is called directly with the
            text and must return ``input_ids`` and ``attention_mask`` tensors.
        max_length: Number of tokens every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenised sample at ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            float ``labels`` tensor built from the label row.
        """
        # Coerce to str so a non-string cell does not crash the tokenizer.
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus``; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [None]:
from typing import NamedTuple, Optional

import torch
from torch import nn
from torchinfo import summary
from transformers import BertTokenizerFast, BertForSequenceClassification

class LibertadPredictorOutput(NamedTuple):
    """Value returned by ``LibertadPredictor.forward``.

    Exposes the ``.loss`` and ``.logits`` attributes that the training and
    evaluation cells of this notebook read from the model output.
    """

    loss: Optional[torch.Tensor]
    logits: torch.Tensor


class LibertadPredictor(nn.Module):
    """BERT encoder followed by dropout and a linear head with ``num_labels`` outputs.

    Args:
        num_labels: Number of output logits.
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", num_labels=num_labels)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits and, when ``labels`` is given, the training loss.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional targets with the same shape as the logits. They are
                cast to the logits' dtype and scored with binary cross-entropy
                on the logits, matching the per-label sigmoid applied at
                evaluation time. Assumes each target lies in [0, 1] - TODO confirm.

        Returns:
            LibertadPredictorOutput: ``loss`` is ``None`` when no labels are
            passed; ``logits`` has one column per label.
        """
        # Call the encoder inside the classification wrapper (``self.bert.bert``).
        # The wrapper's own output only carries logits when no labels are passed,
        # so indexing it with [1] raised "IndexError: tuple index out of range".
        # The wrapper's built-in head (``self.bert.classifier``) is not used.
        encoder_outputs = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = encoder_outputs[1]  # position 1 of the encoder output is the pooled output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels.to(logits.dtype))
        return LibertadPredictorOutput(loss=loss, logits=logits)

def print_model_architecture(model, seq_len=128):
    """Print the module tree of ``model`` followed by a torchinfo summary.

    Args:
        model: Module whose forward takes two integer tensors
            (``input_ids`` and ``attention_mask``).
        seq_len: Token length of the dummy inputs used to build the summary.

    Returns:
        The object returned by ``torchinfo.summary``.
    """
    # Print the model architecture
    print(model)

    # Print a more detailed summary using torchinfo.
    # One input size per forward argument is required: a single (1, seq_len)
    # size would leave ``attention_mask`` unfilled. verbose=0 stops torchinfo
    # from printing on its own, so the explicit print below shows the summary
    # exactly once in scripts and in notebooks.
    model_stats = summary(
        model,
        input_size=[(1, seq_len), (1, seq_len)],
        dtypes=[torch.long, torch.long],
        verbose=0,
    )
    print(model_stats)
    return model_stats



In [None]:
# Initialise the tokenizer and the model.
# The tokenizer is loaded from the same checkpoint name that LibertadPredictor
# passes to from_pretrained in its __init__.
tokenizer = BertTokenizerFast.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
model = LibertadPredictor(num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue  # frozen parameters are not counted
        trainable_total += param.numel()
    return trainable_total

In [None]:
# Count and display the number of trainable parameters
num_params = count_parameters(model)
# The ':,' format spec inserts thousands separators.
print(f"\nNúmero total de parámetros entrenables: {num_params:,}")



Número total de parámetros entrenables: 109,853,956


In [None]:
# Create the datasets and data loaders.
# max_length is not passed, so the dataset's default applies.
train_dataset = LibertadDataset(train_texts, train_labels, tokenizer)
test_dataset = LibertadDataset(test_texts, test_labels, tokenizer)

# Only the training loader shuffles; the test loader is created without shuffle.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the optimizer.
# torch.optim.AdamW is the replacement recommended by the deprecation notice of
# transformers.AdamW. eps and weight_decay are passed explicitly because the
# PyTorch defaults (1e-8 and 0.01) differ from the values transformers.AdamW
# used by default (1e-6 and 0.0).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)




In [None]:
# Example usage of the model: a single forward pass on a dummy batch
input_ids = torch.randint(0, 1000, (1, 128))  # random integer ids in [0, 1000), shape (1, 128)
attention_mask = torch.ones((1, 128))  # mask of ones with the same shape as input_ids
outputs = model(input_ids, attention_mask)

IndexError: tuple index out of range

In [None]:
# Training function
def train(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and report the mean loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``.loss`` tensor.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean of the per-batch losses, or ``0.0`` when the dataloader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Count batches as they are consumed so iterables without len() also work.
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0


In [None]:
# Evaluation function
def evaluate(model, dataloader, device):
    """Collect rounded sigmoid predictions and the true labels for every sample.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the input tensors are moved to.

    Returns:
        tuple: ``(predictions, actual_labels)``, two lists holding one NumPy
        row per sample.
    """
    model.eval()
    predictions, actual_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch_labels = batch['labels']
            model_out = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Per-label probability, rounded to 0 or 1.
            binary_preds = torch.sigmoid(model_out.logits).round()
            predictions += list(binary_preds.cpu().numpy())
            actual_labels += list(batch_labels.numpy())
    return predictions, actual_labels


In [None]:
# Model training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # CUDA device when available, otherwise CPU
model.to(device)

num_epochs = 20  # You can change this to 30 if you wish

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    train(model, train_loader, optimizer, device)
    # Score the model on the test loader after every epoch.
    predictions, actual_labels = evaluate(model, test_loader, device)
    accuracy = accuracy_score(actual_labels, np.round(predictions))
    f1 = f1_score(actual_labels, np.round(predictions), average='weighted')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score: {f1:.4f}')

Epoch 1/20


TypeError: LibertadPredictor.forward() got an unexpected keyword argument 'labels'

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Configuration
RANDOM_SEED = 42
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 2e-5
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-uncased"
# CSV holding the 'Text' column and the two score columns. This is the file the
# first data-loading cell of the notebook reads; 'tu_dataset.csv' was a
# template placeholder name.
DATA_PATH = 'df_balanced.csv'
LABEL_COLUMNS = ['libertad_personal_score', 'libertad_economica_score']

# Set up reproducibility
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Load data
df = pd.read_csv(DATA_PATH)
texts = df['Text'].tolist()
labels = df[LABEL_COLUMNS].values  # 2-D array, one column per label

# Split data (20% held out for validation, fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=RANDOM_SEED)

# Initialise tokenizer and model.
# problem_type is stated explicitly so the loss choice does not depend on the
# label dtype seen at the first training step: each label is scored
# independently with a sigmoid, which is how evaluate() decodes the logits.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL_COLUMNS),
    problem_type="multi_label_classification",
)

# Custom dataset
class LibertadDataset(Dataset):
    """Tokenise texts on access and pair each one with its label row.

    Args:
        texts: Indexable sequence of input texts; every entry is passed through
            ``str()`` before tokenisation.
        labels: Indexable sequence of label rows, aligned with ``texts``.
        tokenizer: Hugging Face-style tokenizer; it is called directly with the
            text and must return ``input_ids`` and ``attention_mask`` tensors.
        max_len: Number of tokens every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenised sample at ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            float32 ``labels`` tensor built from the label row.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus``; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # torch.tensor always treats its argument as data. The legacy
            # torch.FloatTensor constructor reads a bare integer as a size and
            # returns uninitialised memory.
            'labels': torch.tensor(label, dtype=torch.float)
        }

# Create datasets and dataloaders
train_dataset = LibertadDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = LibertadDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# Only the training loader shuffles; the validation loader is created without shuffle.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Prepare training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# torch.optim.AdamW is the replacement recommended by the deprecation notice of
# transformers.AdamW. eps and weight_decay are passed explicitly because the
# PyTorch defaults (1e-8 and 0.01) differ from the values transformers.AdamW
# used by default (1e-6 and 0.0).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

# Training and evaluation functions
def train_epoch(model, dataloader, optimizer, device):
    """Train for one pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``.loss`` tensor.
        dataloader: Sized iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        step_output = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        step_loss = step_output.loss
        batch_losses.append(step_loss.item())
        step_loss.backward()
        optimizer.step()
    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, device):
    """Collect rounded sigmoid predictions and the true labels for every sample.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the input tensors are moved to.

    Returns:
        tuple: ``(predictions, actual_labels)``, two lists holding one NumPy
        row per sample.
    """
    model.eval()
    predictions, actual_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch_labels = batch['labels']
            model_out = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Per-label probability, rounded to 0 or 1.
            binary_preds = torch.sigmoid(model_out.logits).round()
            predictions += list(binary_preds.cpu().numpy())
            actual_labels += list(batch_labels.numpy())
    return predictions, actual_labels

# Training: one pass over the training loader per epoch, then validation metrics
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")

    # Score the model on the validation loader after every epoch.
    predictions, actual_labels = evaluate(model, val_dataloader, device)
    accuracy = accuracy_score(actual_labels, np.round(predictions))
    f1 = f1_score(actual_labels, np.round(predictions), average='weighted')
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print()

# Save the model (only the state dict is written, not the full module object)
torch.save(model.state_dict(), 'modelo_libertad_final.pth')

# Print the architecture and the number of parameters
print(model)
total_params = sum(p.numel() for p in model.parameters())
# Trainable parameters are those with requires_grad set.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")