In [1]:
!pip install transformers



In [2]:
from transformers import BertTokenizer

In [3]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import svm
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from imblearn.combine import SMOTETomek
import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
rutaData = "/content/drive/MyDrive/3º - GCD/2C - LENGUAJE NATURAL Y RECUPERACIÓN DE LA INFORMACIÓN/Prácticas LNR/Práctica 3 (Sesiones 1 a 5) - LNR/"
rutaNumpys = "/content/drive/MyDrive/3º - GCD/2C - LENGUAJE NATURAL Y RECUPERACIÓN DE LA INFORMACIÓN/Prácticas LNR/Práctica 3 (Sesiones 1 a 5) - LNR/02 - Segunda Entrega - Práctica 3/"

rutaDataBorja = "/content/drive/MyDrive/Práctica 3 (Sesiones 1 a 5) - LNR/"
rutaNumpyBorja = "/content/drive/MyDrive/Práctica 3 (Sesiones 1 a 5) - LNR/02 - Segunda Entrega - Práctica 3/"

# Representación con Word Embedding - Ponderada
data_pond = np.loadtxt(rutaNumpyBorja + "rep_pond_w_emb.txt")
# Representación con Word Embedding - Mediana
data_med = np.loadtxt(rutaNumpyBorja + "rep_median_w_emb.txt")

In [6]:
df_detests = pd.read_csv(rutaDataBorja + "train.csv")
extra_Data = pd.DataFrame(df_detests['stereotype']).to_numpy()

In [7]:
df_detests_test = pd.read_csv(rutaDataBorja + "test.csv", sep=";")

In [8]:
tokenizer= BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
example_text = ['Usando BETO en clases de ciencia de datos de la universidad politécnica.',
"Los estudiantes de este grado son muy aplicados y estudiosos."]
bert_input = tokenizer(example_text,padding='max_length', max_length = 20, 
                       truncation=True, return_tensors="pt")

In [9]:
from transformers import BertModel

In [10]:
example_text = tokenizer.decode(bert_input.input_ids[1])
print(example_text)
# Cargar el modelo pre-entrenado
model = BertModel.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
# Cambiar el model a modo evaluación
model.eval()

[CLS] los estudiantes de este grado son muy aplicados y estudiosos. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
import torch
import numpy as np
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [12]:
with torch.no_grad():
    outputs = model(**bert_input)
    # Los modelos Transformers siempre devuelven tuplas.
    # Aquí, el primer elemento se corresponde a los vectores en la salida de la última capa
    # de BETO
    encoded_layers = outputs[0]
    print(encoded_layers.size())
    #Aquí se obtiene el embedding de los tokens CLS para cada texto de entrada
    #Esta representación sirve como un embedding contextual de los textos.
    cls_vector = encoded_layers[:,0,:]
    print(cls_vector.size())
    #Vector asociado al token CLS del primer texto en la entrada.
    cls_vector = cls_vector.cpu().detach().numpy()[0]
    print(len(cls_vector))

torch.Size([2, 20, 768])
torch.Size([2, 768])
768


In [13]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, mode="train"):
        self.mode =mode
        if mode !="train":
            self.labels= np.asarray([0]*len( df['sentence']))
        else:
            self.labels = [int(label) for label in df['stereotype']]
        self.texts = [tokenizer(text,padding='max_length', max_length = 512, truncation
        = True, return_tensors="pt") for text in df['sentence']]
    def classes(self):
        return self.labels 
    def __len__(self):
        return len(self.labels) 
    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])
    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

# CLASE CLASIFICADOR
class BETOClassifier(nn.Module):
    def __init__(self, dropout=0.3, model_name='dccuchile/bert-base-spanish-wwm-uncased'):
        super(BETOClassifier, self).__init__()
        self.beto = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        #CAPA DE SALIDA
        self.linear = nn.Linear(768, 2)
        #ACTIVACIÓN DE LA CAPA DE SALIDA
        self.relu = nn.ReLU()
    def forward(self, input_id, mask):
        _, pooled_output = self.beto(input_ids = input_id, attention_mask= mask,
        return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [14]:
def train(model, train_data, val_data, learning_rate, epochs, batch_size=8):
    train, val = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size,
    shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    loss = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    if use_cuda:
        model = model.cuda()
        criterion = loss.cuda()
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            batch_loss = loss(output, train_label)
            total_loss_train += batch_loss.item()
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            model.zero_grad()

        batch_loss.backward()
        optimizer.step()
        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                batch_loss = loss(output, val_label)
                total_loss_val += batch_loss.item()
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        print(
            f'Epochs:{epoch_num+1}|Train Loss:{total_loss_train/len(train_data):.3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')

# VALIDATION LOOP
def evaluate(model, test_data, batch_size=8, evaltype=True):
    test = Dataset(test_data, mode="test")
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    total_acc_test = 0
    predict=[]
    out = None
    with torch.no_grad():
        k=0
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            #CONCATENER LAS PREDICCIONES DEL MODELO
            if k == 0:
                out = output
            else:
                out = torch.cat((out, output), 0)
            k+=1
            #SI SE CONOCEN LAS ETIQUETAS DEL TEST SE PUEDE CALCULAR EL ACC, EN OTRO CASO
            #SOLO SE DEBE RETORNAR LA SALIDA DEL MODELO
            if evaltype:
                acc = (output.argmax(dim=1) == test_label).sum().item()
                total_acc_test += acc

    if evaltype:
        print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return out.argmax(dim=1)

## OPTIMIZACIÓN DE HIPERPARÁMETROS

Vamos a realizar diversas pruebas cambiando los valores de epoch, batch y learning rate con el fin de optimizar estos hiperparámetros y obtener el mejor resultado posible. Como en las anteriores prácticas, nos guiaremos principalmente por el valor del f1 score aunque no despreciaremos el valor de otras métricas. Dado que con la función predeterminada solo podemos observar el accuracy obtenido en train y validación, lo que hemos hecho es realizar la predicción de los datos de validación, sacar el vector correspondiente a las predicciones y compararlos con las observaciones reales. Así, con estos dos vectores disponibles, mediante las funciones de la librería de scikit learn, podemos mostrar la matriz de confusión y algunas métricas más de nuestro interés como son el f1 score.

Desconocemos si es posible realizar esto prescindiendo de scikit learn y realizandolo solo mediante BERT y es por eso que hemos hecho uso de esta biblioteca

Mencionar que se intentó realizar el balanceo de clases mediante la técnica SMOTE pero nos vimos con muchas complicaciones a la hora de pasarle estos nuevos datos a BERT. Al no disponer de la documentación suficiente (pese a una búsqueda exahustiva por internet no encontramos nada decente) para informarnos de donde venía el fallo y ver si podíamos solucionarlo, decidimos seguir adelante con el conjunto sin balancear.


In [15]:
np.random.seed(112)
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df_detests, test_size=0.2)
#HIPERPARAMETROS
EPOCHS = 1
BATCH = 8
model = BETOClassifier()
LR = 1e-6

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

In [16]:
bert_8cap = train(model, df_train, df_val, LR, EPOCHS, BATCH)

100%|██████████| 382/382 [03:11<00:00,  2.00it/s]


Epochs:1|Train Loss:0.086             | Train Accuracy:  0.552             | Val Loss:  0.086             | Val Accuracy:  0.572


In [17]:
pred_Beto = evaluate(model, df_val, BATCH, False)

In [18]:
predicc=pred_Beto.tolist()
from sklearn.metrics import confusion_matrix
confusion_matrix(df_val['stereotype'], predicc)



array([[369, 225],
       [105,  65]])

In [19]:
from sklearn.metrics import classification_report
print(classification_report(df_val['stereotype'], predicc))

              precision    recall  f1-score   support

           0       0.78      0.62      0.69       594
           1       0.22      0.38      0.28       170

    accuracy                           0.57       764
   macro avg       0.50      0.50      0.49       764
weighted avg       0.66      0.57      0.60       764



### PRUEBA CON 9 BATCHS


In [20]:
np.random.seed(112)
from sklearn.model_selection import train_test_split
#HIPERPARAMETROS
EPOCHS = 1
BATCH = 9
model9 = BETOClassifier()
LR = 1e-6
bert_9cap = train(model9, df_train, df_val, LR, EPOCHS, BATCH)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

Epochs:1|Train Loss:0.079             | Train Accuracy:  0.548             | Val Loss:  0.078             | Val Accuracy:  0.577


In [21]:
pred_Beto9 = evaluate(model9, df_val, BATCH, False)

In [22]:
predicc9=pred_Beto9.tolist()
from sklearn.metrics import confusion_matrix
confusion_matrix(df_val['stereotype'], predicc9)


array([[393, 201],
       [103,  67]])

In [23]:
from sklearn.metrics import classification_report
print(classification_report(df_val['stereotype'], predicc9))

              precision    recall  f1-score   support

           0       0.79      0.66      0.72       594
           1       0.25      0.39      0.31       170

    accuracy                           0.60       764
   macro avg       0.52      0.53      0.51       764
weighted avg       0.67      0.60      0.63       764



### Prueba con 8 batchs y 2 epochs.

In [24]:
np.random.seed(112)
from sklearn.model_selection import train_test_split
#HIPERPARAMETROS
EPOCHS = 2
BATCH = 8
model12 = BETOClassifier()
LR = 1e-6
bert_12cap = train(model12, df_train, df_val, LR, EPOCHS, BATCH)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

Epochs:1|Train Loss:0.088             | Train Accuracy:  0.515             | Val Loss:  0.087             | Val Accuracy:  0.547


100%|██████████| 382/382 [03:12<00:00,  1.98it/s]


Epochs:2|Train Loss:0.087             | Train Accuracy:  0.527             | Val Loss:  0.087             | Val Accuracy:  0.558


In [25]:
pred_Beto12 = evaluate(model12, df_val, BATCH, False)
predicc12=pred_Beto12.tolist()
from sklearn.metrics import confusion_matrix
print(confusion_matrix(df_val['stereotype'], predicc12))
from sklearn.metrics import classification_report
print(classification_report(df_val['stereotype'], predicc12))

[[355 239]
 [ 89  81]]
              precision    recall  f1-score   support

           0       0.80      0.60      0.68       594
           1       0.25      0.48      0.33       170

    accuracy                           0.57       764
   macro avg       0.53      0.54      0.51       764
weighted avg       0.68      0.57      0.61       764



### 8 batchs y 3 epochs

In [26]:
np.random.seed(112)
from sklearn.model_selection import train_test_split
#HIPERPARAMETROS
EPOCHS = 3
BATCH = 8
model12 = BETOClassifier()
LR = 1e-6
bert_12cap = train(model12, df_train, df_val, LR, EPOCHS, BATCH)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

Epochs:1|Train Loss:0.085             | Train Accuracy:  0.628             | Val Loss:  0.085             | Val Accuracy:  0.611


100%|██████████| 382/382 [03:08<00:00,  2.02it/s]


Epochs:2|Train Loss:0.085             | Train Accuracy:  0.609             | Val Loss:  0.085             | Val Accuracy:  0.620


100%|██████████| 382/382 [03:09<00:00,  2.02it/s]


Epochs:3|Train Loss:0.084             | Train Accuracy:  0.632             | Val Loss:  0.084             | Val Accuracy:  0.666


In [27]:
pred_Beto12 = evaluate(model12, df_val, BATCH, False)
predicc12=pred_Beto12.tolist()
from sklearn.metrics import confusion_matrix
print(confusion_matrix(df_val['stereotype'], predicc12))
from sklearn.metrics import classification_report
print(classification_report(df_val['stereotype'], predicc12))

[[457 137]
 [109  61]]
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       594
           1       0.31      0.36      0.33       170

    accuracy                           0.68       764
   macro avg       0.56      0.56      0.56       764
weighted avg       0.70      0.68      0.69       764



A la vista de los resultados, vemos que los 3 modelos obtienen resultados bastante pobres. La potencia de cálculo de nuestros ordenadores nos impide aumentar el número de epochs y batch (cuando superamos 3 epochs y 8 batch, el sistema nos devuelve un error) por lo que el mejor modelo que podemos obtener en las condiciones en las que nos encontramos es el de 3 epochs y 8 batchs. Ahora variaremos el learning rate para ver si conseguimos unos mejores resultados.

## Variando el learning rate. 

### Learning rate = 1e-8

In [28]:
np.random.seed(112)
from sklearn.model_selection import train_test_split
#HIPERPARAMETROS
EPOCHS = 3
BATCH = 8
model12 = BETOClassifier()
LR = 1e-8
bert_12cap = train(model12, df_train, df_val, LR, EPOCHS, BATCH)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

Epochs:1|Train Loss:0.087             | Train Accuracy:  0.519             | Val Loss:  0.088             | Val Accuracy:  0.505


100%|██████████| 382/382 [03:13<00:00,  1.97it/s]


Epochs:2|Train Loss:0.087             | Train Accuracy:  0.520             | Val Loss:  0.088             | Val Accuracy:  0.490


100%|██████████| 382/382 [03:13<00:00,  1.97it/s]


Epochs:3|Train Loss:0.087             | Train Accuracy:  0.513             | Val Loss:  0.087             | Val Accuracy:  0.533


In [29]:
pred_Beto12 = evaluate(model12, df_val, BATCH, False)
predicc12=pred_Beto12.tolist()
from sklearn.metrics import confusion_matrix
print(confusion_matrix(df_val['stereotype'], predicc12))
from sklearn.metrics import classification_report
print(classification_report(df_val['stereotype'], predicc12))

[[285 309]
 [ 68 102]]
              precision    recall  f1-score   support

           0       0.81      0.48      0.60       594
           1       0.25      0.60      0.35       170

    accuracy                           0.51       764
   macro avg       0.53      0.54      0.48       764
weighted avg       0.68      0.51      0.55       764



### Learning rate 1e10

In [30]:
np.random.seed(112)
from sklearn.model_selection import train_test_split
#HIPERPARAMETROS
EPOCHS = 3
BATCH = 8
model12 = BETOClassifier()
LR = 1e-10
bert_12cap = train(model12, df_train, df_val, LR, EPOCHS, BATCH)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

Epochs:1|Train Loss:0.081             | Train Accuracy:  0.713             | Val Loss:  0.080             | Val Accuracy:  0.738


100%|██████████| 382/382 [03:13<00:00,  1.97it/s]


Epochs:2|Train Loss:0.081             | Train Accuracy:  0.722             | Val Loss:  0.080             | Val Accuracy:  0.741


100%|██████████| 382/382 [03:12<00:00,  1.98it/s]


Epochs:3|Train Loss:0.081             | Train Accuracy:  0.718             | Val Loss:  0.081             | Val Accuracy:  0.712


In [31]:
pred_Beto12 = evaluate(model12, df_val, BATCH, False)
predicc12=pred_Beto12.tolist()
from sklearn.metrics import confusion_matrix
print(confusion_matrix(df_val['stereotype'], predicc12))
from sklearn.metrics import classification_report
print(classification_report(df_val['stereotype'], predicc12))

[[528  66]
 [151  19]]
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       594
           1       0.22      0.11      0.15       170

    accuracy                           0.72       764
   macro avg       0.50      0.50      0.49       764
weighted avg       0.65      0.72      0.68       764



### Learning rate = 1e12

In [32]:
np.random.seed(112)
from sklearn.model_selection import train_test_split
#HIPERPARAMETROS
EPOCHS = 3
BATCH = 8
model12 = BETOClassifier()
LR = 1e-12
bert_12cap = train(model12, df_train, df_val, LR, EPOCHS, BATCH)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

Epochs:1|Train Loss:0.078             | Train Accuracy:  0.761             | Val Loss:  0.078             | Val Accuracy:  0.770


100%|██████████| 382/382 [03:07<00:00,  2.04it/s]


Epochs:2|Train Loss:0.078             | Train Accuracy:  0.760             | Val Loss:  0.078             | Val Accuracy:  0.767


100%|██████████| 382/382 [03:07<00:00,  2.03it/s]


Epochs:3|Train Loss:0.078             | Train Accuracy:  0.760             | Val Loss:  0.078             | Val Accuracy:  0.772


In [33]:
pred_Beto12 = evaluate(model12, df_val, BATCH, False)
predicc12=pred_Beto12.tolist()
from sklearn.metrics import confusion_matrix
print(confusion_matrix(df_val['stereotype'], predicc12))
from sklearn.metrics import classification_report
print(classification_report(df_val['stereotype'], predicc12))

[[588   6]
 [168   2]]
              precision    recall  f1-score   support

           0       0.78      0.99      0.87       594
           1       0.25      0.01      0.02       170

    accuracy                           0.77       764
   macro avg       0.51      0.50      0.45       764
weighted avg       0.66      0.77      0.68       764



Como vemos, conforme aumentamos el learning rate, lo que hace el modelo es ignorar la clase 1, prediciendo prácticamente todas las observaciones como clase 0. Es por eso que obtenemos un accuracy tan relativamente elevado. Pero, fijándono en la matriz de confusión, vemos que las métricas para la clase 1 son rídiculas, por lo que concluimos que este modelo es inutilizable. Una posible mejora sería balancear ambas clases pero como hemos comentado antes, se realizaron varias pruebas sin éxito para esta labor.

