In [1]:
import torch
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# Install the `transformers` package with pip through an IPython shell escape.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Install the `wget` package with pip through an IPython shell escape.
!pip install wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import wget
import os

# Address of the CoLA 1.1 archive.
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

# Skip the download when the archive is already present in the working directory.
archive_on_disk = os.path.exists('./cola_public_1.1.zip')
if not archive_on_disk:
    wget.download(url, './cola_public_1.1.zip')

In [5]:
# Unzip the downloaded archive, unless the extracted ./cola_public/ folder
# already exists.
if not os.path.exists('./cola_public/'):
    !unzip cola_public_1.1.zip

In [6]:
import pandas as pd

# The TSV is read without a header row, so the column names are given explicitly.
df = pd.read_csv(
    "./cola_public/raw/in_domain_train.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

print('Número de oraciones de entrenamiento: {:,}\n'.format(len(df)))

# Display 10 random rows of the data.
df.sample(10)

Número de oraciones de entrenamiento: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
892,bc01,1,,Lucy recounted a story to remember because Hol...
2521,l-93,1,,Audrey tiptoed to the door.
3311,l-93,1,,Through the hall there echoed a loud cry.
5207,kl93,1,,Any professional dancer would be able to do it.
231,cj99,0,*,"When Bill smokes, much more does Susan hate him."
4145,ks08,0,*,You are the only person that I can rely on.
4272,ks08,1,,John is eager to please Kim.
2282,l-93,1,,Bees are swarming in the garden.
5314,b_82,1,,"Love her though I may, that won't affect the g..."
888,bc01,1,,Mag Wildwood came to introduce the bartender b...


In [7]:
# Pull the sentence text and the label column out of the training frame as arrays.
sentences, labels = df["sentence"].values, df["label"].values

In [8]:
import torch
from transformers import AutoTokenizer

# Load the tokenizer published under the "valhalla/bart-large-sst2" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("valhalla/bart-large-sst2")

In [9]:
# Walk the first sentence through the tokenizer: raw text, tokens, then token ids.
first_tokens = tokenizer.tokenize(sentences[0])
print('Original: ', sentences[0])
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

Original:  Our friends won't buy this analysis, let alone the next one we propose.
Tokenized:  ['Our', 'Ġfriends', 'Ġwon', "'t", 'Ġbuy', 'Ġthis', 'Ġanalysis', ',', 'Ġlet', 'Ġalone', 'Ġthe', 'Ġnext', 'Ġone', 'Ġwe', 'Ġpropose', '.']
Token IDs:  [2522, 964, 351, 75, 907, 42, 1966, 6, 905, 1937, 5, 220, 65, 52, 15393, 4]


In [10]:
# Tokenize every sentence and map the tokens to their vocabulary ids.
#
# One batched tokenizer call replaces the per-sentence `encode_plus` loop and
# the later `torch.cat`: with `return_tensors='pt'` and fixed-length padding it
# already returns (num_sentences, 64) tensors. `padding='max_length'` is the
# supported spelling of the deprecated `pad_to_max_length=True`.
encoded = tokenizer(
    list(sentences),               # Sentences to encode.
    add_special_tokens=True,       # Add '<s>' and '</s>'.
    max_length=64,                 # Fixed sequence length used for every example.
    truncation=True,               # Cut sentences longer than max_length.
    padding='max_length',          # Pad shorter sentences up to max_length.
    return_attention_mask=True,    # Build the attention masks.
    return_tensors='pt',           # Return PyTorch tensors.
)

input_ids = encoded['input_ids']
# The attention mask simply tells padding apart from real tokens.
attention_masks = encoded['attention_mask']
labels = torch.tensor(labels)

# Print sentence 0, now as a list of ids.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])



Original:  Our friends won't buy this analysis, let alone the next one we propose.
Token IDs: tensor([    0,  2522,   964,   351,    75,   907,    42,  1966,     6,   905,
         1937,     5,   220,    65,    52, 15393,     4,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1])


In [11]:
from torch.utils.data import TensorDataset, random_split

# Bundle the training inputs into a single TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90/10 train/validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Pick the samples at random, but from a seeded generator: without it the
# membership of the two subsets changes on every run, so validation numbers
# from different runs are not comparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} ejemplos de entrenamiento'.format(train_size))
print('{:>5,} ejemplos de validacion'.format(val_size))

7,695 ejemplos de entrenamiento
  856 ejemplos de validacion


In [12]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Build the loaders for the training and validation sets.
# Training batches are drawn in random order; shuffle=True makes the loader
# use a random sampler over the dataset.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Order does not matter for validation, so it is read sequentially.
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [13]:
from transformers import BartForSequenceClassification

# Load the "valhalla/bart-large-sst2" checkpoint as a BART sequence-classification
# model (the same checkpoint name the tokenizer was loaded from).
model = BartForSequenceClassification.from_pretrained('valhalla/bart-large-sst2')


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [79]:
from tqdm import tqdm
import numpy as np
from transformers import AdamW

def fit(model, dataloader, optimizer, epochs=5, val_dataloader=None):
    """Fine-tune ``model`` and report loss/accuracy after every epoch.

    Each epoch runs one optimisation pass over ``dataloader`` followed by a
    gradient-free evaluation pass over ``val_dataloader``. Progress is shown
    with tqdm bars and a one-line summary is printed per epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``. It is moved to the
            notebook-level ``device``.
        dataloader: Iterable of training batches laid out as
            ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.
        val_dataloader: Iterable of validation batches with the same layout.
            When omitted, the notebook-level ``validation_dataloader`` is used,
            which keeps existing four-argument calls working.

    Returns:
        None. Metrics are only printed; the reported loss and accuracy are
        means of the per-batch values.
    """
    if val_dataloader is None:
        # Backward compatibility: earlier calls relied on the global loader.
        val_dataloader = validation_dataloader

    model.to(device)
    for epoch in range(1, epochs+1):
        model.train()
        train_loss, train_acc = [], []
        # Iterate the loader that was passed in; the previous version ignored
        # the argument and always read the global `train_dataloader`.
        bar = tqdm(dataloader)

        for batch in bar:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
            # Fraction of the batch whose highest-scoring class equals the label.
            acc = (labels == torch.argmax(logits, dim=1)).sum().item() / len(labels)
            train_acc.append(acc)
            bar.set_description(f"loss {np.mean(train_loss):.5f} acc {np.mean(train_acc):.5f}")

        bar = tqdm(val_dataloader)

        val_loss, val_acc = [], []
        model.eval()

        # No gradients are needed while evaluating.
        with torch.no_grad():
            for batch in bar:
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

                val_loss.append(loss.item())
                acc = (labels == torch.argmax(logits, dim=1)).sum().item() / len(labels)
                val_acc.append(acc)
                bar.set_description(f"val_loss {np.mean(val_loss):.5f} val_acc {np.mean(val_acc):.5f}")

        print(f"Epoch {epoch}/{epochs} loss {np.mean(train_loss):.5f} val_loss {np.mean(val_loss):.5f} acc {np.mean(train_acc):.5f} val_acc {np.mean(val_acc):.5f}")

# Hyperparameters
num_epochs = 5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

# Train the model
fit(model, train_dataloader, optimizer, epochs=num_epochs)


loss 0.03922 acc 0.98768: 100%|██████████| 241/241 [05:42<00:00,  1.42s/it]
val_loss 0.62566 val_acc 0.84915: 100%|██████████| 27/27 [00:13<00:00,  2.01it/s]


Epoch 1/5 loss 0.03922 val_loss 0.62566 acc 0.98768 val_acc 0.84915


loss 0.02678 acc 0.99196: 100%|██████████| 241/241 [05:41<00:00,  1.42s/it]
val_loss 0.71647 val_acc 0.85802: 100%|██████████| 27/27 [00:13<00:00,  2.01it/s]


Epoch 2/5 loss 0.02678 val_loss 0.71647 acc 0.99196 val_acc 0.85802


loss 0.02822 acc 0.99052: 100%|██████████| 241/241 [05:41<00:00,  1.42s/it]
val_loss 0.82615 val_acc 0.84799: 100%|██████████| 27/27 [00:13<00:00,  2.02it/s]


Epoch 3/5 loss 0.02822 val_loss 0.82615 acc 0.99052 val_acc 0.84799


loss 0.02413 acc 0.99196: 100%|██████████| 241/241 [05:41<00:00,  1.42s/it]
val_loss 0.76890 val_acc 0.85147: 100%|██████████| 27/27 [00:13<00:00,  2.02it/s]


Epoch 4/5 loss 0.02413 val_loss 0.76890 acc 0.99196 val_acc 0.85147


loss 0.02842 acc 0.99015: 100%|██████████| 241/241 [05:41<00:00,  1.42s/it]
val_loss 0.76361 val_acc 0.84684: 100%|██████████| 27/27 [00:13<00:00,  2.02it/s]

Epoch 5/5 loss 0.02842 val_loss 0.76361 acc 0.99015 val_acc 0.84684





In [80]:
import pandas as pd

# Read the out-of-domain dev split; the file is read without a header row,
# so the column names are given explicitly.
df2 = pd.read_csv(
    "./cola_public/raw/out_of_domain_dev.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

print('Número de oraciones de prueba: {:,}\n'.format(len(df2)))

# Sentence text and labels of the test split, as arrays.
sentences2, labels2 = df2["sentence"].values, df2["label"].values

Número de oraciones de prueba: 516



In [85]:
import random
def predict_sentence(sentence, model, tokenizer):
    """Classify a single sentence and return a Spanish verdict string.

    Args:
        sentence: Text to classify.
        model: Classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``. It is switched to eval mode.
        tokenizer: Tokenizer providing ``encode_plus``.

    Returns:
        "Gramaticalmente incorrecto" when the highest-scoring class is 0,
        otherwise "Gramaticalmente correcto".
    """
    # Pre-process the sentence. A single sentence needs no padding; the former
    # `padding='max_length'` (with no max_length given) only added pad tokens
    # that the attention mask then had to hide.
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        truncation=True,
        return_tensors='pt'
    )

    # Put the inputs wherever the model's weights live instead of relying on a
    # notebook-level `device`, so the function also works before the model has
    # been moved.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = encoded_sentence['input_ids'].to(model_device)
    attention_mask = encoded_sentence['attention_mask'].to(model_device)

    # Inference must run in eval mode, whatever mode the caller left the model in.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

    # Interpret the output.
    predicted_label = predictions.item()

    if predicted_label == 0:
        return "Gramaticalmente incorrecto"
    return "Gramaticalmente correcto"

# Example usage: classify one randomly chosen test sentence and show its label.
# randrange(n) draws from 0..n-1. The previous randint(0, n) also returned n
# itself, which indexes one past the end of `sentences2`.
num = random.randrange(df2.shape[0])
sentence = sentences2[num]
lb = labels2[num]
print("Resultado esperado: ",lb.item())

prediction = predict_sentence(sentence, model, tokenizer)
print(f"La frase '{sentence}' es: {prediction}")


Resultado esperado:  0
La frase 'TV puts dumb ideas.' es: Gramaticalmente incorrecto
